# 1. Write a Python function to normalize text from the Netflix dataset:
- strip leading/trailing whitespace,
- convert to lowercase,
- remove or replace common special characters in fields like title or description.

In [4]:
import csv
import re
import unicodedata

def normalize_text(text):
    """
    Clean/normalize a single text field.

    Steps, in order:
      1. trim leading/trailing whitespace and lowercase the text,
      2. fold accented/compatibility characters to their ASCII base form,
      3. drop every character that is not a-z, 0-9, whitespace or a comma,
      4. collapse runs of whitespace into a single space.

    Args:
        text: The raw field value (a string), or None.

    Returns:
        The normalized string; '' when *text* is None or empty.
    """
    if text is None or text == '':
        return ''

    text = text.strip().lower()

    # NFKD splits "é" into "e" + a combining accent. The filter below then
    # removes only the accent, so "pokémon" becomes "pokemon" rather than
    # "pokmon" (which is what a plain ASCII filter would produce).
    text = unicodedata.normalize('NFKD', text)

    # Keep letters, digits, whitespace and commas; everything else is removed
    text = re.sub(r'[^a-z0-9\s,]', '', text)

    # Removing characters can leave doubled spaces behind; squeeze them
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def clean_netflix_dataset(input_filepath, output_filepath):
    """
    Read the Netflix CSV, normalize its text fields, and save to a new file.

    The 'title' and 'description' columns of every row are passed through
    normalize_text(); all other columns are copied unchanged. A before/after
    preview is printed for the first 5 rows and a progress line every 1000
    rows.

    Args:
        input_filepath: Path of the CSV file to read (UTF-8).
        output_filepath: Path of the cleaned CSV file to write (UTF-8).

    Returns:
        None. Problems (missing file, empty input, malformed rows) are
        reported on stdout instead of being raised.
    """
    # Columns whose text gets normalized
    fields_to_clean = ['title', 'description']

    cleaned_count = 0

    try:
        # newline='' lets the csv module do its own newline handling, which
        # keeps line breaks embedded in quoted fields intact.
        with open(input_filepath, 'r', encoding='utf-8', newline='') as infile:
            csv_reader = csv.DictReader(infile)

            # Reading .fieldnames consumes the header row
            fieldnames = csv_reader.fieldnames

            # Validate the header BEFORE creating the output file, so an
            # empty input does not leave a stray empty output behind.
            if not fieldnames:
                print(f"Error: File '{input_filepath}' is empty or has no header row!")
                return

            with open(output_filepath, 'w', encoding='utf-8', newline='') as outfile:
                csv_writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                csv_writer.writeheader()

                print("Cleaning Netflix dataset...")
                print("="*70)

                for row_number, row in enumerate(csv_reader, start=1):

                    for field in fields_to_clean:
                        # Skip columns that are absent or empty in this row
                        if field in row and row[field]:
                            original = row[field]
                            row[field] = normalize_text(original)

                            # Preview only the first 5 rows so the changes
                            # are visible without flooding the output
                            if row_number <= 5:
                                print(f"\nRow {row_number} - {field}:")
                                print(f"  Before: {original[:60]}...")
                                print(f"  After:  {row[field][:60]}...")

                    csv_writer.writerow(row)
                    cleaned_count += 1

                    # Progress indicator every 1000 rows
                    if row_number % 1000 == 0:
                        print(f"\nProcessed {row_number} rows...")

                print("\n" + "="*70)
                print(f"✓ Cleaning complete!")
                print(f"✓ Total rows cleaned: {cleaned_count}")
                print(f"✓ Output saved to: {output_filepath}")

    except FileNotFoundError:
        print(f"Error: File '{input_filepath}' not found!")
    except Exception as e:
        # Deliberate best-effort boundary: report the problem, do not crash
        print(f"Error: {e}")


# Run the cleaning
if __name__ == "__main__":
    # Source CSV and destination for the cleaned copy
    input_file, output_file = r'netflix.csv', r'netflix_cleaned.csv'

    clean_netflix_dataset(input_file, output_file)


Cleaning Netflix dataset...

Row 1 - title:
  Before: Chocolate...
  After:  chocolate...

Row 1 - description:
  Before: Brought together by meaningful meals in the past and present...
  After:  brought together by meaningful meals in the past and present...

Row 2 - title:
  Before: Guatemala: Heart of the Mayan World...
  After:  guatemala heart of the mayan world...

Row 2 - description:
  Before: From Sierra de las Minas to Esquipulas, explore Guatemala's ...
  After:  from sierra de las minas to esquipulas, explore guatemalas c...

Row 3 - title:
  Before: The Zoya Factor...
  After:  the zoya factor...

Row 3 - description:
  Before: A goofy copywriter unwittingly convinces the Indian cricket ...
  After:  a goofy copywriter unwittingly convinces the indian cricket ...

Row 4 - title:
  Before: Atlantics...
  After:  atlantics...

Row 4 - description:
  Before: Arranged to marry a rich man, young Ada is crushed when her ...
  After:  arranged to marry a rich man, young ada is cr

# 2. Parse the netflix_titles.csv file manually:
- read the first 10 lines as raw text using Python's file handling,
- extract specific fields (like title and release_year) from each line,
- summarize basic contents (count lines, identify unique categories from a few lines).

In [5]:
def _split_csv_line(line):
    """Split one CSV record on the commas that are outside double quotes.

    Surrounding quotes are removed and a doubled quote ("") inside a quoted
    field is turned into a single literal quote character.
    """
    fields = []
    current = []
    in_quotes = False
    pos = 0
    length = len(line)

    while pos < length:
        ch = line[pos]
        if in_quotes:
            if ch == '"':
                if pos + 1 < length and line[pos + 1] == '"':
                    # Escaped quote inside a quoted field
                    current.append('"')
                    pos += 1
                else:
                    in_quotes = False
            else:
                current.append(ch)
        elif ch == '"':
            in_quotes = True
        elif ch == ',':
            fields.append(''.join(current))
            current = []
        else:
            current.append(ch)
        pos += 1

    fields.append(''.join(current))
    return fields


def _column_index(columns, name, fallback):
    """Return the position of *name* in the header, or *fallback* if absent."""
    try:
        return columns.index(name)
    except ValueError:
        return fallback


def parse_netflix_csv_manual(filepath, num_lines=10):
    """
    Manually parse a CSV file without using the pandas/csv libraries.

    Covers all three requirements:
    1. Read the header plus the first *num_lines* data lines as raw text
    2. Extract specific fields (title and release_year, plus type)
    3. Summarize basic contents (line counts, content types, year range)

    Fields are split with a quote-aware splitter, so commas inside quoted
    values (cast lists, dates such as "November 30, 2019") do not shift the
    columns. Column positions are looked up in the header row; if a column
    name is missing, the positions title=1, release_year=6, type=11 are used.

    Limitation: the file is read line by line, so a quoted field that
    contains a line break is not reassembled.

    Args:
        filepath: Path of the CSV file to read (UTF-8).
        num_lines: Maximum number of data lines to read after the header.

    Returns:
        A dict with the keys 'raw_lines', 'titles', 'release_years', 'types'
        and 'total_lines', or None if the file is missing or another error
        occurs (the error is printed).
    """

    results = {
        'raw_lines': [],           # requirement 1: raw text
        'titles': [],              # requirement 2: extracted fields
        'release_years': [],       # requirement 2: extracted fields
        'types': [],               # requirement 3: categories
        'total_lines': 0
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as file:

            print("="*70)
            print(f"REQUIREMENT 1: Reading first {num_lines} lines as RAW TEXT")
            print("="*70)
            print()

            # Read the header first; an empty file has no header to count
            raw_header = file.readline()
            header_line = raw_header.strip()
            if raw_header:
                results['raw_lines'].append(header_line)
                results['total_lines'] += 1

            print(f"Header (raw text):")
            print(header_line)
            print("\n" + "="*70)
            print()

            # Parse the header so fields can be located by column name
            columns = [name.strip() for name in _split_csv_line(header_line)]

            print(f"Reading next {num_lines} lines as RAW TEXT:")
            print()

            for line_number in range(1, num_lines + 1):
                line = file.readline()

                if not line:  # end of file
                    break

                line = line.strip()
                results['raw_lines'].append(line)
                results['total_lines'] += 1

                # Show only the first 100 characters to keep the output short
                print(f"Line {line_number} (raw): {line[:100]}...")
                print()

        # Now process those raw lines
        print("="*70)
        print("REQUIREMENT 2: Extracting SPECIFIC FIELDS (title, release_year)")
        print("="*70)
        print()

        title_idx = _column_index(columns, 'title', 1)
        year_idx = _column_index(columns, 'release_year', 6)
        type_idx = _column_index(columns, 'type', 11)

        # Skip header (index 0), process data lines
        for i, raw_line in enumerate(results['raw_lines'][1:], start=1):
            fields = _split_csv_line(raw_line)

            # Short or blank lines yield 'N/A' instead of raising
            title = fields[title_idx] if len(fields) > title_idx else 'N/A'
            year = fields[year_idx] if len(fields) > year_idx else 'N/A'
            content_type = fields[type_idx] if len(fields) > type_idx else 'N/A'

            results['titles'].append(title)
            results['release_years'].append(year)
            results['types'].append(content_type)

            print(f"Line {i}:")
            print(f"  Title extracted: {title[:50]}...")
            print(f"  Year extracted:  {year}")
            print()

        # Summarize basic contents
        print("="*70)
        print("REQUIREMENT 3: SUMMARIZE BASIC CONTENTS")
        print("="*70)
        print()

        # Count lines
        print(f"Total lines read: {results['total_lines']} (including header)")
        print(f"Data lines processed: {len(results['titles'])}")
        print()

        # Identify unique categories from a few lines
        unique_types = set(results['types'])
        print(f"Unique content types found: {unique_types}")
        print()

        # Count each type
        type_counts = {}
        for t in results['types']:
            type_counts[t] = type_counts.get(t, 0) + 1

        print(f"Type distribution from first {num_lines} lines:")
        for content_type, count in type_counts.items():
            print(f"  {content_type}: {count}")
        print()

        # Year statistics: compare numerically, not as strings, so the range
        # is correct even if the values differ in length
        valid_years = [y for y in results['release_years'] if y.isdecimal()]
        if valid_years:
            print(f"Release years found: {', '.join(valid_years)}")
            print(f"Year range: {min(valid_years, key=int)} to {max(valid_years, key=int)}")
        else:
            print("No valid release years found in extracted data")

        print("\n" + "="*70)
        print("✓ All three requirements completed!")
        print("="*70)

        return results

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found!")
        return None
    except Exception as e:
        # Deliberate best-effort boundary: report the problem, return None
        print(f"Error: {e}")
        return None


# Run the parser
if __name__ == "__main__":
    # CSV file to inspect
    filepath = r'netflix.csv'

    print("MANUAL CSV PARSING - ALL REQUIREMENTS")
    print()

    results = parse_netflix_csv_manual(filepath, num_lines=10)

    # The parser returns None on failure; report counts only on success
    if results:
        print(
            "\n✓ Parse completed successfully!",
            f"✓ Captured {len(results['raw_lines'])} raw lines",
            f"✓ Extracted {len(results['titles'])} titles",
            f"✓ Summarized {len(results['types'])} content types",
            sep="\n",
        )

MANUAL CSV PARSING - ALL REQUIREMENTS

REQUIREMENT 1: Reading first 10 lines as RAW TEXT

Header (raw text):
show_id,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,type


Reading next 10 lines as RAW TEXT:

Line 1 (raw): 81193313,Chocolate,,"Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang Bu-ja, Lee Jae-ryong, Min Jin-woo...

Line 2 (raw): 81197050,Guatemala: Heart of the Mayan World,"Luis Ara, Ignacio Jaunsolo",Christian Morales,,"Novemb...

Line 3 (raw): 81213894,The Zoya Factor,Abhishek Sharma,"Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, Sikander Khe...

Line 4 (raw): 81082007,Atlantics,Mati Diop,"Mama Sane, Amadou Mbow, Ibrahima Traore, Nicole Sougou, Amina Kane, Ma...

Line 5 (raw): 80213643,Chip and Potato,,"Abigail Oliver, Andrea Libman, Briana Buckmaster, Brian Dobson, Chance Hu...

Line 6 (raw): 81172754,Crazy people,Moses Inwang,"Ramsey Nouah, Chigul, Sola Sobowale, Ireti Doyle, Ben Touitou, F...

Line 7 (raw): 81120982,I Lost My Bod