# 1. Load & Explore

In [None]:
import pandas as pd

# 1.1 Load the CSV file 
df = pd.read_csv('netflix.csv')

# 1.2 Display first 5 rows - para makita ko anong itsura ng data
print("=== First 5 Rows ===")
print(df.head())
print("\n")

# 1.2 Get summary information - eto yung buod ng dataset, makikita ko data types at missing values
print("=== DataFrame Info ===")
print(df.info())
print("\n")

# 1.2 Basic statistics - para sa numerical columns lang, makikita ko mean, min, max, etc.
print("=== Basic Statistics ===")
print(df.describe())
print("\n")

# 1.3 Check shape - ilan rows at columns
print("=== Shape of DataFrame ===")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print("\n")

# 1.3 List all column names - lahat ng column names, para sure ako sa spelling
print("=== Column Names ===")
print(df.columns.tolist())

=== First 5 Rows ===
    show_id                                title                    director  \
0  81193313                            Chocolate                         NaN   
1  81197050  Guatemala: Heart of the Mayan World  Luis Ara, Ignacio Jaunsolo   
2  81213894                      The Zoya Factor             Abhishek Sharma   
3  81082007                            Atlantics                   Mati Diop   
4  80213643                      Chip and Potato                         NaN   

                                                cast  \
0  Ha Ji-won, Yoon Kye-sang, Jang Seung-jo, Kang ...   
1                                  Christian Morales   
2  Sonam Kapoor, Dulquer Salmaan, Sanjay Kapoor, ...   
3  Mama Sane, Amadou Mbow, Ibrahima Traore, Nicol...   
4  Abigail Oliver, Andrea Libman, Briana Buckmast...   

                    country         date_added  release_year rating  \
0               South Korea  November 30, 2019          2019  TV-14   
1                  

# 2. Data Cleaning & Preprocessing - Missing Values


In [10]:
# 2.1 Fill missing values in 'director' column with "Unknown"
# gamit .fillna() para i-replace yung NaN values
df['director'] = df['director'].fillna('Unknown')

# 2.1 Fill missing values in 'cast' column with "Unknown"
df['cast'] = df['cast'].fillna('Unknown')

# 2.2 Fill missing values in 'country' column with "Unavailable"
df['country'] = df['country'].fillna('Unavailable')

# 2.3 Drop rows where 'date_added' is missing
# mas okay i-drop kasi kailangan accurate date para sa analysis
df = df.dropna(subset=['date_added'])

# 2.3 Drop rows where 'rating' is missing
# importante rin rating para sa filtering later
df = df.dropna(subset=['rating'])

# Check kung ilan na-drop - para malaman if effective yung cleaning
print(f"New shape after cleaning: {df.shape[0]} rows, {df.shape[1]} columns")
print("\n")

# Verify na wala nang missing values sa important columns
print("=== Missing Values Check ===")
print(df[['director', 'cast', 'country', 'date_added', 'rating']].isnull().sum())

New shape after cleaning: 5186 rows, 12 columns


=== Missing Values Check ===
director      0
cast          0
country       0
date_added    0
rating        0
dtype: int64


# 3. Data Type Conversion

In [11]:
# 3.1 Convert 'date_added' to datetime object
# from string "November 30, 2019" to proper datetime format
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

# Verify conversion
print("=== Date Added - First 5 Values ===")
print(df['date_added'].head())
print(f"Data type: {df['date_added'].dtype}\n")

# 3.2-3.4 Extract numerical duration
# Kailangan i-handle yung dalawang format: "93 min" (movies) at "1 Season" (TV shows)

# Create a function para i-extract yung number from duration string
def extract_duration(duration_str):
    """
    Extracts number from duration string.
    Para sa movies: "93 min" -> 93 (integer)
    Para sa TV shows: "1 Season" -> "1 Season" (keep as string)
    """
    if pd.isna(duration_str):  # kung walang value, return None
        return None
    
    # Split the string para mahiwalay yung number at unit
    parts = duration_str.split()
    
    if len(parts) >= 2:
        number = parts[0]  # first part is always the number
        unit = parts[1]     # second part is the unit (min, Season, Seasons)
        
        # 3.3 For movies (unit = "min"), convert to integer
        if unit == "min":
            try:
                return int(number)
            except ValueError:
                return None
        
        # 3.4 For TV shows (unit = "Season" or "Seasons"), keep as string
        else:
            return duration_str
    
    return None

# Apply the function sa duration column
df['duration_cleaned'] = df['duration'].apply(extract_duration)

# Verify the results - check both movies and TV shows
print("=== Duration Cleaning Results ===")
print("Movies (first 5):")
print(df[df['type'] == 'Movie'][['title', 'duration', 'duration_cleaned']].head())
print("\nTV Shows (first 5):")
print(df[df['type'] == 'TV Show'][['title', 'duration', 'duration_cleaned']].head())

# Check data types ng new column
print(f"\nOriginal duration type: {df['duration'].dtype}")
print(f"Cleaned duration type: {df['duration_cleaned'].dtype}")

=== Date Added - First 5 Values ===
0   2019-11-30
1   2019-11-30
2   2019-11-30
3   2019-11-29
5   2019-11-29
Name: date_added, dtype: datetime64[ns]
Data type: datetime64[ns]

=== Duration Cleaning Results ===
Movies (first 5):
                                 title duration duration_cleaned
1  Guatemala: Heart of the Mayan World   67 min               67
2                      The Zoya Factor  135 min              135
3                            Atlantics  106 min              106
5                         Crazy people  107 min              107
6                       I Lost My Body   81 min               81

TV Shows (first 5):
                      title  duration duration_cleaned
0                 Chocolate  1 Season         1 Season
11     Sugar Rush Christmas  1 Season         1 Season
13     The Charming Stepmom  1 Season         1 Season
15  The Movies That Made Us  1 Season         1 Season
17                   Levius  1 Season         1 Season

Original duration type: obje

# 4. Text Normalization

In [12]:
# 4.1 Standardize the 'type' column - make sure consistent capitalization
# gamit .str.title() para gawing "Movie" at "TV Show" (proper case)
df['type'] = df['type'].str.strip().str.title()

# Verify standardization
print("=== Unique values in 'type' column ===")
print(df['type'].value_counts())
print("\n")

# 4.2 Clean leading/trailing whitespace from 'title' column
# .str.strip() removes extra spaces sa simula at dulo ng string
df['title'] = df['title'].str.strip()

# 4.2 Clean leading/trailing whitespace from 'description' column
df['description'] = df['description'].str.strip()

# Verify by checking kung may nag-change (compare before/after lengths)
print("=== Whitespace Cleaning Complete ===")
print(f"Title column cleaned: {df['title'].notna().sum()} entries")
print(f"Description column cleaned: {df['description'].notna().sum()} entries")

=== Unique values in 'type' column ===
type
Movie      3931
Tv Show    1255
Name: count, dtype: int64


=== Whitespace Cleaning Complete ===
Title column cleaned: 5186 entries
Description column cleaned: 5186 entries


# 5. Data Transformation & Analysis

In [None]:
# 5.1 Filtering: Create new DataFrame with only Movies released after 2018
# gamit boolean indexing - dalawang condition: type == 'Movie' AND release_year > 2018
movies_after_2018 = df[(df['type'] == 'Movie') & (df['release_year'] > 2018)]

# Display results ng filtering
print("=== Movies Released After 2018 ===")
print(f"Total movies after 2018: {len(movies_after_2018)} movies")
print("\nFirst 5 movies:")
print(movies_after_2018[['title', 'release_year', 'type']].head())
print("\n")

# 5.2 Sorting: Sort entire DataFrame by release_year (descending), then by title (ascending)
# gamit .sort_values() with multiple columns
# ascending=[False, True] means: release_year descending, title ascending
df = df.sort_values(by=['release_year', 'title'], ascending=[False, True])

# Reset index para malinis index numbers after sorting
df = df.reset_index(drop=True)

# Verify sorting
print("=== Sorted DataFrame ===")
print("First 10 rows (should be newest year, alphabetical titles):")
print(df[['title', 'release_year', 'type']].head(10))

=== Movies Released After 2018 ===
Total movies after 2018: 347 movies

First 5 movies:
                                  title  release_year   type
1   Guatemala: Heart of the Mayan World          2019  Movie
2                       The Zoya Factor          2019  Movie
3                             Atlantics          2019  Movie
6                        I Lost My Body          2019  Movie
16                         Holiday Rush          2019  Movie


=== Sorted DataFrame ===
First 10 rows (should be newest year, alphabetical titles):
                title  release_year     type
0  Maradona in Mexico          2020  Tv Show
1              15-Aug          2019    Movie
2                1994          2019  Tv Show
3              45 rpm          2019  Tv Show
4                  4L          2019    Movie
5           7 (Seven)          2019  Tv Show
6                 706          2019    Movie
7              7SEEDS          2019  Tv Show
8               90 ML          2019    Movie
9      A 

# 6. Output - Save Transformed Data

In [14]:
# 6.1 Save the final cleaned and transformed DataFrame to CSV
# gamit ko index=False para hindi na ma-save yung index column
output_filename = 'netflix_transformed_for_analysis.csv'
df.to_csv(output_filename, index=False)

print(f"=== Data Successfully Saved! ===")
print(f"Filename: {output_filename}")
print(f"Total rows saved: {len(df)}")
print(f"Total columns saved: {len(df.columns)}")
print("\n")

# Display final summary statistics
print("=== Final DataFrame Summary ===")
print(df.info())

=== Data Successfully Saved! ===
Filename: netflix_transformed_for_analysis.csv
Total rows saved: 5186
Total columns saved: 13


=== Final DataFrame Summary ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5186 entries, 0 to 5185
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   show_id           5186 non-null   int64         
 1   title             5186 non-null   object        
 2   director          5186 non-null   object        
 3   cast              5186 non-null   object        
 4   country           5186 non-null   object        
 5   date_added        5186 non-null   datetime64[ns]
 6   release_year      5186 non-null   int64         
 7   rating            5186 non-null   object        
 8   duration          5186 non-null   object        
 9   listed_in         5186 non-null   object        
 10  description       5186 non-null   object        
 11  type              5186 no

In [15]:
# Load the saved file para verify na tama save
df_verify = pd.read_csv('netflix_transformed_for_analysis.csv')

print("=== Verification: Loading Saved File ===")
print(f"Loaded shape: {df_verify.shape}")
print("\nFirst 3 rows:")
print(df_verify.head(3))
print("\n✅ File saved and verified successfully!")

=== Verification: Loading Saved File ===
Loaded shape: (5186, 13)

First 3 rows:
    show_id               title              director  \
0  81034946  Maradona in Mexico               Unknown   
1  81033429              15-Aug    Swapnaneel Jayakar   
2  80991872                1994  Diego Enrique Osorno   

                                                cast  \
0                             Diego Armando Maradona   
1  Rahul Pethe, Mrunmayee Deshpande, Adinath Koth...   
2                                            Unknown   

                            country  date_added  release_year rating  \
0  Argentina, United States, Mexico  2019-11-13          2020  TV-MA   
1                             India  2019-03-29          2019  TV-14   
2                            Mexico  2019-05-17          2019  TV-MA   

   duration                                          listed_in  \
0  1 Season              Docuseries, Spanish-Language TV Shows   
1   124 min               Comedies, Dramas, 