 # Load YAML Configuration and Dataset

In [10]:
# Load YAML configuration file
import yaml
with open("../netflix_config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Load dataset with pandas
import pandas as pd
df = pd.read_csv(config['input_data']['file'])  # Load dataset based on YAML path
df.head()  # Show the first 5 rows of the dataset

Unnamed: 0,show_id,type,title,release_year,rating,duration,listed_in,duration_int,duration_type,genres,platform
0,s1,Movie,Dick Johnson Is Dead,2020,PG-13,90 min,Documentaries,90,min,['Documentary'],Netflix
1,s2,TV Show,Blood & Water,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",2,season,['International'],Netflix
2,s3,TV Show,Ganglands,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",1,season,['Action/Thriller'],Netflix
3,s4,TV Show,Jailbirds New Orleans,2021,TV-MA,1 Season,"Docuseries, Reality TV",1,season,['Documentary'],Netflix
4,s5,TV Show,Kota Factory,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",2,season,['International'],Netflix


# Check the Initial Structure of the DataFrame

In [13]:
# Check the initial structure of the DataFrame
print(df.columns)
print(df.info())
print(df.head())

Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'listed_in', 'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8706 entries, 0 to 8705
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   show_id        8706 non-null   object
 1   type           8706 non-null   object
 2   title          8706 non-null   object
 3   release_year   8706 non-null   int64 
 4   rating         8706 non-null   object
 5   duration       8706 non-null   object
 6   listed_in      8706 non-null   object
 7   duration_int   8706 non-null   int64 
 8   duration_type  8706 non-null   object
 9   genres         8706 non-null   object
 10  platform       8706 non-null   object
dtypes: int64(2), object(9)
memory usage: 748.3+ KB
None
  show_id     type                  title  release_year rating   duration  \
0      s1    Movie   Dick Joh

# Explore Structure and Metadata


In [16]:
# Explore structure and metadata
df.shape         # Check number of rows and columns
df.columns       # List all column names
df.info()        # Get data types and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8706 entries, 0 to 8705
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   show_id        8706 non-null   object
 1   type           8706 non-null   object
 2   title          8706 non-null   object
 3   release_year   8706 non-null   int64 
 4   rating         8706 non-null   object
 5   duration       8706 non-null   object
 6   listed_in      8706 non-null   object
 7   duration_int   8706 non-null   int64 
 8   duration_type  8706 non-null   object
 9   genres         8706 non-null   object
 10  platform       8706 non-null   object
dtypes: int64(2), object(9)
memory usage: 748.3+ KB


# Load Original Dataset from YAML

In [19]:
# Load original dataset from YAML
original_df = pd.read_csv(config['input_data']['file'], usecols=['show_id'])

# Reassign show_id into your working df
df['show_id'] = original_df['show_id']

In [21]:
# Drop unnecessary columns (corrected column names)
columns_to_drop = [ 'director', 'cast', 'country', 'date_added', 'description']

# Use drop safely with existing column names
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Confirm columns after dropping
print("\nColumns after dropping unnecessary ones:")
print(df.columns)



Columns after dropping unnecessary ones:
Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'listed_in', 'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')


In [23]:
# Check missing values in descending order
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
release_year     0
rating           0
duration         0
listed_in        0
duration_int     0
duration_type    0
genres           0
platform         0
dtype: int64

In [25]:
# Number of unique values per column
df.nunique().sort_values()

# Distribution of key categorical columns
df['type'].value_counts(normalize=True)  # Distribution of 'Movie' vs 'TV Show'
df['rating'].value_counts().head(10)     # Most common ratings

rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           802
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
Name: count, dtype: int64

In [27]:
# View a sample of the genres column
df['listed_in'].sample(10)

7110                          Action & Adventure, Dramas
2304                 Documentaries, International Movies
4574              Comedies, Dramas, International Movies
7546                                       Documentaries
7062                        Documentaries, Sports Movies
7189                                    Comedies, Dramas
3671                  International TV Shows, Reality TV
8612    Dramas, Independent Movies, International Movies
2204                      Comedies, International Movies
4225    Dramas, Independent Movies, International Movies
Name: listed_in, dtype: object

In [29]:
# First, ensure the 'genres' column is string type
df['genres'] = df['genres'].astype(str)

# Clean the brackets and quotes from the string
df['genres'] = df['genres'].str.replace("[", "").str.replace("]", "").str.replace("'", "").str.strip()

# Split by comma and select only the first genre
df['genres'] = df['genres'].apply(lambda x: x.split(",")[0].strip())


In [31]:
# Show a random sample of 10 rows with 'title' and 'genres' columns
print(df[['title', 'genres']].sample(10))
print(df[df['genres'].str.contains(r"[\[\]']")])

                           title           genres
6818              Ghost Whispers  Action/Thriller
866             Small Town Crime  Action/Thriller
3190                    Cut Bank            Drama
557                   Winchester           Horror
7855             Samudri Lootere             Kids
3784  March Comes in Like a Lion   Sci-Fi/Fantasy
1960             Kiss the Ground      Documentary
1487             Sakho & Mangane  Action/Thriller
331                Darwin’s Game   Sci-Fi/Fantasy
8094                   The 2000s      Documentary
Empty DataFrame
Columns: [show_id, type, title, release_year, rating, duration, listed_in, duration_int, duration_type, genres, platform]
Index: []


In [33]:
# Remove leading/trailing spaces in column names
df.columns = df.columns.str.strip()  

In [35]:
# Split 'duration' column into duration value and type (e.g., "90 min" → 90 + 'min')
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')  # Convert to numeric
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

Unnamed: 0,duration,duration_int,duration_type
0,90 min,90,min
1,2 Seasons,2,Seasons
2,1 Season,1,Season
3,1 Season,1,Season
4,2 Seasons,2,Seasons


In [37]:
# Frequency of duration types (e.g., 'min', 'Season')
df['duration_type'].value_counts() # Check types of duration (minutes or seasons)

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season') # Normalize to singular
df['duration_type'].value_counts()  # Confirm correction

duration_type
min       6128
Season    2578
Name: count, dtype: int64

In [39]:
# Check for missing values in 'duration_int'
df['duration_int'].isnull().sum()  # Check how many null durations remain
# Keep only rows where 'duration_int' is not null
df = df[df['duration_int'].notna()]  # Filter out rows without valid duration
# Inspect rows where duration was missing
df[df['duration_int'].isna()][['duration']]


Unnamed: 0,duration


In [41]:
# Remove duplicated rows based on 'show_id' and 'title', and 'release_year'
df = df.drop_duplicates(subset=['show_id', 'title', 'release_year'])  # Remove duplicates
print("Number of rows after removing duplicates:", df.shape[0])  # Check number of rows after deduplication

Number of rows after removing duplicates: 8706


In [43]:
# Confirm that duplicates are removed
duplicates = df.duplicated(subset=['show_id', 'title', 'release_year']).sum()  # Count remaining duplicates
print("Remaining duplicates:", duplicates) 

Remaining duplicates: 0


In [45]:
# Check missing values again per column (after cleaning)
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
release_year     0
rating           0
duration         0
listed_in        0
duration_int     0
duration_type    0
genres           0
platform         0
dtype: int64

In [47]:
# Fill missing values with appropriate defaults
# Replace missing 'rating' with "Not Rated"
df['rating'] = df['rating'].fillna("Not Rated")

In [49]:
# Confirm again that no missing values remain after filling
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
release_year     0
rating           0
duration         0
listed_in        0
duration_int     0
duration_type    0
genres           0
platform         0
dtype: int64

In [51]:
# Quick check of distributions on key columns
print(df['duration_type'].value_counts())  # Check distribution of 'duration_type'
print(df['type'].value_counts())           # Check distribution of 'type'
print(df['rating'].value_counts())         # Check distribution of 'rating'


duration_type
min       6128
Season    2578
Name: count, dtype: int64
type
Movie      6128
TV Show    2578
Name: count, dtype: int64
rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           802
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
G            41
Name: count, dtype: int64


In [53]:
# Check unique ratings before applying any replacements
print("\nUnique ratings in the original data:")
print(df['rating'].unique())

# Count how many times each rating appears
print("\nRating value counts:")
print(df['rating'].value_counts(dropna=False))



Unique ratings in the original data:
['PG-13' 'TV-MA' 'PG' 'TV-14' 'TV-PG' 'TV-Y' 'TV-Y7' 'R' 'TV-G' 'G'
 'Unrated']

Rating value counts:
rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           802
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
G            41
Name: count, dtype: int64


In [55]:
# Define the replacements dictionary
rating_replacements = {
    # Movie ratings
    '13+': 'PG-13',
    '16': 'R',
    '16+': 'R',
    '18+': 'R',
    '7+': 'G',                      
    'AGES_16_': 'R',
    'AGES_18_': 'R',
    'ALL': 'G',
    'ALL_AGES': 'G',
    'G': 'G',
    'NC-17': 'R',                   
    'NOT_RATE': 'Unrated',
    'NR': 'Unrated',
    'Not Rated': 'Unrated',
    'PG': 'PG',
    'PG-13': 'PG-13',
    'R': 'R',

    # TV Show ratings
    'TV-14': 'TV-14',
    'TV-G': 'TV-G',
    'TV-MA': 'TV-MA',
    'TV-NR': 'Unrated',
    'TV-PG': 'TV-PG',
    'TV-Y': 'TV-Y',
    'TV-Y7': 'TV-Y7',
    'UNRATED': 'Unrated'
}

In [57]:
def map_rating(row):
    if row['type'] == 'Movie':  # Si es una película
        replacements = {
            '13+': 'PG-13',
            '16+': 'R',
            '16+': 'R',
            '16+': 'R',
            '18+': 'R',
            '7+': 'G',
            'AGES_16+': 'R',
            'AGES_18+': 'R',
            'ALL': 'G',
            'ALL_AGES': 'G',
            'G': 'G',
            'NC-17': 'R',
            'NOT_RATE': 'Unrated',
            'NR': 'Unrated',
            'Not Rated': 'Unrated',
            'PG': 'PG',
            'PG-13': 'PG-13',
            'R': 'R'
        }
    else:  # Si es una serie de TV
        replacements = {
            'TV-14': 'TV-14',
            'TV-G': 'TV-G',
            'TV-MA': 'TV-MA',
            'TV-NR': 'Unrated',
            'TV-PG': 'TV-PG',
            'TV-Y': 'TV-Y',
            'TV-Y7': 'TV-Y7',
            'UNRATED': 'Unrated'
        }
    return replacements.get(row['rating'], row['rating'])  # Keep original if not found

# Apply the mapping
df['rating'] = df.apply(map_rating, axis=1)

In [59]:
# Check the unique values after replacements
print("\nUnique ratings after mapping:")
print(df['rating'].unique())

# Optional: Count how many times each rating appears
print("\nRating value counts after mapping:")
print(df['rating'].value_counts(dropna=False))


Unique ratings after mapping:
['PG-13' 'TV-MA' 'PG' 'TV-14' 'TV-PG' 'TV-Y' 'TV-Y7' 'R' 'TV-G' 'G'
 'Unrated']

Rating value counts after mapping:
rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           802
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
G            41
Name: count, dtype: int64


In [61]:
# Creates a contingency table (Movie or TV Show).
pd.crosstab(df['rating'], df['type'])

type,Movie,TV Show
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
G,41,0
PG,287,0
PG-13,490,0
R,800,2
TV-14,1427,706
TV-G,126,86
TV-MA,2062,1121
TV-PG,540,298
TV-Y,131,169
TV-Y7,144,191


In [63]:
# Strip whitespace to ensure clean matching
df['rating'] = df['rating'].str.strip()

# Apply the replacements because you never know
df['rating'] = df['rating'].replace(rating_replacements)

# Check again unique values
print(df['rating'].unique())
print(df['rating'].value_counts(dropna=False))

['PG-13' 'TV-MA' 'PG' 'TV-14' 'TV-PG' 'TV-Y' 'TV-Y7' 'R' 'TV-G' 'G'
 'Unrated']
rating
TV-MA      3183
TV-14      2133
TV-PG       838
R           802
PG-13       490
TV-Y7       335
TV-Y        300
PG          287
TV-G        212
Unrated      85
G            41
Name: count, dtype: int64


In [65]:
# Standardize 'duration_type' to lowercase for consistency
df['duration_type'] = df['duration_type'].str.lower()

In [67]:
# Calculate the average duration by 'duration_type' (e.g., average movie length)
avg_duration = df.groupby('duration_type')['duration_int'].mean().reset_index()
print("Average duration per type:")
print(avg_duration)

Average duration per type:
  duration_type  duration_int
0           min     99.577187
1        season      1.688518


In [69]:
# Convert to string (if not already), then clean brackets and quotes
df['genres'] = df['genres'].astype(str).str.replace(r"[\[\]']", '', regex=True).str.strip()
# Keep only the first genre if multiple are listed
df['genres'] = df['genres'].apply(lambda x: x.split(',')[0].strip())


In [71]:
# Explodes 'genres' and gets unique values
unique_genres = df['genres'].explode().unique()
print(unique_genres)

['Documentary' 'International' 'Action/Thriller' 'Drama' 'Kids' 'Comedy'
 'Horror' '"Kids TV"' 'Reality' 'Sci-Fi/Fantasy' 'TV Shows' 'Movies'
 'Other']


In [73]:
# Replacing genre labels with unified categories for consistency
genre_replacements = {
    # Drama group
    'TV Dramas': 'Drama',
    'Dramas': 'Drama',
    'Romantic Movies': 'Drama',
    'Romantic TV Shows': 'Drama',
    'TV Mysteries': 'Drama',
    'Classic Movies': 'Drama',
    'Classic & Cult TV': 'Drama',

    # Comedy group
    'TV Comedies': 'Comedy',
    'Comedies': 'Comedy',
    'Stand-Up Comedy': 'Comedy',
    'Stand-Up Comedy & Talk Shows': 'Comedy',

    # Action & Thriller group
    'Action & Adventure': 'Action/Thriller',
    'TV Action & Adventure': 'Action/Thriller',
    'Thrillers': 'Action/Thriller',
    'TV Thrillers': 'Action/Thriller',
    'Crime TV Shows': 'Action/Thriller',
    'Crime': 'Action/Thriller',

    # Documentary group
    'Documentaries': 'Documentary',
    'Docuseries': 'Documentary',

    # International group
    'International TV Shows': 'International',
    'International Movies': 'International',
    'British TV Shows': 'International',
    'Spanish-Language TV Shows': 'International',

    # Kids & Family group
    'Children & Family Movies': 'Kids',
    'Kids TV': 'Kids',

    # Animation / Fantasy / Sci-Fi group
    'TV Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Anime Features': 'Sci-Fi/Fantasy',
    'Anime Series': 'Sci-Fi/Fantasy',
    'Korean TV Shows': 'Sci-Fi/Fantasy',

    # Horror group
    'TV Horror': 'Horror',
    'Horror Movies': 'Horror',

    # Reality group (simplified)
    'Reality TV': 'Reality',
    'Talk Show and Variety': 'Reality',
    'Talk Show': 'Reality',

    # Other group (everything else)
    'Music & Musicals': 'Other',
    'Sports Movies': 'Other',
    'Science & Nature TV': 'Other',
    'Faith & Spirituality': 'Other',
    'LGBTQ Movies': 'Other',
    'Independent Movies': 'Other',
    'Teen TV Shows': 'Other',
    'Cult Movies': 'Other'
}

In [75]:
# Convert 'genres' column to string and replace genres based on the genre_replacements dictionary
df['genres'] = df['genres'].astype(str)
df['genres'] = df['genres'].replace(genre_replacements)
df['genres'] = df['genres'].str.strip()

In [77]:
# Filter out invalid or missing genres, then split them into separate entries
df = df[df['genres'].notna() & (df['genres'] != 'nan') & (df['genres'] != '')]
df['genres'] = df['genres'].str.split(', ')

In [79]:
# Explode the 'genres' column to create a new row for each genre per show
df_exploded = df.explode('genres')
df_exploded['genres'] = df_exploded['genres'].str.strip()
df_exploded = df_exploded[df_exploded['genres'].notna() & (df_exploded['genres'] != 'nan') & (df_exploded['genres'] != '')]

In [81]:
# Count the most common genres and store the result
genre_counts = df_exploded['genres'].value_counts().reset_index()
genre_counts.columns = ['genre', 'count']

In [83]:
print("\nMost common genres after grouping:")
print(genre_counts.head(10))


Most common genres after grouping:
             genre  count
0            Drama   1789
1           Comedy   1689
2  Action/Thriller   1352
3    International   1126
4      Documentary   1040
5             Kids    605
6        "Kids TV"    373
7           Horror    286
8   Sci-Fi/Fantasy    207
9          Reality    117


In [85]:
print(df.columns)

Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'listed_in', 'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')


In [87]:
# Count number of titles by release year
titles_per_year = df['release_year'].value_counts().sort_index().reset_index()
titles_per_year.columns = ['release_year', 'count']
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Show most recent years


🔹 Titles by release year:
    release_year  count
64          2012    229
65          2013    282
66          2014    343
67          2015    548
68          2016    878
69          2017   1015
70          2018   1140
71          2019   1030
72          2020    953
73          2021    592


In [89]:
# Import os library to interact with the file system
import os
print("File updated successfully:", os.path.getmtime('../data/clean/cleaned_netflix_data_with_platform.csv'))

File updated successfully: 1745570234.942207


In [91]:
#Export cleaned DataFrame to CSV for SQL import or future use
df['platform'] = 'Netflix'
df.to_csv('../data/clean/cleaned_netflix_data_with_platform.csv', index=False) #Save