In [None]:
# Libraries used by this notebook
import pandas as pd
import yaml

# Read the project configuration; the dataset path lives under input_data -> file
with open("../netflix_config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Load the dataset named in the configuration and preview its first 5 rows
df = pd.read_csv(config['input_data']['file'])
df.head()

In [None]:
# Check the initial structure of the DataFrame
print(df.columns)
# info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line.
df.info()
print(df.head())

In [None]:
# Explore structure and metadata
# Only the final expression of a cell is echoed, so intermediate results are printed explicitly.
print(df.shape)    # Number of rows and columns
print(df.columns)  # All column names
df.info()          # Data types and non-null counts per column (prints its own report)

In [None]:
# Re-read only the 'show_id' column from the file named in the YAML config
original_df = pd.read_csv(config['input_data']['file'], usecols=['show_id'])

# Put those ids back on the working frame (values are aligned on the row index)
df = df.assign(show_id=original_df['show_id'])

In [None]:
# Columns that are not needed for the rest of the cleaning
columns_to_drop = [ 'director', 'cast', 'country', 'date_added', 'description']

# errors='ignore' skips any listed column that is not present in the frame
df = df.drop(columns=columns_to_drop, errors='ignore')

# Show which columns are left
print("\nColumns after dropping unnecessary ones:")
print(df.columns)


In [None]:
# Missing values per column, largest count first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Number of unique values per column
# (printed explicitly: only the last expression of a cell is echoed)
print(df.nunique().sort_values())

# Distribution of key categorical columns
print(df['type'].value_counts(normalize=True))  # Distribution of 'Movie' vs 'TV Show'
df['rating'].value_counts().head(10)            # Most common ratings

In [None]:
# View a sample of the genres column
# NOTE(review): this cell previously read 'listed_in' while every later cell reads
# 'genres'; confirm against the input file which column actually holds the genres.
df['genres'].sample(10)

In [None]:
# Work on a string version of 'genres' (note: astype(str) turns missing values into the text 'nan')
genres_text = df['genres'].astype(str)

# Remove list punctuation. regex=False makes the replacement literal on every pandas
# version; "[" and "]" are regex metacharacters and the default of `regex` changed in pandas 2.0.
for token in ("[", "]", "'"):
    genres_text = genres_text.str.replace(token, "", regex=False)

# Keep only the first comma-separated genre, without surrounding whitespace
df['genres'] = genres_text.str.strip().str.split(",").str[0].str.strip()


In [None]:
# Spot-check ten random titles next to their cleaned genre
genre_preview = df[['title', 'genres']].sample(10)
print(genre_preview)

# List any rows whose genre still contains brackets or quotes
has_leftover_markup = df['genres'].str.contains(r"[\[\]']")
print(df[has_leftover_markup])



In [None]:
# Remove leading/trailing spaces from every column name
stripped_names = df.columns.str.strip()
df.columns = stripped_names


In [None]:
# Split 'duration' (e.g., "90 min") into its leading digits and the word that follows
duration_parts = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(duration_parts[0], errors='coerce')  # Digits as a number
df['duration_type'] = duration_parts[1]                                  # Unit word
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

In [None]:
# Frequency of duration types (e.g., 'min', 'Season') before normalising.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df['duration_type'].value_counts())

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season')
df['duration_type'].value_counts()  # Confirm correction

In [None]:
# Flag rows whose duration could not be parsed into a number
missing_duration = df['duration_int'].isna()
print("Rows without a valid duration:", missing_duration.sum())

# Capture the affected rows BEFORE filtering; inspecting them afterwards would always be empty
rows_missing_duration = df.loc[missing_duration, ['duration']]

# Keep only rows with a valid duration; .copy() gives an independent frame so later
# column assignments do not raise SettingWithCopyWarning
df = df.loc[~missing_duration].copy()

# Show the rows that were removed
rows_missing_duration


In [None]:
# Drop rows that repeat the same (show_id, title, release_year) combination.
# NOTE(review): because 'show_id' is part of the key, two rows only count as duplicates
# when their show_id matches as well — confirm that this is the intended rule.
dedup_keys = ['show_id', 'title', 'release_year']
df = df.drop_duplicates(subset=dedup_keys)
print("Number of rows after removing duplicates:", df.shape[0])

In [None]:
# Verify that no repeated (show_id, title, release_year) combinations are left
still_duplicated = df.duplicated(subset=['show_id', 'title', 'release_year'])
duplicates = still_duplicated.sum()
print("Remaining duplicates:", duplicates)

In [None]:
# Missing values per column after the cleaning steps so far, largest count first
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Give titles without a rating the explicit placeholder "Not Rated"
rating_filled = df['rating'].fillna("Not Rated")
df['rating'] = rating_filled

In [None]:
# Re-count missing values per column after filling 'rating', largest count first
remaining_nulls = df.isna().sum()
remaining_nulls.sort_values(ascending=False)

In [None]:
# Print the value distribution of each key categorical column in turn
for column in ('duration_type', 'type', 'rating'):
    print(df[column].value_counts())


In [None]:
# Inspect the raw rating labels before any replacement is applied
rating_column = df['rating']

print("\nUnique ratings in the original data:")
print(rating_column.unique())

# Frequency of every label, missing values included
print("\nRating value counts:")
print(rating_column.value_counts(dropna=False))


In [None]:
# Canonical rating label -> every raw spelling that should collapse into it.
# Labels that already are canonical list themselves so the mapping stays explicit.
_RATING_GROUPS = {
    # Movie ratings
    'PG-13': ('13+', 'PG-13'),
    'R': ('16', '16+', '18+', 'AGES_16_', 'AGES_18_', 'NC-17', 'R'),  # NC-17 is grouped into R
    'G': ('7+', 'ALL', 'ALL_AGES', 'G'),                              # '7+' is assumed to be G
    'PG': ('PG',),
    'Unrated': ('NOT_RATE', 'NR', 'Not Rated', 'TV-NR', 'UNRATED'),

    # TV Show ratings
    'TV-14': ('TV-14',),
    'TV-G': ('TV-G',),
    'TV-MA': ('TV-MA',),
    'TV-PG': ('TV-PG',),
    'TV-Y': ('TV-Y',),
    'TV-Y7': ('TV-Y7',),
}

# Flatten the groups into the raw -> canonical dictionary used for replacement
rating_replacements = {
    raw_label: canonical
    for canonical, raw_labels in _RATING_GROUPS.items()
    for raw_label in raw_labels
}

In [None]:
# Raw -> canonical rating labels for rows whose type is 'Movie'.
# Built once at definition time instead of on every row.
_MOVIE_RATING_MAP = {
    '13+': 'PG-13',
    '16': 'R',          # previously shadowed by repeated '16+' keys
    '16+': 'R',
    '18+': 'R',
    '7+': 'G',
    'AGES_16_': 'R',    # spelling used by rating_replacements
    'AGES_18_': 'R',
    'AGES_16+': 'R',    # spelling used by the earlier version of this function
    'AGES_18+': 'R',
    'ALL': 'G',
    'ALL_AGES': 'G',
    'G': 'G',
    'NC-17': 'R',
    'NOT_RATE': 'Unrated',
    'NR': 'Unrated',
    'Not Rated': 'Unrated',
    'PG': 'PG',
    'PG-13': 'PG-13',
    'R': 'R',
}

# Raw -> canonical rating labels for every other type (TV shows).
_TV_RATING_MAP = {
    'TV-14': 'TV-14',
    'TV-G': 'TV-G',
    'TV-MA': 'TV-MA',
    'TV-NR': 'Unrated',
    'TV-PG': 'TV-PG',
    'TV-Y': 'TV-Y',
    'TV-Y7': 'TV-Y7',
    'UNRATED': 'Unrated',
}


def map_rating(row):
    """Return the canonical rating for one row, chosen by the row's type.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['type']`` and ``row['rating']`` (e.g. a DataFrame
        row passed by ``df.apply(..., axis=1)`` or a plain dict).

    Returns
    -------
    The mapped rating label. A rating that is not listed in the table for the
    row's type is returned unchanged.
    """
    # Movies use the film table; anything else is treated as a TV show
    lookup = _MOVIE_RATING_MAP if row['type'] == 'Movie' else _TV_RATING_MAP
    return lookup.get(row['rating'], row['rating'])  # Keep original if not found

# Run map_rating once per row (axis=1) and store the result as the new 'rating' column
mapped_ratings = df.apply(map_rating, axis=1)
df['rating'] = mapped_ratings

In [None]:
# Inspect the rating labels that remain once the mapping has been applied
mapped_rating_column = df['rating']

print("\nUnique ratings after mapping:")
print(mapped_rating_column.unique())

# Frequency of every label, missing values included
print("\nRating value counts after mapping:")
print(mapped_rating_column.value_counts(dropna=False))

In [None]:
# Cross-tabulate ratings (rows) against title type (columns)
pd.crosstab(index=df['rating'], columns=df['type'])

In [None]:
# Trim surrounding whitespace so labels match the dictionary keys, then run the
# type-independent replacement table as a second pass
df['rating'] = df['rating'].str.strip().replace(rating_replacements)

# Inspect the resulting labels and their frequencies (missing values included)
final_ratings = df['rating']
print(final_ratings.unique())
print(final_ratings.value_counts(dropna=False))

In [None]:
# Use lowercase unit names in 'duration_type' for consistency
lowercased_units = df['duration_type'].str.lower()
df['duration_type'] = lowercased_units

In [None]:
# Mean of 'duration_int' within each 'duration_type'; as_index=False keeps the
# grouping key as a regular column instead of the index
avg_duration = df.groupby('duration_type', as_index=False)['duration_int'].mean()
print("Average duration per type:")
print(avg_duration)

In [None]:
# Cast to string and remove any brackets or single quotes in one regex pass
genres_clean = df['genres'].astype(str).str.replace(r"[\[\]']", '', regex=True).str.strip()
# Keep only the text before the first comma, without surrounding whitespace
df['genres'] = genres_clean.str.split(',').str[0].str.strip()


In [None]:
# Flatten any list-valued entries, then collect the distinct genre labels
flattened_genres = df['genres'].explode()
unique_genres = flattened_genres.unique()
print(unique_genres)

In [None]:
# Maps each detailed genre label to a broader analysis group.
# Labels that are not listed here are left unchanged by the replacement step.
genre_replacements = {
    # Drama group
    'TV Dramas': 'Drama',
    'Dramas': 'Drama',
    'Romantic Movies': 'Drama',
    'Romantic TV Shows': 'Drama',
    'TV Mysteries': 'Drama',
    'Classic Movies': 'Drama',
    'Classic & Cult TV': 'Drama',

    # Comedy group
    'TV Comedies': 'Comedy',
    'Comedies': 'Comedy',
    'Stand-Up Comedy': 'Comedy',
    'Stand-Up Comedy & Talk Shows': 'Comedy',

    # Action & Thriller group
    'Action & Adventure': 'Action/Thriller',
    'TV Action & Adventure': 'Action/Thriller',
    'Thrillers': 'Action/Thriller',
    'TV Thrillers': 'Action/Thriller',
    'Crime TV Shows': 'Action/Thriller',
    'Crime': 'Action/Thriller',

    # Documentary group
    'Documentaries': 'Documentary',
    'Docuseries': 'Documentary',

    # International group (regional / language-based categories)
    'International TV Shows': 'International',
    'International Movies': 'International',
    'British TV Shows': 'International',
    'Spanish-Language TV Shows': 'International',
    # Regional category like the British and Spanish-language ones above;
    # it was previously filed under Sci-Fi/Fantasy.
    'Korean TV Shows': 'International',

    # Kids & Family group
    'Children & Family Movies': 'Kids',
    'Kids TV': 'Kids',

    # Animation / Fantasy / Sci-Fi group
    'TV Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Anime Features': 'Sci-Fi/Fantasy',
    'Anime Series': 'Sci-Fi/Fantasy',

    # Horror group
    'TV Horror': 'Horror',
    'Horror Movies': 'Horror',

    # Reality group (simplified)
    'Reality TV': 'Reality',
    'Talk Show and Variety': 'Reality',
    'Talk Show': 'Reality',

    # Other group (everything else)
    'Music & Musicals': 'Other',
    'Sports Movies': 'Other',
    'Science & Nature TV': 'Other',
    'Faith & Spirituality': 'Other',
    'LGBTQ Movies': 'Other',
    'Independent Movies': 'Other',
    'Teen TV Shows': 'Other',
    'Cult Movies': 'Other'
}


In [None]:
# Cast to string, collapse detailed labels into their groups, then trim whitespace
grouped_genres = df['genres'].astype(str).replace(genre_replacements)
df['genres'] = grouped_genres.str.strip()


In [None]:
# Drop rows without a usable genre: real missing values, the literal text 'nan'
# produced by astype(str), and empty strings
has_genre = df['genres'].notna() & (df['genres'] != 'nan') & (df['genres'] != '')
# .copy() gives an independent frame so later column assignments do not raise
# SettingWithCopyWarning
df = df[has_genre].copy()
# 'genres' intentionally stays a plain string. The earlier cleaning keeps only the text
# before the first comma, so splitting on ', ' would only wrap each value in a
# one-element list, which the CSV export would then write out as "['Drama']".

In [None]:
# One row per genre value, with surrounding whitespace removed
df_exploded = df.explode('genres')
df_exploded['genres'] = df_exploded['genres'].str.strip()

# Discard rows whose genre is missing, the literal text 'nan', or empty
genre_is_usable = df_exploded['genres'].notna() & ~df_exploded['genres'].isin(['nan', ''])
df_exploded = df_exploded[genre_is_usable]


In [None]:
# Titles per genre group as a two-column frame: 'genre' and 'count'
genre_counts = (
    df_exploded['genres']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

In [None]:
# Show the ten most frequent genre groups
top_genres = genre_counts.head(10)
print("\nMost common genres after grouping:")
print(top_genres)

In [None]:
# List the columns present on the working frame at this point
current_columns = df.columns
print(current_columns)


In [None]:
# Titles per release year, ordered by year, as columns 'release_year' and 'count'
titles_per_year = (
    df['release_year']
    .value_counts()
    .sort_index()
    .rename_axis('release_year')
    .reset_index(name='count')
)
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Last ten rows, i.e. the most recent years

In [None]:
import os

# Export cleaned DataFrame to CSV for SQL import or future use
output_path = '../data/clean/cleaned_netflix_data_with_platform.csv'
df['platform'] = 'Netflix'
df.to_csv(output_path, index=False)  # Save

# Read the modification time only AFTER writing. Checking first would report the
# previous export's timestamp, or raise FileNotFoundError when no file exists yet.
print("File updated successfully:", os.path.getmtime(output_path))
