In [None]:
# Libraries needed to read the configuration and the dataset
import yaml
import pandas as pd

# Parse the YAML configuration file that stores the input path
with open("../amazon_config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Read the CSV located at config -> input_data -> file
input_csv_path = config['input_data']['file']
df = pd.read_csv(input_csv_path)
df.head()  # Preview the first 5 rows of the dataset

In [None]:
# Columns that are not used in the rest of the cleaning
columns_to_drop = ['director', 'cast', 'country', 'date_added', 'description']

# errors='ignore' skips any listed column that is not present in the frame
df = df.drop(columns=columns_to_drop, errors='ignore')

# Show which columns remain
print("\nColumns after dropping unnecessary ones:")
print(df.columns)

In [None]:
# Explore structure and metadata.
# Jupyter only echoes the last expression of a cell, so the shape and the
# column index are printed explicitly; otherwise they are evaluated and discarded.
print(df.shape)    # (number of rows, number of columns)
print(df.columns)  # All column names
df.info()          # Data types and non-null counts per column (prints its own report)

In [None]:
# Missing-value count per column, largest first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Number of unique values per column.
# Printed explicitly: only the final expression of a cell is displayed by Jupyter,
# so the earlier results would otherwise be computed and thrown away.
print(df.nunique().sort_values())

# Distribution of key categorical columns
print(df['type'].value_counts(normalize=True))  # Relative frequency of each value in 'type'
df['rating'].value_counts().head(10)            # Most common ratings

In [None]:
# List the distinct rating labels present in the raw data
rating_labels = df['rating'].unique()
print(rating_labels)

In [None]:
# Random sample of 30 raw values from the genres column
df['listed_in'].sample(n=30)

In [None]:
# Cast the raw column to string and trim surrounding whitespace
listed_in_clean = df['listed_in'].astype(str).str.strip()

# Keep only the text before the first comma (the first listed genre), trimmed
df['genres'] = listed_in_clean.map(lambda text: text.partition(',')[0].strip())

# The original 'listed_in' column is no longer needed once 'genres' exists
df = df.drop(columns='listed_in')

# Preview a few random rows to confirm the result
print(df[['genres']].sample(n=10))



In [None]:
# Snapshot of the genre labels BEFORE any replacement is applied
genres_before = df['genres']

print("\nUnique genres in Amazon (before grouping):")
print(genres_before.unique())

# Frequency of each label, including missing values
print("\nGenre value counts (before grouping):")
print(genres_before.value_counts(dropna=False))


In [None]:
# Mapping from detailed genre label -> broader genre group.
# Each key appears exactly once. The original literal listed 'Suspense' under both
# Drama and Action/Thriller and repeated 'Talk Show and Variety'; in a dict literal
# the last occurrence silently wins, so the effective values
# ('Suspense' -> 'Action/Thriller') are kept here and the dead entries removed.
genre_replacements = {
    # Drama group
    'Drama': 'Drama',
    'Romance': 'Drama',
    'Thrillers': 'Drama',
    'Mystery': 'Drama',

    # Comedy group
    'Comedy': 'Comedy',
    'Stand-Up Comedy': 'Comedy',
    'Talk Show and Variety': 'Comedy',
    'Talk Show': 'Comedy',

    # Action & Thriller group
    'Action': 'Action/Thriller',
    'Adventure': 'Action/Thriller',
    'Military and War': 'Action/Thriller',
    'Arthouse': 'Action/Thriller',
    'Suspense': 'Action/Thriller',

    # Documentary group
    'Documentary': 'Documentary',
    'Docuseries': 'Documentary',
    'Unscripted': 'Documentary',

    # International group
    'International': 'International',

    # Kids & Family group
    'Kids': 'Kids',
    'Young Adult Audience': 'Kids',

    # Sci-Fi / Fantasy / Animation group
    'Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Science Fiction': 'Sci-Fi/Fantasy',
    'Fantasy': 'Sci-Fi/Fantasy',
    'Animation': 'Sci-Fi/Fantasy',
    'Anime': 'Sci-Fi/Fantasy',

    # Horror group
    'Horror': 'Horror',

    # Reality / Other group
    'Reality TV': 'Reality',
    'Sports': 'Other',
    'Fitness': 'Other',
    'Music Videos and Concerts': 'Other',
    'Arts': 'Other',
    'Faith and Spirituality': 'Other',
    'LGBTQ': 'Other',
    'Historical': 'Other',
    'Western': 'Other',
}


In [None]:
# Apply the grouping BEFORE printing, so the values shown below really are the
# post-replacement genres (previously this ran ahead of the replacement and
# printed the ungrouped labels under an "after grouping" heading).
# Applying the mapping again later is harmless: every target label is either
# not a key of genre_replacements or is mapped to itself.
df['genres'] = df['genres'].replace(genre_replacements)

# Check unique genres AFTER applying replacements
print("\nUnique genres in Amazon (after grouping):")
print(df['genres'].unique())

In [None]:
# Collapse the detailed genre labels into the groups defined in genre_replacements;
# labels without an entry in the mapping are left unchanged
grouped_genres = df['genres'].replace(to_replace=genre_replacements)
df['genres'] = grouped_genres


In [None]:
# Frequency of each genre value (missing values included) at this point
genre_counts = df['genres'].value_counts(dropna=False)
print("\nGenre value counts (after grouping):")
print(genre_counts)


In [None]:
# Pull the leading number and the following word out of 'duration'
# (e.g., "90 min" -> "90" and "min"); rows that do not match yield NaN in both parts
duration_parts = df['duration'].str.extract(r'(\d+)\s*(\w+)')

# Group 0 is the digit string (converted to a number), group 1 is the unit word
df['duration_int'] = pd.to_numeric(duration_parts[0], errors='coerce')
df['duration_type'] = duration_parts[1]

# Preview the original column next to the two derived ones
df.loc[:, ['duration', 'duration_int', 'duration_type']].head(5)

In [None]:
# Turn each genre string into a list by splitting on ", " (for multi-label classification).
# NOTE(review): after this step every value is a Python list, and lists are written
# to CSV as bracketed text such as "['Drama']" - confirm that is what downstream expects.
genre_lists = df['genres'].str.split(pat=', ')
df['genres'] = genre_lists

In [None]:
# Frequency of duration types (e.g., 'min', 'Season') before normalisation.
# Printed explicitly: a bare expression that is not the last line of the cell
# is never displayed.
print(df['duration_type'].value_counts())

# Standardize plural form to singular ('Seasons' -> 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season')
df['duration_type'].value_counts()  # Confirm correction

In [None]:
# Report how many rows have no parsed numeric duration
print("Rows with missing duration_int:", df['duration_int'].isna().sum())

# Capture the offending rows BEFORE filtering; inspecting them after the filter
# (as the cell did previously) always produced an empty frame.
missing_duration_rows = df.loc[df['duration_int'].isna(), ['duration']]

# Keep only rows with a valid duration; .copy() makes the result an independent
# frame so later column assignments do not act on a slice of the old one
df = df[df['duration_int'].notna()].copy()

# Show the raw 'duration' values of the rows that were removed
missing_duration_rows



In [None]:
# Rows sharing the same show_id, title and release_year count as duplicates;
# the first occurrence of each combination is kept
dedup_keys = ['show_id', 'title', 'release_year']
df = df.drop_duplicates(subset=dedup_keys)
print("Number of rows after removing duplicates:", len(df))

In [None]:
# Count rows still flagged as duplicates on the same key columns
duplicate_mask = df.duplicated(subset=['show_id', 'title', 'release_year'])
duplicates = duplicate_mask.sum()
print("Remaining duplicates:", duplicates)

In [None]:
# Missing-value count per column after the cleaning steps, largest first
missing_after_cleaning = df.isna().sum()
missing_after_cleaning.sort_values(ascending=False)

In [None]:
import numpy as np  # Make sure numpy is imported so that np.nan can be used

# Fill missing values with appropriate defaults:
# a missing 'rating' becomes the literal label "Not Rated"
df['rating'] = df['rating'].fillna(value="Not Rated")

# Report the remaining missing values per column, largest first
remaining_missing = df.isna().sum().sort_values(ascending=False)
print("\nMissing values per column after filling:")
print(remaining_missing)



In [None]:
# Second look at the per-column missing-value counts after filling
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False)

In [None]:
# Print the value distribution of each key column, in this order
for column_name in ('duration_type', 'type', 'rating'):
    print(df[column_name].value_counts())



In [None]:
# Distinct rating labels, then their frequencies (missing values included)
rating_column = df['rating']

print("Unique ratings:")
print(rating_column.unique())

print("\nRating value counts:")
print(rating_column.value_counts(dropna=False))



In [None]:
# Distinct values found in the 'type' column
type_labels = df['type'].unique()
print("Unique types:")
print(type_labels)

In [None]:
# Contingency table: one row per rating label, one column per content type
pd.crosstab(index=df['rating'], columns=df['type'])


In [None]:
# Lookup table from raw rating label -> standardized rating label.
# Labels that have no entry here are left unchanged by map_rating.
rating_replacements = {
    # Movie ratings
    '13+': 'PG-13',
    '16+': 'R',
    '16': 'R',
    '18+': 'R',
    '7+': 'G',
    'AGES_16+': 'R',
    'AGES_16_': 'R',
    'AGES_18_': 'R',
    'ALL': 'G',
    'ALL_AGES': 'G',
    'G': 'G',
    'NC-17': 'R',
    'NR': 'Unrated',
    'Not Rated': 'Unrated',
    'PG': 'PG',
    'PG-13': 'PG-13',
    'R': 'R',

    # TV Show ratings
    'TV-14': 'TV-14',
    'TV-G': 'TV-G',
    'TV-MA': 'TV-MA',
    'TV-NR': 'Unrated',
    'TV-PG': 'TV-PG',
    'TV-Y': 'TV-Y',
    'TV-Y7': 'TV-Y7',
    'UNRATED': 'Unrated',

    # Additional cases
    'AGE_18_': 'R',         # Content rated for 18+ is treated as R
    'NOT_RATE': 'Unrated',  # 'NOT_RATE' is treated as 'Unrated'
    'nan': 'Unrated'        # Matches only the literal string 'nan', not a real NaN value
}


def map_rating(row):
    """Return the standardized rating label for a single row.

    Parameters
    ----------
    row : mapping-like
        One record (for example the Series passed by ``df.apply(..., axis=1)``)
        that exposes the raw label under the ``'rating'`` key.

    Returns
    -------
    The entry of ``rating_replacements`` for the raw label, or the raw label
    itself when the table has no entry for it.
    """
    # Movies and TV shows share the same lookup table; the former if/else on
    # row['type'] executed identical code in both branches, so it was removed.
    raw_rating = row['rating']
    return rating_replacements.get(raw_rating, raw_rating)

# Run map_rating over every row and store the standardized labels
standardized_ratings = df.apply(map_rating, axis='columns')
df['rating'] = standardized_ratings

# Show which rating labels remain after the replacement
print(df['rating'].unique())

In [None]:
# Row-wise pass of map_rating over the frame, written back into 'rating'
remapped_ratings = df.apply(map_rating, axis='columns')
df['rating'] = remapped_ratings



In [None]:
# Distinct labels and their frequencies (missing values included) after mapping
mapped_ratings = df['rating']

print("\nUnique ratings after mapping:")
print(mapped_ratings.unique())

print("\nRating value counts after mapping:")
print(mapped_ratings.value_counts(dropna=False))



In [None]:
# Lower-case every duration unit so the labels are consistent
duration_type_lower = df['duration_type'].str.lower()
df['duration_type'] = duration_type_lower

In [None]:
# Mean of 'duration_int' within each 'duration_type', returned as a regular frame
duration_by_type = df.groupby('duration_type')['duration_int']
avg_duration = duration_by_type.agg('mean').reset_index()
print("Average duration per type:")
print(avg_duration)

In [None]:
# Number of titles for each release year, ordered by year
year_counts = df['release_year'].value_counts().sort_index()
titles_per_year = year_counts.reset_index().set_axis(['release_year', 'count'], axis=1)
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Last 10 rows = most recent years

In [None]:
# Check available columns
print(df.columns)

# Target column order (only applicable once a 'platform' column exists)
correct_order = ['show_id', 'type', 'title', 'release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'genres', 'platform']


def _genres_to_text(value):
    """Return a bracket-free text form of one 'genres' value.

    Lists/tuples are joined with ', '; strings have literal '[' and ']'
    removed and are stripped; any other value (e.g. NaN) is returned unchanged.
    """
    if isinstance(value, (list, tuple)):
        return ', '.join(str(item) for item in value)
    if isinstance(value, str):
        return value.replace('[', '').replace(']', '').strip()
    return value


# Ensure that 'platform' column exists
if 'platform' in df.columns:
    # .copy() so the assignment below writes to an independent frame, not a slice
    df = df[correct_order].copy()
    # 'genres' holds Python lists after the earlier str.split step, and
    # Series.str.replace returns NaN for non-string values - which would erase
    # the column. Convert element-wise instead.
    df['genres'] = df['genres'].apply(_genres_to_text)
else:
    print("'platform' column is missing in the DataFrame")



In [None]:
# Label every row with its platform. This notebook processes the Amazon Prime
# data (the export cell also writes 'Prime'), so the former 'Netflix' label was
# wrong. The `if 'platform' not in df.columns` check that followed could never
# be true right after the assignment and has been removed.
df['platform'] = 'Prime'

# Put the columns in the final export order
correct_order = ['show_id', 'type', 'title', 'release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'genres', 'platform']
df = df[correct_order]



In [None]:
# Show the current column index of the frame
current_columns = df.columns
print(current_columns)


In [None]:
# Final column order for the exported dataset
correct_order = [
    'show_id', 'type', 'title', 'release_year', 'rating',
    'duration', 'duration_int', 'duration_type', 'genres', 'platform',
]
# Select exactly these columns, in this order
df = df.loc[:, correct_order]

In [None]:
import pandas as pd

# Export cleaned DataFrame to CSV for SQL import or future use
df['platform'] = 'Prime'

# index=False keeps the row index out of the file. Without it the index is
# written as an extra unnamed column, which read_csv below loads back as
# 'Unnamed: 0' and which any later export would then persist.
df.to_csv('../data/clean/cleaned_prime_data.csv', index=False)  # Save

# Reload the file that was just written to check the export
df = pd.read_csv('../data/clean/cleaned_prime_data.csv')
print(df['platform'].value_counts())
print(df.shape)

In [None]:
# Write the frame back to the cleaned-data CSV without the row index
df.to_csv(path_or_buf='../data/clean/cleaned_prime_data.csv', index=False)

import os

