In [2]:
# Load YAML configuration file
import yaml

with open("../amazon_config.yaml", "r") as file:
    config = yaml.safe_load(file)

# Load dataset with pandas
import pandas as pd
df = pd.read_csv(config['input_data']['file'])  # Load dataset based on YAML path
df.head()  # Show the first 5 rows of the dataset

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,s3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,s4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,s5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [4]:
# Drop unnecessary columns (corrected column names)
columns_to_drop = [ 'director', 'cast', 'country', 'date_added', 'description']

# Use drop safely with existing column names
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

# Confirm columns after dropping
print("\nColumns after dropping unnecessary ones:")
print(df.columns)


Columns after dropping unnecessary ones:
Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'listed_in'],
      dtype='object')


In [6]:
# Explore structure and metadata
df.shape         # Check number of rows and columns
df.columns       # List all column names
df.info()        # Get data types and non-null counts per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9668 entries, 0 to 9667
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       9668 non-null   object
 1   type          9668 non-null   object
 2   title         9668 non-null   object
 3   release_year  9668 non-null   int64 
 4   rating        9331 non-null   object
 5   duration      9668 non-null   object
 6   listed_in     9668 non-null   object
dtypes: int64(1), object(6)
memory usage: 528.8+ KB


In [8]:
# Check missing values in descending order
df.isnull().sum().sort_values(ascending=False)

rating          337
show_id           0
type              0
title             0
release_year      0
duration          0
listed_in         0
dtype: int64

In [10]:
# Number of unique values per column
df.nunique().sort_values()

# Distribution of key categorical columns
df['type'].value_counts(normalize=True)  # Distribution of 'Movie' vs 'TV Show'
df['rating'].value_counts().head(10)     # Most common ratings

rating
13+      2117
16+      1547
ALL      1268
18+      1243
R        1010
PG-13     393
7+        385
PG        253
NR        223
TV-14     208
Name: count, dtype: int64

In [12]:
print(df['rating'].unique())

[nan '13+' 'ALL' '18+' 'R' 'TV-Y' 'TV-Y7' 'NR' '16+' 'TV-PG' '7+' 'TV-14'
 'TV-NR' 'TV-G' 'PG-13' 'TV-MA' 'G' 'PG' 'NC-17' 'UNRATED' '16' 'AGES_16_'
 'AGES_18_' 'ALL_AGES' 'NOT_RATE']


In [14]:
# View a sample of the genres column
df['listed_in'].sample(30)

222                                               Western
3690                                               Horror
8481                               Comedy, Drama, Romance
4276                                      Drama, Suspense
2088                                             TV Shows
7599                                                Drama
9096                            Science Fiction, Suspense
2245                                               Horror
527                                           Drama, Kids
2447                                          Documentary
9038                                                Drama
1721                                                Drama
5258                               International, Romance
1251                   Action, Adventure, Science Fiction
972                              Special Interest, Sports
4708                                        Action, Drama
8145                                                Drama
1155          

In [16]:
# Ensure the column is string and remove leading/trailing spaces
df['listed_in'] = df['listed_in'].astype(str).str.strip()

# Keep only the first genre if there are multiple (split by comma)
df['genres'] = df['listed_in'].apply(lambda x: x.split(',')[0].strip())

# Optional: Drop the original 'listed_in' column if you no longer need it
df = df.drop(columns=['listed_in'])

# Preview to confirm changes
print(df[['genres']].sample(10))


                         genres
210   Music Videos and Concerts
3009                  Animation
1527                      Drama
5857                  Animation
6154                      Drama
6835                     Action
959                        Kids
2210                     Action
9646                     Action
4877                     Comedy


In [18]:
# Check all unique genres BEFORE replacements
print("\nUnique genres in Amazon (before grouping):")
print(df['genres'].unique())

# Optional: Count how many times each genre appears
print("\nGenre value counts (before grouping):")
print(df['genres'].value_counts(dropna=False))



Unique genres in Amazon (before grouping):
['Comedy' 'Drama' 'Action' 'Documentary' 'Fantasy' 'Adventure' 'Horror'
 'Kids' 'Science Fiction' 'Arts' 'TV Shows' 'Animation' 'Anime'
 'Music Videos and Concerts' 'Fitness' 'Faith and Spirituality'
 'Special Interest' 'Suspense' 'Unscripted' 'Western' 'Arthouse' 'Sports'
 'Military and War' 'International' 'Romance' 'Young Adult Audience'
 'Talk Show and Variety' 'LGBTQ' 'Historical']

Genre value counts (before grouping):
genres
Drama                        2216
Action                       1657
Comedy                       1475
Documentary                   913
Horror                        535
Animation                     498
Arts                          457
Kids                          373
TV Shows                      263
Suspense                      194
Special Interest              188
Arthouse                      132
Romance                       126
Music Videos and Concerts     103
Western                       102
Science Fi

In [20]:
# Replacing genre labels with unified categories for consistency
genre_replacements = {
    # Drama group
    'Drama': 'Drama',
    'Romance': 'Drama',
    'Suspense': 'Drama',
    'Thrillers': 'Drama',  
    'Mystery': 'Drama',    

    # Comedy group
    'Comedy': 'Comedy',
    'Stand-Up Comedy': 'Comedy',
    'Talk Show and Variety': 'Comedy',
    'Talk Show': 'Comedy',
    'Talk Show and Variety': 'Comedy',

    # Action & Thriller group
    'Action': 'Action/Thriller',
    'Adventure': 'Action/Thriller',
    'Military and War': 'Action/Thriller',
    'Arthouse': 'Action/Thriller',
    'Suspense': 'Action/Thriller',

    # Documentary group
    'Documentary': 'Documentary',
    'Docuseries': 'Documentary',
    'Unscripted': 'Documentary',

    # International group
    'International': 'International',

    # Kids & Family group
    'Kids': 'Kids',
    'Young Adult Audience': 'Kids',

    # Sci-Fi / Fantasy / Animation group
    'Sci-Fi & Fantasy': 'Sci-Fi/Fantasy',
    'Science Fiction': 'Sci-Fi/Fantasy',
    'Fantasy': 'Sci-Fi/Fantasy',
    'Animation': 'Sci-Fi/Fantasy',
    'Anime': 'Sci-Fi/Fantasy',

    # Horror group
    'Horror': 'Horror',

    # Reality / Other group
    'Reality TV': 'Reality',
    'Sports': 'Other',
    'Fitness': 'Other',
    'Music Videos and Concerts': 'Other',
    'Arts': 'Other',
    'Faith and Spirituality': 'Other',
    'LGBTQ': 'Other',
    'Historical': 'Other',
    'Western': 'Other',
}


In [22]:
# Check unique genres AFTER applying replacements
print("\nUnique genres in Amazon (after grouping):")
print(df['genres'].unique())


Unique genres in Amazon (after grouping):
['Comedy' 'Drama' 'Action' 'Documentary' 'Fantasy' 'Adventure' 'Horror'
 'Kids' 'Science Fiction' 'Arts' 'TV Shows' 'Animation' 'Anime'
 'Music Videos and Concerts' 'Fitness' 'Faith and Spirituality'
 'Special Interest' 'Suspense' 'Unscripted' 'Western' 'Arthouse' 'Sports'
 'Military and War' 'International' 'Romance' 'Young Adult Audience'
 'Talk Show and Variety' 'LGBTQ' 'Historical']


In [24]:
df['genres'] = df['genres'].replace(genre_replacements)


In [26]:
# Optional: Count genre occurrences after grouping
print("\nGenre value counts (after grouping):")
print(df['genres'].value_counts(dropna=False))



Genre value counts (after grouping):
genres
Drama               2342
Action/Thriller     2059
Comedy              1478
Documentary          942
Other                793
Sci-Fi/Fantasy       645
Horror               535
Kids                 376
TV Shows             263
Special Interest     188
International         47
Name: count, dtype: int64


In [28]:
# Split 'duration' column into duration value and type (e.g., "90 min" → 90 + 'min')
df[['duration_int', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(df['duration_int'], errors='coerce')  # Convert to numeric
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns

Unnamed: 0,duration,duration_int,duration_type
0,113 min,113,min
1,110 min,110,min
2,74 min,74,min
3,69 min,69,min
4,45 min,45,min


In [30]:
# Split genres into lists (for multi-label classification)

df['genres'] = df['genres'].str.split(', ')  # Check results

In [32]:
# Frequency of duration types (e.g., 'min', 'Season')
df['duration_type'].value_counts() # Check types of duration (minutes or seasons)

# Standardize plural form to singular ('Seasons' → 'Season')
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season') # Normalize to singular
df['duration_type'].value_counts()  # Confirm correction

duration_type
min       7814
Season    1854
Name: count, dtype: int64

In [34]:
# Check for missing values in 'duration_int'
df['duration_int'].isnull().sum()  # Check how many null durations remain
# Keep only rows where 'duration_int' is not null
df = df[df['duration_int'].notna()]  # Filter out rows without valid duration
# Inspect rows where duration was missing
df[df['duration_int'].isna()][['duration']]


Unnamed: 0,duration


In [36]:
# Remove duplicated rows based on 'show_id', 'title', and 'release_year'
df = df.drop_duplicates(subset=['show_id', 'title', 'release_year'])  # Remove duplicates
print("Number of rows after removing duplicates:", df.shape[0])  # Check number of rows after deduplication

Number of rows after removing duplicates: 9668


In [38]:
# Confirm that duplicates are removed
duplicates = df.duplicated(subset=['show_id', 'title', 'release_year']).sum()  # Count remaining duplicates
print("Remaining duplicates:", duplicates) 

Remaining duplicates: 0


In [40]:
# Check missing values again per column (after cleaning)
df.isnull().sum().sort_values(ascending=False)

rating           337
show_id            0
type               0
title              0
release_year       0
duration           0
genres             0
duration_int       0
duration_type      0
dtype: int64

In [42]:
import numpy as np  # Asegúrate de importar numpy para poder usar np.nan

# Fill missing values with appropriate defaults

# Replace missing 'rating' with "Not Rated"
df['rating'] = df['rating'].fillna("Not Rated")

# Final check: confirm there are no missing values left
print("\nMissing values per column after filling:")
print(df.isnull().sum().sort_values(ascending=False))



Missing values per column after filling:
show_id          0
type             0
title            0
release_year     0
rating           0
duration         0
genres           0
duration_int     0
duration_type    0
dtype: int64


In [44]:
# Confirm again that no missing values remain after filling
df.isnull().sum().sort_values(ascending=False)

show_id          0
type             0
title            0
release_year     0
rating           0
duration         0
genres           0
duration_int     0
duration_type    0
dtype: int64

In [46]:
# Quick check of distributions on key columns
print(df['duration_type'].value_counts())  # Check distribution of 'duration_type'
print(df['type'].value_counts())           # Check distribution of 'type'
print(df['rating'].value_counts())         # Check distribution of 'rating'


duration_type
min       7814
Season    1854
Name: count, dtype: int64
type
Movie      7814
TV Show    1854
Name: count, dtype: int64
rating
13+          2117
16+          1547
ALL          1268
18+          1243
R            1010
PG-13         393
7+            385
Not Rated     337
PG            253
NR            223
TV-14         208
TV-PG         169
TV-NR         105
G              93
TV-G           81
TV-MA          77
TV-Y           74
TV-Y7          39
UNRATED        33
NC-17           3
AGES_18_        3
NOT_RATE        3
AGES_16_        2
16              1
ALL_AGES        1
Name: count, dtype: int64


In [48]:
# Check the unique values in the 'rating' column
print("Unique ratings:")
print(df['rating'].unique())
# Count how many times each rating appears
print("\nRating value counts:")
print(df['rating'].value_counts(dropna=False))


Unique ratings:
['Not Rated' '13+' 'ALL' '18+' 'R' 'TV-Y' 'TV-Y7' 'NR' '16+' 'TV-PG' '7+'
 'TV-14' 'TV-NR' 'TV-G' 'PG-13' 'TV-MA' 'G' 'PG' 'NC-17' 'UNRATED' '16'
 'AGES_16_' 'AGES_18_' 'ALL_AGES' 'NOT_RATE']

Rating value counts:
rating
13+          2117
16+          1547
ALL          1268
18+          1243
R            1010
PG-13         393
7+            385
Not Rated     337
PG            253
NR            223
TV-14         208
TV-PG         169
TV-NR         105
G              93
TV-G           81
TV-MA          77
TV-Y           74
TV-Y7          39
UNRATED        33
NC-17           3
AGES_18_        3
NOT_RATE        3
AGES_16_        2
16              1
ALL_AGES        1
Name: count, dtype: int64


In [50]:
# Check the unique values in the 'type' column
print("Unique types:")
print(df['type'].unique())

Unique types:
['Movie' 'TV Show']


In [52]:
# Create a table that shows the count of each rating by type
pd.crosstab(df['rating'], df['type'])


type,Movie,TV Show
rating,Unnamed: 1_level_1,Unnamed: 2_level_1
13+,1849,268
16,1,0
16+,1272,275
18+,1097,146
7+,288,97
AGES_16_,2,0
AGES_18_,3,0
ALL,988,280
ALL_AGES,1,0
G,93,0


In [54]:
# Define the replacements dictionary
rating_replacements = {
    # Movie ratings
    '13+': 'PG-13',
    '16+': 'R',
    '16': 'R',
    '18+': 'R',
    '7+': 'G',
    'AGES_16+': 'R',
    'AGES_16_': 'R',
    'AGES_18_': 'R',
    'ALL': 'G',
    'ALL_AGES': 'G',
    'G': 'G',
    'NC-17': 'R',
    'NR': 'Unrated',
    'Not Rated': 'Unrated',
    'PG': 'PG',
    'PG-13': 'PG-13',
    'R': 'R',

    # TV Show ratings
    'TV-14': 'TV-14',
    'TV-G': 'TV-G',
    'TV-MA': 'TV-MA',
    'TV-NR': 'Unrated',
    'TV-PG': 'TV-PG',
    'TV-Y': 'TV-Y',
    'TV-Y7': 'TV-Y7',
    'UNRATED': 'Unrated',
    
    # Additional cases
    'AGE_18_': 'R',    # For content rated for 18+ should be R
    'NOT_RATE': 'Unrated',  # 'NOT_RATE' should be treated as 'Unrated'
    'nan': 'Unrated'   # Treat missing values (NaN) as 'Unrated'
}

# Function to map ratings based on 'type' (Movie or TV Show)
def map_rating(row):
    if row['type'] == 'Movie':  # For Movies
        return rating_replacements.get(row['rating'], row['rating'])  # Replace based on movie ratings
    else:  # For TV Shows
        return rating_replacements.get(row['rating'], row['rating'])  # Replace based on TV show ratings

# Apply the function to the DataFrame
df['rating'] = df.apply(map_rating, axis=1)

# Check the unique values after applying the replacements
print(df['rating'].unique())

['Unrated' 'PG-13' 'G' 'R' 'TV-Y' 'TV-Y7' 'TV-PG' 'TV-14' 'TV-G' 'TV-MA'
 'PG']


In [56]:
# Apply the mapping function to the 'rating' column
df['rating'] = df.apply(map_rating, axis=1)


In [58]:
# Check the unique values after replacements
print("\nUnique ratings after mapping:")
print(df['rating'].unique())

#  Count how many times each rating appears
print("\nRating value counts after mapping:")
print(df['rating'].value_counts(dropna=False))



Unique ratings after mapping:
['Unrated' 'PG-13' 'G' 'R' 'TV-Y' 'TV-Y7' 'TV-PG' 'TV-14' 'TV-G' 'TV-MA'
 'PG']

Rating value counts after mapping:
rating
R          3809
PG-13      2510
G          1747
Unrated     701
PG          253
TV-14       208
TV-PG       169
TV-G         81
TV-MA        77
TV-Y         74
TV-Y7        39
Name: count, dtype: int64


In [60]:
# Standardize 'duration_type' to lowercase for consistency
df['duration_type'] = df['duration_type'].str.lower()

In [62]:
# Calculate the average duration by 'duration_type' (e.g., average movie length)
avg_duration = df.groupby('duration_type')['duration_int'].mean().reset_index()
print("Average duration per type:")
print(avg_duration)

Average duration per type:
  duration_type  duration_int
0           min     91.311876
1        season      1.723301


In [64]:
# Count number of titles by release year
titles_per_year = df['release_year'].value_counts().sort_index().reset_index()
titles_per_year.columns = ['release_year', 'count']
print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # Show most recent years


🔹 Titles by release year:
    release_year  count
90          2012    252
91          2013    289
92          2014    391
93          2015    378
94          2016    521
95          2017    562
96          2018    623
97          2019    929
98          2020    962
99          2021   1442


In [66]:
# Check available columns
print(df.columns)

# Correct column order if platform exists
correct_order = ['show_id', 'type', 'title', 'release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'genres', 'platform']

# Ensure that 'platform' column exists
if 'platform' in df.columns:
    df = df[correct_order]
    df['genres'] = df['genres'].str.replace(r"\[|\]", '', regex=True).str.strip()
else:
    print("'platform' column is missing in the DataFrame")


Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'genres', 'duration_int', 'duration_type'],
      dtype='object')
'platform' column is missing in the DataFrame


In [68]:
# Add the 'platform' column as 'Netflix' if it doesn't exist yet
df['platform'] = 'Netflix'  
if 'platform' not in df.columns:
    df['platform'] = 'Netflix'  

correct_order = ['show_id', 'type', 'title', 'release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'genres', 'platform']
df = df[correct_order]  


In [70]:
# Check the current column names to confirm the order
print(df.columns)


Index(['show_id', 'type', 'title', 'release_year', 'rating', 'duration',
       'duration_int', 'duration_type', 'genres', 'platform'],
      dtype='object')


In [72]:
correct_order = ['show_id', 'type', 'title', 'release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'genres', 'platform']
df = df[correct_order]

In [74]:
#Export cleaned DataFrame to CSV for SQL import or future use
df['platform'] = 'Prime'
# Save the cleaned Prime data to CSV
df.to_csv('../data/clean/cleaned_prime_data.csv') 


In [76]:
# Quick check: read the file back and check platform counts
import pandas as pd
df = pd.read_csv('../data/clean/cleaned_prime_data.csv')
print(df['platform'].value_counts())
print(df.shape)

platform
Prime    9668
Name: count, dtype: int64
(9668, 11)


In [78]:
# Save again without the index column
df.to_csv('../data/clean/cleaned_prime_data.csv', index=False)

import os

