# Netflix Metadata Enrichment
This notebook demonstrates the process of enriching the Netflix dataset with additional metadata, engineered features, text analysis, and data quality improvements. Each step is explained below.

In [9]:
import pandas as pd
import requests
import time
import os
import nltk
from textblob import TextBlob

# Ensure nltk data path points to a persistent location
nltk_data_dir = os.path.expanduser('~/nltk_data')
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)
# Optionally set the environment variable for subprocesses
os.environ.setdefault('NLTK_DATA', nltk_data_dir)

# Load the cleaned dataset
df = pd.read_csv('data/netflix_titles_cleaned.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,month_name_added,day_added,duration_minutes,duration_seasons
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,No Data,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021,9,September,25,90,1
1,s2,TV Show,Blood & Water,No Data,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021,9,September,24,98,2
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021,9,September,24,98,1
3,s4,TV Show,Jailbirds New Orleans,No Data,No Data,United States,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021,9,September,24,98,1
4,s5,TV Show,Kota Factory,No Data,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021,9,September,24,98,2


In [2]:
OMDB_API_KEY = '6fdfb278'

def fetch_omdb_data(title, year=None):
    """Fetch metadata from OMDb API for a given title (and optional year)."""
    base_url = 'http://www.omdbapi.com/'
    params = {
        'apikey': OMDB_API_KEY,
        't': title,
    }
    if year:
        params['y'] = year
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        return None

In [None]:
# Quick NLTK/TextBlob availability test and downloader
# This cell will ensure required NLTK corpora/tokenizers are available
required = ['punkt', 'brown']
for pkg in required:
    try:
        nltk.data.find(pkg)
        print(f'{pkg} already available')
    except LookupError:
        print(f'{pkg} not found, downloading to', nltk_data_dir)
        nltk.download(pkg, download_dir=nltk_data_dir)

# Test TextBlob noun_phrases and sentiment on a sample
test_text = 'A thrilling story of friendship and adventure in a small town.'
tb = TextBlob(test_text)
print('Noun phrases:', tb.noun_phrases)
print('Sentiment:', tb.sentiment)


## External Metadata Enrichment

In [3]:
# Path to save progress
progress_path = 'data/enriched_metadata_progress.csv'
DAILY_LIMIT = 1000

# Load already processed titles if file exists
if os.path.exists(progress_path):
    enriched_df = pd.read_csv(progress_path)
    processed_titles = set(enriched_df['title'])
else:
    enriched_df = pd.DataFrame()
    processed_titles = set()

# Filter unprocessed rows
to_process = df[~df['title'].isin(processed_titles)]

batch = to_process.head(DAILY_LIMIT)

In [5]:
enriched_data = []
for idx, row in batch.iterrows():
    omdb_info = fetch_omdb_data(row['title'], row.get('release_year'))
    # Define defaults for all expected fields
    ratings = []
    imdb_rating = None
    imdb_votes = None
    value = None
    metascore = None
    awards = None
    language = None
    plot = None
    poster = None
    if omdb_info and omdb_info.get('Response') == 'True':
        ratings = omdb_info.get('Ratings', []) or []
        imdb_rating = omdb_info.get('imdbRating') or None
        imdb_votes_raw = omdb_info.get('imdbVotes') or None
        try:
            imdb_votes = int(str(imdb_votes_raw).replace(',', '')) if imdb_votes_raw not in (None, '') else None
        except (ValueError, TypeError):
            imdb_votes = None
        value = omdb_info.get('Value') if 'Value' in omdb_info else None
        metascore = omdb_info.get('Metascore') or None
        awards = omdb_info.get('Awards') or None
        language = omdb_info.get('Language') or None
        plot = omdb_info.get('Plot') or None
        poster = omdb_info.get('Poster') or None
    enriched_data.append({
        'title': row['title'],
        'ratings': ratings,
        'imdb_rating': imdb_rating,
        'imdb_votes': imdb_votes,
        'value': value,
        'metascore': metascore,
        'awards': awards,
        'language': language,
        'short_plot': plot,
        'poster': poster,
    })
    time.sleep(0.2)  # Be polite to the API

# Convert to DataFrame and append to progress
batch_enriched_df = pd.DataFrame(enriched_data)
enriched_df = pd.concat([enriched_df, batch_enriched_df], ignore_index=True)
enriched_df.to_csv(progress_path, index=False)

# Merge with original for analysis
df_merged = df.merge(enriched_df, on='title', how='left')
df_merged.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,duration_seasons,ratings,imdb_rating,imdb_votes,value,metascore,language,short_plot,poster,awards
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,No Data,United States,2021-09-25,2020,PG-13,90 min,...,1,"[{'Source': 'Internet Movie Database', 'Value'...",7.4,7464.0,,89.0,English,A daughter helps her father prepare for the en...,https://m.media-amazon.com/images/M/MV5BOTQyN2...,
1,s2,TV Show,Blood & Water,No Data,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,...,2,[],,,,,,,https://m.media-amazon.com/images/M/MV5BNGE5YW...,
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,...,1,"[{'Source': 'Internet Movie Database', 'Value'...",7.2,4880.0,,,French,"Mehdi, a qualified robber, and Liana, an appre...",https://m.media-amazon.com/images/M/MV5BYWJkOW...,
3,s4,TV Show,Jailbirds New Orleans,No Data,No Data,United States,2021-09-24,2021,TV-MA,1 Season,...,1,"[{'Source': 'Internet Movie Database', 'Value'...",6.5,332.0,,,English,"Feuds, flirtations and toilet talk go down amo...",https://m.media-amazon.com/images/M/MV5BNTI0OG...,
4,s5,TV Show,Kota Factory,No Data,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,...,2,"[{'Source': 'Internet Movie Database', 'Value'...",,,,,English,"Feuds, flirtations and toilet talk go down amo...",https://m.media-amazon.com/images/M/MV5BNTI0OG...,


## Feature Engineering and Geographical Enrichment

In [6]:
# Get all unique country names from the 'country' column
unique_countries = set()
for entry in df['country'].dropna():
    for country in entry.split(','):
        unique_countries.add(country.strip())

# Convert to a sorted list for easier viewing
unique_countries = sorted(unique_countries)
print(unique_countries)

['', 'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bangladesh', 'Belarus', 'Belgium', 'Bermuda', 'Botswana', 'Brazil', 'Bulgaria', 'Burkina Faso', 'Cambodia', 'Cameroon', 'Canada', 'Cayman Islands', 'Chile', 'China', 'Colombia', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'East Germany', 'Ecuador', 'Egypt', 'Ethiopia', 'Finland', 'France', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kuwait', 'Latvia', 'Lebanon', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Malawi', 'Malaysia', 'Malta', 'Mauritius', 'Mexico', 'Mongolia', 'Montenegro', 'Morocco', 'Mozambique', 'Namibia', 'Nepal', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Norway', 'Pakistan', 'Palestine', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Po

In [7]:
# Create a flag for Netflix Originals (if 'Netflix' is in the director or production columns, adjust as needed)
df['is_original'] = df['director'].fillna('').str.contains('Netflix', case=False)

# Create a flag for kids content based on rating
def is_kids(rating):
    kids_ratings = ['G', 'TV-Y', 'TV-Y7', 'TV-G', 'TV-PG']
    return rating in kids_ratings
df['is_kids_content'] = df['rating'].apply(is_kids)

# Calculate content age (current year - release year)
current_year = pd.Timestamp.now().year
df['content_age'] = current_year - df['release_year']

# Group countries into regions
region_map = {
    '': 'Unknown',
    'Afghanistan': 'Asia',
    'Albania': 'Europe',
    'Algeria': 'Africa',
    'Angola': 'Africa',
    'Argentina': 'South America',
    'Armenia': 'Asia',
    'Australia': 'Oceania',
    'Austria': 'Europe',
    'Azerbaijan': 'Asia',
    'Bahamas': 'North America',
    'Bangladesh': 'Asia',
    'Belarus': 'Europe',
    'Belgium': 'Europe',
    'Bermuda': 'North America',
    'Botswana': 'Africa',
    'Brazil': 'South America',
    'Bulgaria': 'Europe',
    'Burkina Faso': 'Africa',
    'Cambodia': 'Asia',
    'Cameroon': 'Africa',
    'Canada': 'North America',
    'Cayman Islands': 'North America',
    'Chile': 'South America',
    'China': 'Asia',
    'Colombia': 'South America',
    'Croatia': 'Europe',
    'Cuba': 'North America',
    'Cyprus': 'Asia',
    'Czech Republic': 'Europe',
    'Denmark': 'Europe',
    'Dominican Republic': 'North America',
    'East Germany': 'Europe',
    'Ecuador': 'South America',
    'Egypt': 'Africa',
    'Ethiopia': 'Africa',
    'Finland': 'Europe',
    'France': 'Europe',
    'Georgia': 'Asia',
    'Germany': 'Europe',
    'Ghana': 'Africa',
    'Greece': 'Europe',
    'Guatemala': 'North America',
    'Hong Kong': 'Asia',
    'Hungary': 'Europe',
    'Iceland': 'Europe',
    'India': 'Asia',
    'Indonesia': 'Asia',
    'Iran': 'Asia',
    'Iraq': 'Asia',
    'Ireland': 'Europe',
    'Israel': 'Asia',
    'Italy': 'Europe',
    'Jamaica': 'North America',
    'Japan': 'Asia',
    'Jordan': 'Asia',
    'Kazakhstan': 'Asia',
    'Kenya': 'Africa',
    'Kuwait': 'Asia',
    'Latvia': 'Europe',
    'Lebanon': 'Asia',
    'Liechtenstein': 'Europe',
    'Lithuania': 'Europe',
    'Luxembourg': 'Europe',
    'Malawi': 'Africa',
    'Malaysia': 'Asia',
    'Malta': 'Europe',
    'Mauritius': 'Africa',
    'Mexico': 'North America',
    'Mongolia': 'Asia',
    'Montenegro': 'Europe',
    'Morocco': 'Africa',
    'Mozambique': 'Africa',
    'Namibia': 'Africa',
    'Nepal': 'Asia',
    'Netherlands': 'Europe',
    'New Zealand': 'Oceania',
    'Nicaragua': 'North America',
    'Nigeria': 'Africa',
    'Norway': 'Europe',
    'Pakistan': 'Asia',
    'Palestine': 'Asia',
    'Panama': 'North America',
    'Paraguay': 'South America',
    'Peru': 'South America',
    'Philippines': 'Asia',
    'Poland': 'Europe',
    'Portugal': 'Europe',
    'Puerto Rico': 'North America',
    'Qatar': 'Asia',
    'Romania': 'Europe',
    'Russia': 'Europe',
    'Samoa': 'Oceania',
    'Saudi Arabia': 'Asia',
    'Senegal': 'Africa',
    'Serbia': 'Europe',
    'Singapore': 'Asia',
    'Slovakia': 'Europe',
    'Slovenia': 'Europe',
    'Somalia': 'Africa',
    'South Africa': 'Africa',
    'South Korea': 'Asia',
    'Soviet Union': 'Europe',
    'Spain': 'Europe',
    'Sri Lanka': 'Asia',
    'Sudan': 'Africa',
    'Sweden': 'Europe',
    'Switzerland': 'Europe',
    'Syria': 'Asia',
    'Taiwan': 'Asia',
    'Thailand': 'Asia',
    'Turkey': 'Asia',
    'Uganda': 'Africa',
    'Ukraine': 'Europe',
    'United Arab Emirates': 'Asia',
    'United Kingdom': 'Europe',
    'United States': 'North America',
    'Uruguay': 'South America',
    'Vatican City': 'Europe',
    'Venezuela': 'South America',
    'Vietnam': 'Asia',
    'West Germany': 'Europe',
    'Zimbabwe': 'Africa'
}
def map_region(country):
    if pd.isna(country):
        return None
    for key in region_map:
        if key in country:
            return region_map[key]
    return 'Other'
df['region'] = df['country'].apply(map_region)

df[['title', 'year_added', 'month_added', 'is_original', 'is_kids_content', 'content_age', 'region']].head()

Unnamed: 0,title,year_added,month_added,is_original,is_kids_content,content_age,region
0,Dick Johnson Is Dead,2021,9,False,False,5,Unknown
1,Blood & Water,2021,9,False,False,4,Unknown
2,Ganglands,2021,9,False,False,4,Unknown
3,Jailbirds New Orleans,2021,9,False,False,4,Unknown
4,Kota Factory,2021,9,False,False,4,Unknown


## Text Enrichment

In [10]:
# Run Text Enrichment on a small sample (first 5 rows) to validate TextBlob usage
sample = df.head(5).copy()
def extract_keywords(text):
    if pd.isna(text) or text == '':
        return []
    blob = TextBlob(text)
    return list(blob.noun_phrases)

def get_sentiment(text):
    if pd.isna(text) or text == '':
        return 0.0, 0.0
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

sample['description_keywords'] = sample['description'].apply(extract_keywords)
sample[['desc_polarity', 'desc_subjectivity']] = sample['description'].apply(lambda x: pd.Series(get_sentiment(x)))
sample[['title', 'description', 'description_keywords', 'desc_polarity', 'desc_subjectivity']]


Unnamed: 0,title,description,description_keywords,desc_polarity,desc_subjectivity
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...","[father nears, kirsten johnson, comical ways]",0.333333,1.0
1,Blood & Water,"After crossing paths at a party, a Cape Town t...",[cape town teen],0.0,0.0
2,Ganglands,To protect his family from a powerful drug lor...,"[powerful drug lord, skilled thief, mehdi, exp...",-0.05,0.725
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...","[feuds, toilet talk, orleans, justice center, ...",-0.030505,0.318687
4,Kota Factory,In a city of coaching centers known to train I...,"[india, ’ s, collegiate minds, unexceptional s...",0.0,0.0


In [11]:
# Extract keywords using TextBlob noun phrases
def extract_keywords(text):
    if pd.isna(text):
        return []
    blob = TextBlob(text)
    return blob.noun_phrases

df['description_keywords'] = df['description'].apply(extract_keywords)

#nltk.download('punkt_tab', download_dir=nltk_data_dir) Calculate sentiment polarity and subjectivity
def get_sentiment(text):
    if pd.isna(text):
        return 0.0, 0.0
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

df[['desc_polarity', 'desc_subjectivity']] = df['description'].apply(lambda x: pd.Series(get_sentiment(x)))

df[['title', 'description_keywords', 'desc_polarity', 'desc_subjectivity']].head()

Unnamed: 0,title,description_keywords,desc_polarity,desc_subjectivity
0,Dick Johnson Is Dead,"[father nears, kirsten johnson, comical ways]",0.333333,1.0
1,Blood & Water,[cape town teen],0.0,0.0
2,Ganglands,"[powerful drug lord, skilled thief, mehdi, exp...",-0.05,0.725
3,Jailbirds New Orleans,"[feuds, toilet talk, orleans, justice center, ...",-0.030505,0.318687
4,Kota Factory,"[india, ’ s, collegiate minds, unexceptional s...",0.0,0.0


## Data Quality Improvements

In [12]:
# Standardize genre, country, and rating fields
if 'listed_in' in df.columns:
    df['genre_standardized'] = df['listed_in'].str.title().str.strip()
df['country_standardized'] = df['country'].str.title().str.strip()
df['rating_standardized'] = df['rating'].str.upper().str.strip()

# Fill missing values for key fields
df['director'] = df['director'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')
df['country'] = df['country'].fillna('Unknown')
df['rating'] = df['rating'].fillna('Unknown')
df['description'] = df['description'].fillna('')

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,month_name_added,day_added,duration_minutes,duration_seasons,description_keywords,desc_polarity,desc_subjectivity,genre_standardized,country_standardized,rating_standardized
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,No Data,United States,2021-09-25,2020,PG-13,90 min,...,September,25,90,1,"[father nears, kirsten johnson, comical ways]",0.333333,1.0,Documentaries,United States,PG-13
1,s2,TV Show,Blood & Water,No Data,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,...,September,24,98,2,[cape town teen],0.0,0.0,"International Tv Shows, Tv Dramas, Tv Mysteries",South Africa,TV-MA
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,2021-09-24,2021,TV-MA,1 Season,...,September,24,98,1,"[powerful drug lord, skilled thief, mehdi, exp...",-0.05,0.725,"Crime Tv Shows, International Tv Shows, Tv Act...",United States,TV-MA
3,s4,TV Show,Jailbirds New Orleans,No Data,No Data,United States,2021-09-24,2021,TV-MA,1 Season,...,September,24,98,1,"[feuds, toilet talk, orleans, justice center, ...",-0.030505,0.318687,"Docuseries, Reality Tv",United States,TV-MA
4,s5,TV Show,Kota Factory,No Data,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,...,September,24,98,2,"[india, ’ s, collegiate minds, unexceptional s...",0.0,0.0,"International Tv Shows, Romantic Tv Shows, Tv ...",India,TV-MA


## Save the Enriched Dataset

In [13]:
# Save the Enriched Dataset
df.to_csv('data/netflix_titles_enriched.csv', index=False)
print('Enriched dataset saved to data/netflix_titles_enriched.csv')

Enriched dataset saved to data/netflix_titles_enriched.csv


# Summary and Next Steps

This notebook has enriched the Netflix dataset with external metadata, engineered features, text analysis, and data quality improvements. The enriched dataset is now ready for advanced analytics and machine learning applications.

## Next Steps: Project 2 - Netflix Content Recommendation Systems
You can now move on to Project 2, where you will use this enriched dataset to build and evaluate content recommendation systems for Netflix. For more details and code, see the repository:

[Netflix Content Recommendation Systems (GitHub)](https://github.com/hongwei-cai/Netflix-Content-Recommendation-Systems)

Potential directions include:
- Building collaborative filtering or content-based recommenders
- Using enriched metadata for hybrid recommendation models
- Evaluating recommendation quality with user or content metrics
- Visualizing recommendations and user/content relationships