In [3]:
import pandas as pd
import numpy as np

In [4]:
def clean_data(df, placeholder='Unknown'):
    """Return a de-duplicated, gap-filled copy of ``df`` with snake_case column names.

    Steps, in order:

    1. drop exact duplicate rows;
    2. drop rows in which every value is missing;
    3. replace missing values in non-numeric columns with ``placeholder``;
    4. strip, lowercase and underscore-separate the column names.

    Numeric columns keep their NaNs. Writing a string placeholder into them
    would turn the whole column into ``object`` dtype and break any later
    arithmetic or aggregation on it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. The caller's object is left untouched.
    placeholder : str, default 'Unknown'
        Value written into missing cells of non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame. The row index is not reset.
    """
    cleaned = df.drop_duplicates().dropna(how='all')

    # Only non-numeric columns receive the string placeholder.
    text_columns = cleaned.select_dtypes(exclude='number').columns
    if len(text_columns) > 0:
        cleaned = cleaned.fillna({column: placeholder for column in text_columns})

    # regex=False: the space is a literal character, not a pattern.
    cleaned.columns = (
        cleaned.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)
    )
    return cleaned

In [5]:
# Load the raw streaming-catalogue exports; every file is read with the latin1 codec.
amazon_prime_titles = pd.read_csv('data/amazon_prime_titles.csv', encoding='latin1')
disney_all_content = pd.read_csv('data/disney_all_content.csv', encoding='latin1')
disney_plus_titles = pd.read_csv('data/disney_plus_titles.csv', encoding='latin1')
hulu_all_content = pd.read_csv('data/hulu_all_content.csv', encoding='latin1')
hulu_titles = pd.read_csv('data/hulu_titles.csv', encoding='latin1')
netflix_all_content = pd.read_csv('data/netflix_all_content.csv', encoding='latin1')
# `netflix_titles` is cleaned and concatenated below, so it has to be loaded
# here; without this line the cell raises NameError on a fresh kernel.
# NOTE(review): the path is inferred from the naming of the sibling files -
# confirm that data/netflix_titles.csv exists.
netflix_titles = pd.read_csv('data/netflix_titles.csv', encoding='latin1')
netflix_titles_2 = pd.read_csv('data/netflix_titles 2.csv', encoding='latin1')
movies_on_streaming = pd.read_csv('data/MoviesOnStreamingPlatforms.csv', encoding='latin1')
prime_all_content = pd.read_csv('data/prime_all_content.csv', encoding='latin1')

# Clean each dataset independently with the shared clean_data helper.
amazon_prime_titles_clean = clean_data(amazon_prime_titles)
disney_all_content_clean = clean_data(disney_all_content)
disney_plus_titles_clean = clean_data(disney_plus_titles)
hulu_all_content_clean = clean_data(hulu_all_content)
hulu_titles_clean = clean_data(hulu_titles)
netflix_all_content_clean = clean_data(netflix_all_content)
netflix_titles_clean = clean_data(netflix_titles)
netflix_titles_2_clean = clean_data(netflix_titles_2)
movies_on_streaming_clean = clean_data(movies_on_streaming)
prime_all_content_clean = clean_data(prime_all_content)

# Stack everything into one DataFrame with a fresh 0..n-1 index.
# NOTE(review): if the sources do not share identical columns, concat fills the
# gaps with NaN, and rows duplicated across sources are not removed here.
streaming_content = pd.concat([
    amazon_prime_titles_clean, disney_all_content_clean, disney_plus_titles_clean,
    hulu_all_content_clean, hulu_titles_clean, netflix_all_content_clean,
    netflix_titles_clean, netflix_titles_2_clean, movies_on_streaming_clean,
    prime_all_content_clean
], ignore_index=True)

# Save the cleaned and combined streaming content (index column omitted).
streaming_content.to_csv('data/cleaned_streaming_content.csv', index=False)

In [6]:
# Read the pricing and subscription files (all with the latin1 codec).
(
    all_services_price_history,
    top_services_price_history,
    netflix_dataset,
    netflix_userbase,
) = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        'data/all_services_price_history.csv',
        'data/top_services_price_history.csv',
        'data/Netflix Dataset.csv',
        'data/Netflix Userbase.csv',
    )
)

# Run every raw frame through the shared clean_data helper, in the same order.
(
    all_services_price_history_clean,
    top_services_price_history_clean,
    netflix_dataset_clean,
    netflix_userbase_clean,
) = map(
    clean_data,
    (
        all_services_price_history,
        top_services_price_history,
        netflix_dataset,
        netflix_userbase,
    ),
)

# Write each cleaned frame next to its source, without the index column.
all_services_price_history_clean.to_csv(
    'data/cleaned_all_services_price_history.csv', index=False
)
top_services_price_history_clean.to_csv(
    'data/cleaned_top_services_price_history.csv', index=False
)
netflix_dataset_clean.to_csv('data/cleaned_netflix_dataset.csv', index=False)
netflix_userbase_clean.to_csv('data/cleaned_netflix_userbase.csv', index=False)

In [7]:
# Load the IMDb-related CSV exports (read with the latin1 codec, as elsewhere
# in this notebook).
imdb_data = pd.read_csv('data/imdb_data.csv', encoding='latin1')
imdb_movies_dataset = pd.read_csv('data/imdb-movies-dataset.csv', encoding='latin1')

# The two TSVs carry the file names of IMDb's official non-commercial dumps,
# which IMDb documents as UTF-8 with a literal \N marking a missing field.
# Decoding them as latin1 garbles non-ASCII text, and leaving \N as an ordinary
# string hides those gaps from the NaN handling in clean_data.
# NOTE(review): assumes these are the unmodified official dumps - confirm.
# Only the first 50,000 rows of each file are read.
title_principals = pd.read_csv(
    'data/title.principals.tsv', sep='\t', encoding='utf-8', na_values=r'\N', nrows=50000
)
title_ratings = pd.read_csv(
    'data/title.ratings.tsv', sep='\t', encoding='utf-8', na_values=r'\N', nrows=50000
)

# Clean each dataset with the shared clean_data helper.
imdb_data_clean = clean_data(imdb_data)
imdb_movies_dataset_clean = clean_data(imdb_movies_dataset)
title_principals_clean = clean_data(title_principals)
title_ratings_clean = clean_data(title_ratings)

# Save cleaned IMDb data as comma-separated files, without the index column.
imdb_data_clean.to_csv('data/cleaned_imdb_data.csv', index=False)
imdb_movies_dataset_clean.to_csv('data/cleaned_imdb_movies_dataset.csv', index=False)
title_principals_clean.to_csv('data/cleaned_title_principals.csv', index=False)
title_ratings_clean.to_csv('data/cleaned_title_ratings.csv', index=False)

In [9]:
# Read the three top-services extracts (all with the latin1 codec).
(
    top_services_all_content,
    top_services_movies,
    top_services_tv,
) = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        'data/top_services_all_content.csv',
        'data/top_services_movies.csv',
        'data/top_services_tv.csv',
    )
)

# Run every raw frame through the shared clean_data helper, in the same order.
(
    top_services_all_content_clean,
    top_services_movies_clean,
    top_services_tv_clean,
) = map(
    clean_data,
    (top_services_all_content, top_services_movies, top_services_tv),
)

# Write the cleaned frames back to the data folder, without the index column.
top_services_all_content_clean.to_csv(
    'data/cleaned_top_services_all_content.csv', index=False
)
top_services_movies_clean.to_csv('data/cleaned_top_services_movies.csv', index=False)
top_services_tv_clean.to_csv('data/cleaned_top_services_tv.csv', index=False)

In [None]:
# Read the raw TV-show, movie and top-services-TV files (latin1 codec).
tv_shows = pd.read_csv('data/tv_shows.csv', encoding='latin1')
movies = pd.read_csv('data/movies.csv', encoding='latin1')
top_services_tv = pd.read_csv('data/top_services_tv.csv', encoding='latin1')

# Print the first rows of each frame for a quick look.
print(tv_shows.head())
print(movies.head())
print(top_services_tv.head())

# Clean and standardise in one chain per frame: drop every row that has any
# missing value, renumber the index from 0, then strip / lowercase /
# underscore-separate the column names.
# NOTE(review): this cell reloads top_services_tv and rebinds
# top_services_tv_clean with a stricter rule (any-NaN row dropped) than the
# clean_data helper applies - confirm that overwriting is intended.
tv_shows_clean = (
    tv_shows
    .dropna()
    .reset_index(drop=True)
    .rename(columns=lambda name: name.strip().lower().replace(' ', '_'))
)
movies_clean = (
    movies
    .dropna()
    .reset_index(drop=True)
    .rename(columns=lambda name: name.strip().lower().replace(' ', '_'))
)
top_services_tv_clean = (
    top_services_tv
    .dropna()
    .reset_index(drop=True)
    .rename(columns=lambda name: name.strip().lower().replace(' ', '_'))
)

# Further cleaning as needed