In [1]:
import ast
import sqlite3

import pandas as pd

# Load the raw Netflix credits and titles CSV files into pandas DataFrames
credits = pd.read_csv('data/netflix/credits.csv')
titles = pd.read_csv('data/netflix/titles.csv')

## Save the raw data into a SQLite database

In [None]:
# Open the SQLite database that holds the raw tables (sqlite3 creates the
# file if it doesn't exist yet)
conn = sqlite3.connect('sqlite-databases/netflix_simple.db')
try:
    # Write each raw DataFrame to its own table, replacing any table left
    # behind by a previous run; the DataFrame index is not stored
    credits.to_sql('credits', conn, if_exists='replace', index=False)
    titles.to_sql('titles', conn, if_exists='replace', index=False)
finally:
    # Always release the database handle, even if one of the writes raised
    conn.close()

print("Data successfully loaded into SQLite database.")

## Explode genres and production countries

In [9]:
# Work on a copy so the raw `titles` frame loaded earlier stays untouched
titles_df = titles.copy()

# Turn the string cells of `genres` and `production_countries` into Python
# lists (assumes each cell is the text of a list literal such as
# "['drama', 'comedy']" -- TODO confirm against the CSV).
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# arbitrary code that might be embedded in the data file.
titles_df['genres'] = titles_df['genres'].apply(ast.literal_eval)
titles_df['production_countries'] = titles_df['production_countries'].apply(ast.literal_eval)


In [10]:
# Give every (title id, genre) pair its own row; only the id and the
# exploded value are kept
exploded_genres_df = titles_df[['id', 'genres']].explode('genres')

# Same treatment for the production countries
exploded_production_countries_df = (
    titles_df[['id', 'production_countries']].explode('production_countries')
)

In [11]:
# Genre lookup table: one row per distinct value, keyed by its 0-based
# position in first-seen order
unique_genres = (
    exploded_genres_df['genres']
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('genre_id')
    .reset_index(name='genre')
)

# Production-country lookup table, built the same way
unique_production_countries = (
    exploded_production_countries_df['production_countries']
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('production_country_id')
    .reset_index(name='production_country')
)

In [12]:
# Remove the missing-value entry from each lookup table. Exploding an empty
# list yields NaN, and pandas' merge matches NaN keys to each other, so
# leaving that row in would create relationship rows for titles that have
# no genre / no production country.
unique_genres = unique_genres[unique_genres['genre'].notna()]
unique_production_countries = unique_production_countries[
    unique_production_countries['production_country'].notna()
]

In [13]:
# Link table between titles and genres: swap each exploded genre value for
# its surrogate key and keep only (title id, genre_id)
title_genre_relationship = (
    exploded_genres_df
    .merge(unique_genres, how='inner', left_on='genres', right_on='genre')
    .loc[:, ['id', 'genre_id']]
)

# Link table between titles and production countries, built the same way
title_production_country_relationship = (
    exploded_production_countries_df
    .merge(
        unique_production_countries,
        how='inner',
        left_on='production_countries',
        right_on='production_country',
    )
    .loc[:, ['id', 'production_country_id']]
)

In [14]:
# The list-valued columns now live in the relationship tables, so remove
# them from the titles frame and renumber its rows from 0
titles_df = (
    titles_df
    .drop(columns=['genres', 'production_countries'])
    .reset_index(drop=True)
)

In [16]:
# Export every cleaned table as CSV (optional). Paths are spelled out in
# full so this list doubles as a manifest of the files written.
clean_csv_exports = [
    ('data/netflix_clean/titles.csv', titles_df),
    ('data/netflix_clean/genres.csv', unique_genres),
    ('data/netflix_clean/title_genre_relationship.csv', title_genre_relationship),
    ('data/netflix_clean/production_countries.csv', unique_production_countries),
    ('data/netflix_clean/title_production_country_relationship.csv',
     title_production_country_relationship),
    ('data/netflix_clean/credits.csv', credits),
]
for csv_path, table in clean_csv_exports:
    # The DataFrame index is not written to the file
    table.to_csv(csv_path, index=False)

## Save the cleaned tables to a SQLite database

In [19]:
# Open the SQLite database that holds the normalised tables (sqlite3 creates
# the file if it doesn't exist yet)
conn = sqlite3.connect('sqlite-databases/netflix_clean.db')
try:
    # Write each cleaned DataFrame to its own table, replacing any table left
    # behind by a previous run; the DataFrame index is not stored
    titles_df.to_sql('titles', conn, if_exists='replace', index=False)
    unique_genres.to_sql('genres', conn, if_exists='replace', index=False)
    title_genre_relationship.to_sql('titles_genres', conn, if_exists='replace', index=False)
    unique_production_countries.to_sql('production_countries', conn, if_exists='replace', index=False)
    title_production_country_relationship.to_sql(
        'titles_production_countries', conn, if_exists='replace', index=False)
finally:
    # Always release the database handle, even if one of the writes raised
    conn.close()

print("Data successfully loaded into SQLite database.")

Data successfully loaded into SQLite database.
