# Převod dat z csv do mysql databáze
Ke spuštění tohoto jupyteru je potřeba
1. mít k dispozici mysql server, se kterým budeme pracovat
2. upravit přístupové údaje k serveru na dvou místech, kde se používá `create_engine`.

Spuštěním buněk z tohoto notebooku dojde k načtení csv dat z `data/netflix` a uložení do mysql databáze podle přihlašovacích údajů v jupyteru.

Uložení proběhne ve dvou verzích.

1. Tabulky `credits` a `titles`. Tabulka `titles` přitom obsahuje složené sloupce `genres` a `production_countries`.
2. Tabulky `titles_clean`, `genres_clean`, `titles_genres_clean`, `production_countries_clean`, `titles_production_countries_clean`, `credits_clean` propojené cizími klíči a relacemi 1:N a N:M.


In [1]:
import pandas as pd

# Read the two raw Netflix CSV exports; later cells use these frames as-is.
titles = pd.read_csv('data/netflix/titles.csv')
credits = pd.read_csv('data/netflix/credits.csv')

## Save the data into mysql database

In [None]:
from urllib.parse import quote

from sqlalchemy import create_engine

# The password is read from heslo.py.
# Either create heslo.py (modelled on heslo_priklad.py), or delete this import
# and define db_heslo directly in this cell.
from heslo import db_heslo

# Build the MySQL engine. The password is percent-encoded so that characters
# such as '@', ':' or '/' cannot corrupt the connection URL.
engine = create_engine(
    f'mysql+pymysql://jenda:{quote(str(db_heslo), safe="")}@chrys/jenda'
)

try:
    # Upload the raw frames unchanged; an existing table of the same name
    # is replaced.
    credits.to_sql('credits', engine, if_exists='replace', index=False)
    titles.to_sql('titles', engine, if_exists='replace', index=False)
finally:
    # Release the pooled connections even when one of the uploads fails.
    engine.dispose()

print("Data successfully loaded into MySQL database.")

Data successfully loaded into MySQL database.


## Explode genres and production_countries

In [6]:
# Load the CSV file into a DataFrame
titles_df = titles.copy()

# Expand the genres column into a list
titles_df['genres'] = titles_df['genres'].apply(eval)
titles_df['production_countries'] = titles_df['production_countries'].apply(eval)


In [7]:
# Explode the genres list into separate rows
exploded_genres_df = titles_df.explode('genres')[['id', 'genres']]
exploded_production_countries_df = titles_df\
    .explode("production_countries")[['id', 'production_countries']]

In [8]:
# Drop duplicates to create a unique genres DataFrame
unique_genres = exploded_genres_df['genres'].drop_duplicates().reset_index(drop=True).reset_index()
unique_genres.columns = ['genre_id', 'genre']
unique_production_countries = exploded_production_countries_df['production_countries'].drop_duplicates()\
    .reset_index(drop=True).reset_index()
unique_production_countries.columns = ['production_country_id', 'production_country']

In [9]:
# Remove the lookup row whose value is missing (NaN), if there is one.
unique_genres = unique_genres[unique_genres["genre"].notna()]
unique_production_countries = unique_production_countries[
    unique_production_countries['production_country'].notna()
]

In [10]:
# Merge to create a relationship table between titles and genres
title_genre_relationship = exploded_genres_df\
    .merge(unique_genres, left_on='genres', right_on='genre')[['id', 'genre_id']]
title_production_country_relationship = exploded_production_countries_df\
    .merge(unique_production_countries, left_on='production_countries', right_on='production_country')[['id', 'production_country_id']]

In [11]:
# Drop the original genres column from the titles DataFrame
titles_df = titles_df.drop(columns=['genres', 'production_countries'])

# Reset the index of the titles DataFrame
titles_df.reset_index(drop=True, inplace=True)

In [12]:
# Save the DataFrames to CSV files (optional)
titles_df.to_csv('data/netflix_clean/titles.csv', index=False)
unique_genres.to_csv('data/netflix_clean/genres.csv', index=False)
title_genre_relationship.to_csv('data/netflix_clean/title_genre_relationship.csv', index=False)
unique_production_countries.to_csv('data/netflix_clean/production_countries.csv', index=False)
title_production_country_relationship.to_csv(
    'data/netflix_clean/title_production_country_relationship.csv', index=False)
credits.to_csv('data/netflix_clean/credits.csv', index=False)

## Save the clean database to mysql

In [14]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Load the password from the external heslo.py file, or delete this import
# and define db_heslo directly in this cell.
from heslo import db_heslo

# Build the MySQL engine. The password is percent-encoded so that characters
# such as '@', ':' or '/' cannot corrupt the connection URL.
conn = create_engine(
    f'mysql+pymysql://jenda:{quote(str(db_heslo), safe="")}@chrys/jenda'
)

try:
    # Save the normalized DataFrames to the MySQL database; an existing table
    # of the same name is replaced.
    # NOTE(review): to_sql is not asked to define any key constraints here -
    # add them separately if the schema needs primary/foreign keys.
    titles_df.to_sql('titles_clean', conn, if_exists='replace', index=False)
    unique_genres.to_sql('genres_clean', conn, if_exists='replace', index=False)
    title_genre_relationship.to_sql('titles_genres_clean', conn, if_exists='replace', index=False)
    unique_production_countries.to_sql('production_countries_clean', conn, if_exists='replace', index=False)
    title_production_country_relationship.to_sql(
        'titles_production_countries_clean', conn, if_exists='replace', index=False)
    credits.to_sql('credits_clean', conn, if_exists='replace', index=False)
finally:
    # Release the pooled connections even when one of the uploads fails.
    conn.dispose()

print("Data successfully loaded into MySQL database.")

Data successfully loaded into MySQL database.
