In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql
pymysql.install_as_MySQLdb()

from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
from urllib.parse import quote_plus as urlquote

In [2]:
basics = pd.read_csv('Data/title_basics.csv.gz')
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


In [3]:
cols_to_drop = ['originalTitle','isAdult','titleType','endYear']
basics = basics.drop(columns=cols_to_drop)
basics

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama
3,tt0079644,November 1828,2001.0,140,"Drama,War"
4,tt0088751,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...
145714,tt9916170,The Rehearsal,2019.0,51,Drama
145715,tt9916190,Safeguard,2020.0,95,"Action,Adventure,Thriller"
145716,tt9916270,Il talento del calabrone,2020.0,84,Thriller
145717,tt9916362,Coven,2020.0,92,"Drama,History"


In [4]:
## create a col with a list of genres
basics['genres_split'] = basics['genres'].str.split(',')
basics.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,genres_split
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance","[Comedy, Fantasy, Romance]"
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,[Drama]
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama,[Drama]
3,tt0079644,November 1828,2001.0,140,"Drama,War","[Drama, War]"
4,tt0088751,The Naked Monster,2005.0,100,"Comedy,Horror,Sci-Fi","[Comedy, Horror, Sci-Fi]"


In [5]:
exploded_genres = basics.explode('genres_split')
exploded_genres

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,genres_split
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Comedy
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Fantasy
0,tt0035423,Kate & Leopold,2001.0,118,"Comedy,Fantasy,Romance",Romance
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70,Drama,Drama
2,tt0069049,The Other Side of the Wind,2018.0,122,Drama,Drama
...,...,...,...,...,...,...
145715,tt9916190,Safeguard,2020.0,95,"Action,Adventure,Thriller",Thriller
145716,tt9916270,Il talento del calabrone,2020.0,84,Thriller,Thriller
145717,tt9916362,Coven,2020.0,92,"Drama,History",Drama
145717,tt9916362,Coven,2020.0,92,"Drama,History",History


In [6]:
unique_genres = sorted(exploded_genres['genres_split'].unique())

In [7]:
title_genres = exploded_genres[['tconst', 'genres_split']].copy()
title_genres.head()

Unnamed: 0,tconst,genres_split
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0069049,Drama


In [8]:
## Making the genre mapper dictionary
genre_ints = range(len(unique_genres))
genre_map = dict(zip(unique_genres, genre_ints))
genre_map



{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Short': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [9]:
## make new integer genre_id and drop string genres
title_genres['genre_id'] = title_genres['genres_split'].map(genre_map)
title_genres = title_genres.drop(columns='genres_split')



In [10]:
genre_lookup = pd.DataFrame({'Genre_Name': genre_map.keys(),
                            'Genre_ID': genre_map.values()})
genre_lookup.head()

Unnamed: 0,Genre_Name,Genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


In [11]:
## Dropping original genre columns 
basics = basics.drop(columns=['genres','genres_split'])
basics

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
2,tt0069049,The Other Side of the Wind,2018.0,122
3,tt0079644,November 1828,2001.0,140
4,tt0088751,The Naked Monster,2005.0,100
...,...,...,...,...
145714,tt9916170,The Rehearsal,2019.0,51
145715,tt9916190,Safeguard,2020.0,95
145716,tt9916270,Il talento del calabrone,2020.0,84
145717,tt9916362,Coven,2020.0,92


In [12]:
connection = f"mysql+pymysql://{'root'}:{'root'}@localhost/movies"
engine = create_engine(connection)

In [13]:
if database_exists(connection) == False:
 create_database(connection)
else:
  print('The database already exists')

The database already exists


In [14]:
genre_lookup.dtypes

Genre_Name    object
Genre_ID       int64
dtype: object

In [15]:
from sqlalchemy.types import *

genre_len = genre_lookup['Genre_Name'].fillna('').map(len).max()

genre_schema = {
    "Genre_ID": Integer(), 
    "Genre_Name": Text(genre_len+1)}

In [16]:
genre_lookup.to_sql('genre_lookup',engine,dtype=genre_schema,if_exists='replace',index=False)

26

In [17]:
engine.execute('ALTER TABLE genre_lookup ADD PRIMARY KEY (`Genre_ID`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1b24878beb0>

In [18]:
q = """SELECT * FROM genre_lookup;"""
pd.read_sql(q, engine)

Unnamed: 0,Genre_Name,Genre_ID
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4
5,Comedy,5
6,Crime,6
7,Drama,7
8,Family,8
9,Fantasy,9


basics load

In [19]:
basics.dtypes

tconst             object
primaryTitle       object
startYear         float64
runtimeMinutes      int64
dtype: object

In [20]:
## saving text length
key_len = basics['tconst'].map(len).max()
title_len = basics['primaryTitle'].map(len).max()
key_len, title_len

(10, 242)

In [21]:
basics_schema = {
    "tconst": String(key_len+1), 
    "primaryTitle": Text(title_len+1),
    'startYear':Float(),
    'runtimeMinutes':Integer()
    }
basics_schema

{'tconst': String(length=11),
 'primaryTitle': Text(length=243),
 'startYear': Float(),
 'runtimeMinutes': Integer()}

In [22]:
basics

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001.0,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020.0,70
2,tt0069049,The Other Side of the Wind,2018.0,122
3,tt0079644,November 1828,2001.0,140
4,tt0088751,The Naked Monster,2005.0,100
...,...,...,...,...
145714,tt9916170,The Rehearsal,2019.0,51
145715,tt9916190,Safeguard,2020.0,95
145716,tt9916270,Il talento del calabrone,2020.0,84
145717,tt9916362,Coven,2020.0,92


In [23]:
basics.to_sql('title_basics',engine,dtype=basics_schema,if_exists='replace',index=False)
engine.execute('ALTER TABLE title_basics ADD PRIMARY KEY (`tconst`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1b248e1b0a0>

rating

In [24]:
ratings = pd.read_csv('Data/title_ratings.csv.gz')

In [25]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1275548 entries, 0 to 1275547
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1275548 non-null  object 
 1   averageRating  1275548 non-null  float64
 2   numVotes       1275548 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.2+ MB


In [26]:
from sqlalchemy.types import *

key_len = ratings['tconst'].fillna('').map(len).max()

ratings_schema = {
    "tconst": String(key_len+1), 
    "averageRating": Float(),
    "numVotes": Integer()}

In [27]:
ratings.to_sql('title_ratings',engine,dtype=ratings_schema,if_exists='replace',index=False)

1275548

In [28]:
engine.execute('ALTER TABLE title_ratings ADD PRIMARY KEY (`tconst`);')

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x1b24877d0d0>