In [14]:
import warnings
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly import tools
from sqlalchemy import create_engine
# Ignore every pandas PerformanceWarning raised after this point (global warnings filter).
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [4]:
# Read the credentials file; its first row is unpacked positionally as
# hostname, port, user, password and database name.
cred = pd.read_csv('../credentials/credentials.csv')
first_row = cred.iloc[0]
db_hostname, db_port, db_user, db_pw, db_dbname = first_row

In [5]:
# Percent-encode user and password so characters such as '@', ':', '/' or '%'
# cannot be mistaken for URL delimiters when SQLAlchemy parses the string.
# safe='' forces '/' to be encoded as well; str() guards against pandas having
# parsed an all-digit value as a number.
encoded_user = quote(str(db_user), safe='')
encoded_pw = quote(str(db_pw), safe='')

# Use the port from the credentials file (previously read but ignored);
# fall back to the driver's default port when the field is blank.
port_suffix = '' if pd.isna(db_port) else f':{int(db_port)}'

db_connection_str = (
    f"mysql+pymysql://{encoded_user}:{encoded_pw}"
    f"@{db_hostname}{port_suffix}/{db_dbname}"
)
db_connection = create_engine(db_connection_str)

In [7]:
# Fetch the whole movies table to confirm the connection works and to
# inspect the available columns.
movies_query = 'SELECT * FROM movies;'
test = pd.read_sql(sql=movies_query, con=db_connection)
test

Unnamed: 0,movieId,title,release_year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995
...,...,...,...
58093,193876,The Great Glinka (1946),1946
58094,193878,Les tribulations d'une caissière (2011),2011
58095,193880,Her Name Was Mumu (2016),2016
58096,193882,Flora (2017),2017


## (3) Genre Share over time (v1 absolute count -> misrepresentation)

In [8]:
# One row per (movie, genre) pair: movies joined to genres through the
# movies_genres link table. Adjacent literals concatenate into a single
# query string.
movies_genres_query = (
    'SELECT m.movieId, m.title, m.release_year, g.genreId, g.genrename '
    'FROM movies as m '
    'inner join movies_genres as mg on m.movieId=mg.movieId '
    'inner join genres as g on mg.genreId=g.genreId '
    'order by m.movieId'
)
movies_genres = pd.read_sql(sql=movies_genres_query, con=db_connection)
movies_genres.head(10)

Unnamed: 0,movieId,title,release_year,genreId,genrename
0,1,Toy Story (1995),1995,9,Fantasy
1,1,Toy Story (1995),1995,5,Comedy
2,1,Toy Story (1995),1995,4,Children
3,1,Toy Story (1995),1995,3,Animation
4,1,Toy Story (1995),1995,2,Adventure
5,2,Jumanji (1995),1995,2,Adventure
6,2,Jumanji (1995),1995,4,Children
7,2,Jumanji (1995),1995,9,Fantasy
8,3,Grumpier Old Men (1995),1995,15,Romance
9,3,Grumpier Old Men (1995),1995,5,Comedy


In [9]:
# movies_genres holds one row per (movie, genre) pair, so rows divided by
# distinct movies is the mean number of genres attached to a film.
avg_genres_per_film = len(movies_genres) / movies_genres['movieId'].nunique(dropna=False)
print(f'Average genres per film: {avg_genres_per_film:.2f}')
print('\nCheck total distribution')

# Count rows per genre. The count column is named explicitly because
# value_counts().to_frame() calls it 'genrename' on pandas < 2.0 but 'count'
# on pandas >= 2.0; rename_axis(None) likewise drops the index name that
# newer pandas adds, so the printed table looks the same on both.
genre_counts = movies_genres['genrename'].value_counts().rename_axis(None)
genres_total_dist = genre_counts.to_frame(name='genrename')

# Percentage of all (movie, genre) rows that belong to each genre.
genres_total_dist['share'] = genre_counts / genre_counts.sum() * 100
print(genres_total_dist)

Average genres per film: 1.89

Check total distribution
             genrename      share
Drama            24144  23.707544
Comedy           15956  15.667560
Thriller          8216   8.067478
Romance           7412   7.278012
Action            7130   7.001110
Horror            5555   5.454581
Documentary       5118   5.025481
Crime             5105   5.012716
Adventure         4067   3.993480
Sci-Fi            3444   3.381742
Mystery           2773   2.722872
Children          2749   2.699306
Animation         2663   2.614860
Fantasy           2637   2.589330
War               1820   1.787099
Western           1378   1.353090
Musical           1113   1.092880
Film-Noir          364   0.357420
IMAX               197   0.193439


In [12]:
# Inclusive span of release years covered by the analysis.
# TODO: year range still to be decided; inspect
# movies_genres.release_year.sort_values().unique() to choose it.
FIRST_YEAR, LAST_YEAR = 1900, 2018
analysis_years = list(range(FIRST_YEAR, LAST_YEAR + 1))

# Keep genres in order of first appearance so the column order is stable.
genre_order = movies_genres['genrename'].unique()

# Count (year, genre) rows in a single pass instead of adding one column per
# year, which fragments the frame and triggers pandas' PerformanceWarning.
# reindex inserts all-zero rows/columns for years or genres without any film.
genre_counts_by_year = (
    pd.crosstab(movies_genres['release_year'], movies_genres['genrename'])
    .reindex(index=analysis_years, columns=genre_order, fill_value=0)
    .rename_axis(None, axis=0)
    .rename_axis(None, axis=1)
)

# Turn counts into each genre's percentage of that year's genre rows, rounded
# to two decimals. A year without any film has a total of 0 and therefore
# yields NaN for every genre. The total is held in a descriptive name rather
# than `sum`, which would shadow the builtin for all later cells.
yearly_totals = genre_counts_by_year.sum(axis=1)
genres_hist = genre_counts_by_year.div(yearly_totals, axis=0).mul(100).round(2)

# Rows are years, columns are genres.
genres_hist

Unnamed: 0,Fantasy,Comedy,Children,Animation,Adventure,Romance,Drama,Action,Thriller,Crime,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir
1900,13.33,20.00,0.00,6.67,0.00,20.00,6.67,6.67,6.67,0.00,0.00,0.00,0.00,0.00,20.00,0.00,0.00,0.00,0.0
1901,10.53,21.05,0.00,0.00,0.00,0.00,15.79,5.26,0.00,10.53,5.26,0.00,5.26,0.00,26.32,0.00,0.00,0.00,0.0
1902,33.33,0.00,11.11,0.00,22.22,0.00,11.11,11.11,0.00,0.00,0.00,0.00,11.11,0.00,0.00,0.00,0.00,0.00,0.0
1903,36.67,23.33,0.00,0.00,3.33,3.33,3.33,3.33,0.00,6.67,10.00,0.00,0.00,0.00,6.67,0.00,0.00,3.33,0.0
1904,33.33,16.67,0.00,0.00,16.67,0.00,0.00,0.00,0.00,0.00,0.00,0.00,16.67,0.00,16.67,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,2.31,15.64,2.93,2.33,3.33,6.42,22.79,6.65,9.96,4.32,5.89,2.46,3.48,0.39,9.25,1.05,0.39,0.39,0.0
2015,2.25,15.83,3.12,2.47,3.56,5.67,22.86,6.30,10.45,3.50,7.14,2.72,4.15,0.03,8.58,0.95,0.05,0.35,0.0
2016,2.33,16.56,2.91,2.68,3.28,6.13,22.66,6.94,10.74,3.51,6.48,2.56,3.37,0.00,8.67,0.81,0.00,0.37,0.0
2017,2.34,15.95,2.51,2.94,3.40,5.25,22.62,7.27,10.30,4.00,6.77,2.68,4.19,0.00,8.26,1.06,0.00,0.46,0.0


In [17]:
# Year bounds come from the table itself, so the chart follows whatever span
# genres_hist covers; int() yields plain Python ints for the axis range.
first_year = int(genres_hist.index.min())
latest_year = int(genres_hist.index.max())

# Rank genres by their share in the most recent year and keep the ten largest.
top10genres = (
    genres_hist.loc[latest_year]
    .sort_values(ascending=False)
    .head(10)
    .index.to_list()
)

color_seq = px.colors.qualitative.G10

fig = go.Figure()
# zip pairs each genre with its own palette colour. It stops at the shorter
# sequence, so the palette must offer at least as many colours as genres.
for genre, color in zip(top10genres, color_seq):
    fig.add_trace(
        go.Scatter(
            x=genres_hist.index,
            y=genres_hist[genre],
            mode='lines',
            line={'dash': 'solid', 'color': color},
            name=genre,
            # Traces sharing a stackgroup are drawn stacked on top of each other.
            stackgroup='one',
        )
    )

fig.update_layout(
    title='Share of the top 10 genres by release year',
    xaxis_title='Release year',
    yaxis_title='Share of genre tags (%)',
    legend_title_text='Genre',
    yaxis_range=(0, 100),
    xaxis_range=(first_year, latest_year),
)
fig.show()