In [None]:
import warnings
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly import tools
from sqlalchemy import create_engine
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
# Read the database credentials from the local CSV file. The first row is unpacked
# positionally into hostname, port, user, password and database name.
cred = pd.read_csv('../credentials/credentials_flavio.csv')
first_credentials_row = cred.iloc[0]
(db_hostname,
 db_port,
 db_user,
 db_pw,
 db_dbname) = first_credentials_row

In [None]:
# Assemble the SQLAlchemy URL for the MySQL database (PyMySQL driver) and create the engine.
# User and password are percent-encoded so that characters such as '@', ':' or '/'
# inside the credentials cannot corrupt the URL.
# The port from the credentials file is included when one is given; it used to be read
# but never passed on, so a non-default port in the CSV was silently ignored.
db_port_suffix = f":{int(db_port)}" if pd.notna(db_port) else ""
db_connection_str = (
    f"mysql+pymysql://{quote_plus(str(db_user))}:{quote_plus(str(db_pw))}"
    f"@{db_hostname}{db_port_suffix}/{db_dbname}"
)
db_connection = create_engine(db_connection_str)

In [None]:
# Smoke test of the connection: load the complete movies table and display it.
movies_smoke_query = 'SELECT * FROM movies;'
test = pd.read_sql(sql=movies_smoke_query, con=db_connection)
test

## (3) Genre Share over time (v1 absolute count -> misrepresentation)

In [None]:
# One row per (movie, genre) pair: movies joined to genres through the movies_genres
# link table, ordered by movie id. The adjacent string literals below concatenate to
# a single-line SQL statement.
movies_genres_query = (
    'SELECT m.movieId, m.title, m.release_year, g.genreId, g.genrename '
    'FROM movies as m '
    'inner join movies_genres as mg on m.movieId=mg.movieId '
    'inner join genres as g on mg.genreId=g.genreId '
    'order by m.movieId'
)
movies_genres = pd.read_sql(movies_genres_query, con=db_connection)
movies_genres.head(10)

In [None]:
# Sanity checks on the genre assignments: the mean number of genres per film and the
# overall distribution of assignments across genres.
genres_per_film = len(movies_genres) / movies_genres.movieId.nunique()
print(f'Average genres per film: {genres_per_film:.2f}')
print('\nCheck total distribution')

# value_counts() names its result after the source column before pandas 2.0 and
# 'count' from 2.0 on. Pin the column name (and clear the index name) explicitly so the
# lookup below works, and the printed table looks the same, on either version.
genres_total_dist = (
    movies_genres.genrename.value_counts()
    .rename('genrename')
    .rename_axis(None)
    .to_frame()
)
# Share of each genre in all genre assignments, in percent.
genres_total_dist['share'] = (
    genres_total_dist['genrename'] / genres_total_dist['genrename'].sum() * 100
)
print(genres_total_dist)

In [None]:
# Build a (year x genre) table holding, for every release year, each genre's share in
# percent of all genre assignments among the films released in that year.
first_year, last_year = 1900, 2018  # year range TBD
years = list(range(first_year, last_year + 1))

# Count rows per (genre, release year) in one pass instead of adding one column per
# year in a loop. reindex() restores the original layout: genres in order of first
# appearance, every year of the range present, missing combinations counted as 0.
genre_year_counts = (
    pd.crosstab(movies_genres.genrename, movies_genres.release_year)
    .reindex(index=movies_genres.genrename.unique(), columns=years, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# Normalise each year's column to percentages. A year without any films has a total
# of 0, so its shares come out as NaN rather than raising.
# (No variable named `sum` is used here: assigning to it would shadow the builtin for
# the rest of the kernel session.)
year_totals = genre_year_counts.sum(axis='index')
genres_hist = (
    genre_year_counts
    .div(year_totals, axis='columns')
    .mul(100)
    .round(2)
    .T  # years as rows, genres as columns
)
genres_hist

In [None]:
# Stacked area chart of the yearly share of the ten genres with the largest share in 2018.
top10genres = genres_hist.loc[2018].sort_values(ascending=False).head(10).index.to_list()
color_seq = px.colors.qualitative.G10

fig = go.Figure()
# Pair each genre with a palette colour directly. zip() stops at the shorter of the
# two sequences, so no manual counter is needed and the palette cannot be indexed
# out of range.
for genre, color in zip(top10genres, color_seq):
    fig.add_trace(go.Scatter(
        x=genres_hist.index,
        y=genres_hist[genre],
        mode='lines',
        line={'dash': 'solid', 'color': color},
        name=genre,
        stackgroup='one',  # all traces share one stack group, so the areas are stacked
    ))

fig.update_layout(
    title='Genre share by release year (top 10 genres of 2018)',
    xaxis_title='Release year',
    yaxis_title='Share of genre assignments (%)',
    yaxis_range=(0, 100),
    xaxis_range=(1900, 2018),
)
fig.show()

# DBS Presentation

In [None]:
# Call the search_by_tags stored procedure in the popular_movies schema with the tag
# text 'artificial intelligence' and load its result set into a DataFrame.
search_ai_call = "CALL popular_movies.search_by_tags('artificial intelligence')"
search_ai = pd.read_sql_query(search_ai_call, con=db_connection)

In [None]:
# Preview the first ten rows of the tag search result.
search_ai.head(10)

In [None]:
# Load the exported result of the "average rating of films" query from its CSV file.
avg_rating_csv_path = 'query_ideas/avg_rating_of_films/output.csv'
avg_rating_films_original = pd.read_csv(avg_rating_csv_path)

In [None]:
# Derive a cleaned view of the ratings export without touching the loaded frame:
# give the aggregate columns plain names, order by average rating (lowest first) and
# keep only films with more than 50 ratings.
avg_rating_films = (
    avg_rating_films_original
    .rename(columns={'avg(rating)': 'avg_rating', 'count(rating)': 'count_rating'})
    .sort_values('avg_rating', ascending=True)
    .query('count_rating > 50')
)

# NOTE(review): this displays the unprocessed frame, not `avg_rating_films` built
# above - confirm that this is intended.
avg_rating_films_original