In [None]:
#1. Bibliotheken importieren
import warnings
from urllib.parse import quote

import mysql.connector as mysql
import pandas as pd
import plotly.graph_objects as go
# Suppress pandas PerformanceWarning messages for the rest of the session
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [None]:
"""cursor.execute("SHOW DATABASES")

for x in cursor:
  print(x)"""

In [None]:
# Load the connection settings from the external credentials file;
# only the first row of the CSV is used.
credentials = pd.read_csv('../credentials/credentials.csv')

database_adress, database_user, database_password, database_name = (
    credentials[field].iloc[0]
    for field in ('Host', 'User', 'Password', 'Database')
)

print(database_user)

In [None]:
# Connect to the database through the MySQL connector
db = mysql.connect(host=database_adress, user=database_user, passwd=database_password, database=database_name)
cursor = db.cursor()

# Form and execute the query (reads every row of the movies table)
query = "SELECT * FROM movies;"
cursor.execute(query)

# Load the result set into a DataFrame
all_movies = pd.DataFrame(list(cursor), columns=['id', 'title', 'year'])

# Show the first ten rows without the index column.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so Styler.hide(axis="index") is preferred; the old method is only used as a
# fallback on pandas versions that do not provide Styler.hide yet.
preview = all_movies.iloc[0:10].style
preview.hide(axis="index") if hasattr(preview, "hide") else preview.hide_index()


In [None]:
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

In [None]:
# Build the SQLAlchemy connection URL.
# User name and password are percent-encoded because characters such as "@",
# ":" or "/" inside a credential would otherwise be parsed as URL delimiters
# and corrupt the connection string. str() guards against non-string values
# coming out of the credentials CSV (e.g. an all-digit password read as int).
db_connection_str = (
    f"mysql+pymysql://{quote(str(database_user), safe='')}:"
    f"{quote(str(database_password), safe='')}"
    f"@{database_adress}/{database_name}"
)
db_connection = create_engine(db_connection_str)

In [None]:
# Reload the movies table through the SQLAlchemy engine, ordered by release year
movies_by_year_sql = 'SELECT * FROM movies ORDER BY release_year'
all_movies = pd.read_sql(movies_by_year_sql, con=db_connection)

In [None]:
# Display the first 400 rows of the year-ordered movie table
all_movies.iloc[:400]

In [None]:
# Bar chart: number of movies released per year.
fig, ax = plt.subplots(figsize=(10,5))

# Movies per release year; groupby returns the years in ascending order.
count = all_movies.groupby(['release_year'])['movieId'].count()

# Turn the counts into a two-column frame (release_year, count) directly instead
# of merging them back onto a de-duplicated year column, then trim the first and
# the last year without mutating in place.
# NOTE(review): the trimmed rows are presumably the -1 placeholder year and the
# incomplete final year (a later cell drops -1 and 2018 by label) - confirm.
year_count = count.reset_index(name='count').iloc[1:-1]

ax.bar(year_count['release_year'], year_count['count'], color='maroon', width=0.4)

ax.set_title("Released movies over time")
ax.set_xlabel("Release year")
ax.set_ylabel("Number of movies");

In [None]:
# Peek at the first release years that remain in the plotted frame
year_count.loc[:, 'release_year'].head()

In [None]:
# Preview the first rows (default: five) of every release-year group
all_movies.groupby('release_year').head()

In [None]:
# Movies per release year as a Series indexed by release_year
count = all_movies.groupby('release_year')['movieId'].count()

test_fig, test_ax = plt.subplots(figsize=(10,5))

# Remove the entries labelled -1 and 2018 before plotting
count = count.drop([-1]).drop([2018])

test_ax.bar(count.index, count.values, color='maroon')

test_ax.set_title("Released movies over time")

In [None]:
# Show up to five rows from each release-year group
all_movies.groupby('release_year').head(5)

In [None]:
# One row per movie/genre link with the movie title and the genre name,
# ordered by movie id
movies_with_genres_sql = """SELECT m.movieId, m.title, g.genrename FROM movies as m 
INNER JOIN movies_genres as mg ON m.movieID = mg.movieId
INNER JOIN genres as g ON mg.genreId = g.genreId
ORDER BY m.movieId;"""
movies_with_genres = pd.read_sql(movies_with_genres_sql, con=db_connection)

In [None]:
# Load the complete genres table
genres_sql = "SELECT * FROM genres;"
genres = pd.read_sql(genres_sql, con=db_connection)

In [None]:
# Number of movie entries per genre name, drawn as a bar chart
genres_count = movies_with_genres.groupby('genrename')['movieId'].count()

genres_count_fig, genres_count_ax = plt.subplots(figsize=(30,15))
genres_count_ax.bar(genres_count.index, genres_count.values, color='maroon')

In [None]:
# Non-null value count for each column of the genres table
genres.count(axis='index')

In [None]:
# Movie/genre link rows enriched with the genre name
genres_df_sql = (
    'SELECT mg.movieId,mg.genreId,g.genrename '
    'FROM movies_genres AS mg '
    'INNER JOIN genres AS g ON mg.genreId=g.genreId;'
)
genres_df = pd.read_sql(genres_df_sql, con=db_connection)

In [None]:
# Count the movie ids recorded for each genre name
genres_df.groupby('genrename')['movieId'].count()

## Genre share over time

Diagram Type: 100% Stacked Area Chart (https://codejock.com/products/chart/100-stacked-area-chart.asp)

**Challenges**
* A movie can belong to multiple genres
* Number of genres to display

In [None]:
# One row per movie/genre link, including release year, genre id and genre name,
# ordered by movie id
movies_genres_sql = (
    'SELECT m.movieId, m.title, m.release_year, g.genreId, g.genrename '
    'FROM movies as m '
    'inner join movies_genres as mg on m.movieId=mg.movieId '
    'inner join genres as g on mg.genreId=g.genreId '
    'order by m.movieId'
)
movies_genres = pd.read_sql(movies_genres_sql, con=db_connection)
movies_genres.head(10)

In [None]:
# Average number of genre rows per distinct movie id
print(f'Average genres per film: {len(movies_genres) / len(movies_genres.movieId.unique()):.2f}')
print('\nCheck total distribution')

# Name the counts column explicitly: value_counts().to_frame() calls it
# 'genrename' on pandas < 2.0 but 'count' on pandas >= 2.0, which made the
# former attribute lookup `genres_total_dist.genrename` fail on newer versions.
genres_total_dist = movies_genres.genrename.value_counts().to_frame(name='genrename')

# Percentage share of each genre among all movie/genre rows (plain vectorized
# arithmetic instead of DataFrame.apply)
genres_total_dist['share'] = genres_total_dist['genrename'] / genres_total_dist['genrename'].sum() * 100
print(genres_total_dist)

In [None]:
# Share of every genre among all genre assignments of a release year, in percent.
# Result: one row per year, one column per genre.
first_year, last_year = 1900, 2018  # year range TBD
years = list(range(first_year, last_year + 1))

# Keep the genres in order of first appearance in movies_genres
genre_names = movies_genres.genrename.unique()

# Count genre occurrences per year in a single cross-tabulation instead of
# inserting one column per year in a loop (the column-by-column inserts are what
# pandas reports as a PerformanceWarning). Years or genres without any row are
# filled with a count of 0; years outside the range are discarded.
genres_hist = (
    pd.crosstab(movies_genres.genrename, movies_genres.release_year)
    .reindex(index=genre_names, columns=years, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# Convert the counts to percentages of the yearly total, rounded to two decimals.
# A year without any movie has a total of 0 and therefore yields NaN shares.
# The total is kept in `year_totals` so the builtin `sum` is not shadowed.
year_totals = genres_hist.sum(axis=0)
genres_hist = genres_hist.div(year_totals, axis='columns').mul(100).round(2)

# Transpose so that the years become the index and the genres the columns
genres_hist = genres_hist.T
genres_hist

In [None]:
# Overlay one filled line trace per selected genre; each area is filled down
# to the x axis.
fig = go.Figure()
for genre_name, trace_color in (('Adventure', 'orange'), ('Mystery', 'red')):
    fig.add_trace(
        go.Scatter(
            x=genres_hist.index,
            y=genres_hist[genre_name],
            fill='tozeroy',  # fill down to xaxis
            fillcolor=trace_color,
            mode='lines',
            line={'dash': 'solid', 'color': trace_color},
            name=genre_name,
        )
    )
# add_trace returns the figure itself; keep the figure as the trailing expression
fig