# Investigating TV series ratings using IMDb

In [None]:
import imdb

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.auto import tqdm

In [None]:
# Apply seaborn's 'talk' plotting context to the figures created below.
sns.set_context('talk')

## Load ratings

In [None]:
# Titles of the series to analyse. Each entry is used as an IMDb search query
# and is later compared against the title of the search results with `==`,
# so it has to be spelled exactly like the title IMDb returns.
query_list = [
    'Lost',
    'Game of Thrones',
    'Stargate SG-1',
    'Stargate: Atlantis',
    'Stargate Universe',
    'Westworld',
    'Black Mirror',
    'Breaking Bad',
    'The Witcher',
    'Dark',
]

In [None]:
# IMDb access object; used below for searching titles and fetching episodes.
ia = imdb.IMDb()

In [None]:
# Only accept search hits whose 'kind' field equals this value.
kind = 'tv series'

# Resolve every query to exactly one IMDb entry.
# The check is done per query instead of only comparing the total number of
# hits: a total-count comparison passes silently when one title matches twice
# and another title does not match at all, and it cannot tell which query is
# the problematic one.
series_list = []
for query in query_list:
    matches = [
        x
        for x in ia.search_movie(query)
        # keep hits of the right kind, with an identical title, and without
        # an 'imdbIndex' key
        if x['kind'] == kind and x['title'] == query and not x.has_key('imdbIndex')
    ]

    if len(matches) != 1:
        raise ValueError(
            f'Expected exactly one {kind!r} result for query {query!r}, '
            f'but found {len(matches)}: {matches}'
        )

    series_list.append(matches[0])

In [None]:
# Show the resolved series so the matches can be checked by eye.
series_list

In [None]:
# Fetch the episode list of every series and flatten it into one record per
# episode; the records are turned into a DataFrame afterwards.
tmp = []
for series in tqdm(series_list, desc='Series'):
    tqdm.write(series['title'])
    # Load the 'episodes' data for this series; series['episodes'] is then
    # read as {season_nr: {episode_nr: episode}}.
    ia.update(series, 'episodes')

    for season_nr, season in series['episodes'].items():
        for episode_nr, episode in season.items():
            tmp.append(
                {
                    'series': series['title'],
                    'season_nr': season_nr,
                    'episode_nr': episode_nr,
                    'episode': episode['title'],
                    # episodes without a rating are stored as NaN
                    'rating': episode.get('rating', np.nan),
                    # an air date that cannot be parsed becomes NaT instead of
                    # aborting the whole (slow) download loop
                    'date': pd.to_datetime(
                        episode.get('original air date'), errors='coerce'
                    ),
                }
            )

df = pd.DataFrame(tmp).sort_values(by=['series', 'season_nr', 'episode_nr'])

# Per-episode label of the form "<season_nr>:<episode_nr>", used as x value
# when plotting. It is kept as plain strings rather than a Categorical:
# with the latest seaborn version all category levels would otherwise be
# plotted for all series.
df['idx'] = df['season_nr'].map(str) + ':' + df['episode_nr'].map(str)

In [None]:
# Preview the first rows of the episode table.
df.head()

## Visualize results

In [None]:
def annotate_episode(entry, ax, m):
    """Label one episode's data point with its title.

    Parameters
    ----------
    entry
        Object exposing ``episode`` (label text), ``idx`` (x position) and
        ``rating`` (y position) as attributes.
    ax
        Object providing an ``annotate`` method (presumably a matplotlib
        Axes -- confirm against the caller).
    m
        Multiplier for the vertical label offset: the text is placed
        ``10 * m`` offset points away from the data point.

    Returns
    -------
    Whatever ``ax.annotate`` returns.
    """
    anchor = (entry.idx, entry.rating)
    label_offset = (0, 10 * m)
    text_style = {'fontsize': 10, 'ha': 'center', 'va': 'center'}

    return ax.annotate(
        entry.episode,
        xy=anchor,
        xycoords='data',
        xytext=label_offset,
        textcoords='offset points',
        arrowprops={'arrowstyle': '->'},
        **text_style,
    )

In [None]:
# One facet row per series, coloured by season; the y-axis is shared between
# facets while the x-axis is not. Rows containing missing values are dropped
# before plotting.
facet_options = {
    'row': 'series',
    'hue': 'season_nr',
    'sharex': False,
    'sharey': True,
    'aspect': 2,
    'height': 5,
}
g = sns.FacetGrid(df.dropna(), **facet_options)

# Rating per episode label as a line with point markers.
g.map_dataframe(sns.lineplot, x='idx', y='rating', marker='o', estimator=None)

# Replace the x tick labels with an empty list and name the y-axis.
g.set_xticklabels([])
g.set_ylabels('IMDB score')

g.add_legend()

# Annotate the highest- and lowest-rated episode of every (row, col, hue)
# subset of the grid.
for (row_idx, col_idx, _hue_idx), subset in g.facet_data():
    ax = g.facet_axis(row_idx, col_idx)

    if subset.empty:
        continue

    ratings = subset['rating']
    # (index picker, offset direction): best episode labelled above the point,
    # worst episode below it
    for pick_index, direction in ((ratings.idxmax, 1), (ratings.idxmin, -1)):
        annotate_episode(subset.loc[pick_index()], ax, direction)