In [153]:
import pandas as pd
import numpy as np

import plotly.express as px

import warnings
# NOTE(review): with no category/module given, this silences every warning for
# the rest of the notebook, deprecation notices included — consider narrowing it.
warnings.filterwarnings('ignore')

def plot_box_over_time(data, x, y, hover_list):
    """Show a box plot of ``y`` grouped by ``x``, titled ``"<y> over time"``.

    Parameters
    ----------
    data : passed unchanged as the first argument of ``px.box``.
    x : column used for the x-axis groups.
    y : column whose distribution is drawn; also used to build the title.
    hover_list : forwarded to ``px.box`` as ``hover_data``.

    The figure is displayed with ``show()``; nothing is returned.
    """
    box_options = {
        'x': x,
        'y': y,
        'hover_data': hover_list,
        'title': y + ' over time',
        'width': 850,
        'height': 450,
    }
    figure = px.box(data, **box_options)

    # Legend off; y-axis keeps its tick labels but is updated with title=None.
    figure.update_layout(showlegend=False)
    figure.update_yaxes(visible=True, showticklabels=True, title=None)
    figure.show()
    
def plot_box(data, x, y, hover_list, cat_array, title):
    """Show a box plot of ``y`` per ``x`` category with an explicit category order.

    Parameters
    ----------
    data : passed unchanged as the first argument of ``px.box``.
    x : column used for the x-axis categories.
    y : column whose distribution is drawn.
    hover_list : forwarded to ``px.box`` as ``hover_data``.
    cat_array : x-axis categories in the order they should appear.
    title : figure title.

    The figure is displayed with ``show()``; nothing is returned.
    """
    box_options = {
        'x': x,
        'y': y,
        'hover_data': hover_list,
        'title': title,
        'width': 850,
        'height': 450,
    }
    figure = px.box(data, **box_options)

    # Legend off; both axes keep their tick labels and are updated with title=None.
    axis_options = {'visible': True, 'showticklabels': True, 'title': None}
    figure.update_layout(showlegend=False)
    figure.update_yaxes(**axis_options)
    figure.update_xaxes(**axis_options)

    # Force the x categories into the caller-supplied order.
    figure.update_xaxes(categoryorder='array', categoryarray=cat_array)
    figure.show()

In [154]:
# Load the episode table; the first CSV column becomes the index.
data = pd.read_csv(
    '../../data/the_office_series.csv',
    index_col=0,
)

In [155]:
# The format expects a leading space, then day, full month name and year.
date_format = ' %d %B %Y'
data['Date'] = pd.to_datetime(data['Date'], format=date_format)

In [156]:
# Manual overrides of the 'Director' column, keyed by episode title.
director_overrides = {
    'Boys and Girls': 'Dennie Gordon',
    'Take Your Daughter to Work Day': 'Victor Nelli Jr.',
    'Launch Party': 'Ken Whittingham',
    'Frame Toby': 'Jason Reitman',
    'WUPHF.com': 'Danny Leiner',
    'The Search': 'Michael Spiller',
    'The Whale': 'Rodman Flender',
}
for episode_title, director_name in director_overrides.items():
    data.loc[data['EpisodeTitle'] == episode_title, 'Director'] = director_name

In [157]:
# Turn the free-text writer credits into a list of names per episode.
#
# Every pattern states `regex=` explicitly: the default of `Series.str.replace`
# changed from regex=True to regex=False in pandas 2.0, so relying on it makes
# the whitespace collapse a silent no-op on newer versions. The whitespace
# pattern is a raw string so that `\s` is not an invalid string escape.
data['Writers'] = (
    data['Writers']
    .str.replace('|', ',', regex=False)      # '|' is a literal separator here
    .str.replace(' and ', ',', regex=False)  # "A and B" -> "A,B"
    .str.replace(r'\s+', ' ', regex=True)    # collapse runs of whitespace
    .str.replace(' ,', ',', regex=False)     # drop the space before a comma
    .str.replace(', ', ',', regex=False)     # drop the space after a comma
    .str.strip()                             # no stray space on the first/last name
    .str.split(',')                          # missing values stay NaN instead of raising
)

In [158]:
# Turn the comma-separated guest-star text into a list of names per episode.
# Episodes without guest stars are NaN and stay NaN through the `.str` methods.
#
# `regex=` is explicit on every call because the default of `Series.str.replace`
# changed from True to False in pandas 2.0; without it the whitespace collapse
# becomes a literal match on newer versions. The pattern is a raw string so that
# `\s` is not an invalid string escape.
data['GuestStars'] = (
    data['GuestStars']
    .str.replace(r'\s+', ' ', regex=True)  # collapse runs of whitespace
    .str.replace(' ,', ',', regex=False)   # drop the space before a comma
    .str.replace(', ', ',', regex=False)   # drop the space after a comma
    .str.strip()
    .str.split(',')                        # NaN is propagated, strings become lists
)

In [159]:
# Number of guest stars per episode; float entries (NaN = no guest stars) count as 0.
data['GuestStarsNum'] = data['GuestStars'].apply(
    lambda guests: 0 if isinstance(guests, float) else len(guests)
)
#data.drop(['GuestStars'], 1, inplace = True)

In [160]:
# Preview the first three rows after the cleaning steps.
data.head(n=3)

Unnamed: 0,Season,EpisodeTitle,About,Ratings,Votes,Viewership,Duration,Date,GuestStars,Director,Writers,GuestStarsNum
0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,2005-03-24,,Ken Kwapis,"[Ricky Gervais, Stephen Merchant, Greg Daniels]",0
1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.0,23,2005-03-29,,Ken Kwapis,[B. J. Novak],0
2,1,Health Care,Michael leaves Dwight in charge of picking the...,7.8,4024,5.8,22,2005-04-05,,Ken Whittingham,[Paul Lieberstein],0


### Ratings, votes, viewership and duration over time (seasons)

In [161]:
# Ratings distribution per season; hovering shows episode title and air date.
plot_box_over_time(
    data,
    x='Season',
    y='Ratings',
    hover_list=['EpisodeTitle', 'Date'],
)

In [162]:
# Votes distribution per season; hovering shows episode title and air date.
plot_box_over_time(
    data,
    x='Season',
    y='Votes',
    hover_list=['EpisodeTitle', 'Date'],
)

In [163]:
# Viewership distribution per season; hovering shows episode title and air date.
plot_box_over_time(
    data,
    x='Season',
    y='Viewership',
    hover_list=['EpisodeTitle', 'Date'],
)

'Stress Relief' (season 5) aired right after the Super Bowl, hence its 22+ million viewership.

In [164]:
# Duration distribution per season; hovering shows episode title and air date.
plot_box_over_time(
    data,
    x='Season',
    y='Duration',
    hover_list=['EpisodeTitle', 'Date'],
)

In [171]:
# Episodes per director, sorted ascending by episode count.
# After the aggregation the 'EpisodeTitle' column holds the count, not a title.
dir_count = (
    data.groupby('Director')
        .agg({'EpisodeTitle': 'count'})
        .reset_index()
        .sort_values('EpisodeTitle', ascending=True)
)

# Chart only directors with at least 3 episodes (same threshold as the ratings
# comparison that follows). Title typo 'direceted' corrected.
fig = px.bar(dir_count[dir_count['EpisodeTitle'] >= 3], x='EpisodeTitle', y='Director',
             title='Number of episodes directed by director',
             width=750, height=450)
fig.update_layout(showlegend=False)
fig.update_xaxes(visible=True, showticklabels=True, title=None)
fig.update_yaxes(visible=True, showticklabels=True, title=None)
fig.show()

In [172]:
# Directors credited with three or more episodes, and the episodes they directed.
dirs_at_least_3 = dir_count.loc[dir_count['EpisodeTitle'] >= 3, 'Director'].unique()
dirs_at_least_3_data = data[data['Director'].isin(dirs_at_least_3)]

# Directors ordered by median rating, highest first; used as the x-axis order.
order = (
    dirs_at_least_3_data
    .groupby(['Director'])['Ratings']
    .median()
    .sort_values(ascending=False)
    .index
)

In [174]:
# Ratings per director, with directors ordered by median rating (no extra hover columns).
plot_box(
    dirs_at_least_3_data,
    x='Director',
    y='Ratings',
    hover_list=[],
    cat_array=order,
    title='Ratings distribution by directors with at least 3 directed episodes',
)