# The Office Time Series Analysis

In [3]:
# imports

from wrangle import the_office

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Statistical Tests
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve
import statsmodels.api as sm

# Acquire

In [None]:
# getting the office data
df = the_office()
df.head() # check_yo_head

# Prepare for Time Series

In [None]:
# setting office data to datetime using the date column
df.date = pd.to_datetime(df.date)
df.dtypes

In [None]:
df.date.head() # check_yo_head

In [None]:
df.head() #check_yo_head

In [None]:
# adding columns for year, month, day, and weekday
df['year'] = df.date.dt.year
df['month'] = df.date.dt.month
df['day'] = df.date.dt.day
df['weekday'] = df.date.dt.day_name()
df.head() # check_yo_head

In [None]:
# changing the index of the office data to date after it has been changed to datetime
df = df.set_index('date').sort_index()
df.head() # check_yo_head

In [None]:
# plotting defaults
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever spelling is installed
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)
plt.style.use(whitegrid_style)
plt.rc('font', size=16)

In [None]:
print('Date Range:', df.index.min(), 'to', df.index.max())
print('Shape:', df.shape)

df.head()# check_yo_head

In [8]:
# let's make a function

def office_time():
    '''
    Build the time-indexed version of the office data.

    Loads the data with the_office(), parses the date column as datetime,
    derives year, month, day and weekday columns from it, makes date the
    sorted index, writes the result to office_time.csv, prints the date
    range and shape, and returns the prepared dataframe.
    '''
    episodes = the_office()
    episodes['date'] = pd.to_datetime(episodes['date'])

    # calendar parts derived from the parsed dates
    when = episodes['date'].dt
    episodes['year'] = when.year
    episodes['month'] = when.month
    episodes['day'] = when.day
    episodes['weekday'] = when.day_name()

    # date becomes the index, in ascending order
    episodes = episodes.set_index('date').sort_index()

    # keep a copy on disk
    episodes.to_csv('office_time.csv')

    # report the span and size of the prepared data
    first, last = episodes.index.min(), episodes.index.max()
    print('Date Range:', first, 'to', last)
    print('Shape:', episodes.shape)

    return episodes

In [9]:
office_time()

Date Range: 2005-03-24 00:00:00 to 2013-05-16 00:00:00
Shape: (188, 15)


Unnamed: 0_level_0,episode,season,episode_title,about,ratings,votes,viewership,duration,director,writers,color,year,month,day,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2005-03-24,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.20,23,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels,yellow,2005,3,24,Thursday
2005-03-29,1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.00,23,Ken Kwapis,B. J. Novak,lightgreen,2005,3,29,Tuesday
2005-04-05,2,1,Health Care,Michael leaves Dwight in charge of picking the...,7.8,4024,5.80,22,Ken Whittingham,Paul Lieberstein,yellow,2005,4,5,Tuesday
2005-04-12,3,1,The Alliance,"Just for a laugh, Jim agrees to an alliance wi...",8.1,3915,5.40,23,Bryan Gordon,Michael Schur,yellow,2005,4,12,Tuesday
2005-04-19,4,1,Basketball,Michael and his staff challenge the warehouse ...,8.4,4294,5.00,23,Greg Daniels,Greg Daniels,lightgreen,2005,4,19,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013-04-11,183,9,Stairmageddon,Dwight shoots Stanley with a bull tranquilizer...,8.0,1985,3.83,22,Matt Sohn,Dan Sterling,yellow,2013,4,11,Thursday
2013-04-25,184,9,Paper Airplane,The employees hold a paper airplane competitio...,8.0,2007,3.25,22,Jesse Peretz,Halsted Sullivan | Warren Lieberstein,yellow,2013,4,25,Thursday
2013-05-02,185,9,Livin' the Dream,Dwight becomes regional manager after Andy qui...,9.0,2831,3.51,42,Jeffrey Blitz,Niki Schwartz-Wright,darkgreen,2013,5,2,Thursday
2013-05-09,186,9,A.A.R.M.,Dwight prepares for a marriage proposal and hi...,9.5,3914,4.56,43,David Rogers,Brent Forrester,darkgreen,2013,5,9,Thursday


### Data Splitting

#### Percentage Based

In [None]:
# fraction of the rows (in date order) that go to train
train_size = .70
n = len(df)
test_start_index = round(train_size * n)

# positional split: rows before test_start_index are train, the rest are test
train = df.iloc[:test_start_index]
test = df.iloc[test_start_index:]

plt.plot(train.index, train.viewership)
plt.plot(test.index, test.viewership)

#### Human Based

In [None]:
# label-based split on the date index
train = df.loc[:'2010']  # everything up to and including 2010
test = df.loc['2011':]   # everything from 2011 to the last episode

plt.plot(train.index, train.viewership)
plt.plot(test.index, test.viewership)

### Visualization

In [None]:
# assigning the train viewership data to y 
y = train.viewership
y.head() # check_yo_head

In [None]:
# plotting a histogram of the y data
y.plot.hist()

In [None]:
# average viewership for each calendar month
monthly_mean = y.groupby(y.index.month).mean()
ax = monthly_mean.plot.bar(width=.9, ec='black')
plt.xticks(rotation=0)
ax.set(title='Average Views by Month', xlabel='Month', ylabel='Views')

In [None]:
# average viewership for each day of the month
day_of_month_mean = y.groupby(y.index.day).mean()
ax = day_of_month_mean.plot.bar(width=.9, ec='black')
plt.xticks(rotation=0)
ax.set(title='Average Views by Day', xlabel='Day', ylabel='Views')

In [None]:
# bar plot of the average views by weekday
# bind the Axes returned by this plot so the title and labels land on this
# chart rather than on an Axes left over from an earlier cell
ax = y.groupby(y.index.day_name()).mean().plot.bar(width=.9, ec='black')
plt.xticks(rotation=0)
ax.set(title='Average Views by Weekday', xlabel='Weekday', ylabel='Views')

In [None]:
# boxplot of viewership grouped by the weekday of the air date
weekday_views = y.reset_index().assign(
    weekday=lambda frame: frame.date.dt.day_name()
)
sns.boxplot(data=weekday_views, y='viewership', x='weekday')

In [None]:
# using a .plot() on th y data
y.plot()

In [None]:
# looking at a 2 week average of viewership
y.resample('2W').mean().plot(title='2 week average')

In [None]:
# overlaying the raw per-episode viewership with its daily, weekly,
# monthly, and yearly averages
# (the index holds one row per episode air date, so the raw line is not hourly)
y.plot(alpha=.2, label='Per episode')
y.resample('D').mean().plot(alpha=.5, label='Daily')
y.resample('W').mean().plot(alpha=.8, label='Weekly')
y.resample('M').mean().plot(label='Monthly')
y.resample('Y').mean().plot(label='Yearly')
plt.legend()


In [None]:
# looking at the weekly change in viewership
y.resample('W').mean().diff().plot(title='Average week-to-week change in viewership')

In [None]:
# looking at the monthly change in viewership
y.resample('M').mean().diff().plot(title='Average month-to-month change in Viewership')

In [None]:
# .unstack turns an index level into columns
y.groupby([y.index.year, y.index.month]).mean().unstack(0).plot(title='Seasonal Plot')
# looking at viewership by year

In [None]:
# rows are years, columns are the calendar months present in the data
table = y.groupby([y.index.year, y.index.month]).mean().unstack()

# one panel per month that actually appears, so no empty axes are drawn;
# squeeze=False keeps axs an array even when there is a single month
fig, axs = plt.subplots(1, table.shape[1], sharey=True, sharex=True, squeeze=False)
# DataFrame.iteritems() was removed in pandas 2.0; .items() yields the same
# (column label, column Series) pairs
for ax, (month, subset) in zip(axs.ravel(), table.items()):
    subset.plot(ax=ax, title=month)
    # horizontal line at that month's mean across years
    ax.hlines(subset.mean(), *ax.get_xlim())
    ax.set(xlabel='')

fig.suptitle('Seasonal Subseries Plot') # super-title for the overall figure
fig.subplots_adjust(wspace=0)


In [None]:
plt.scatter(y, y.shift(-1))
plt.xlabel('$y$')
plt.ylabel('$y_{t + 1}$')
plt.title('Lag plot with lag=1')


In [None]:
# lag plot from a new frame: assign returns a copy, so the shifted column
# is not written into a slice of df
train = train.assign(**{'y(t + 1)': train.viewership.shift(-1)})
ax = train.plot.scatter(x='viewership', y='y(t + 1)')
ax.set(xlabel='t', ylabel='t + 1')


In [None]:
pd.plotting.autocorrelation_plot(train.viewership.resample('Y').mean())


In [None]:
# daily averages: only numeric columns can be averaged, so select them first
# (text columns such as episode_title make .mean() raise on pandas >= 2.0);
# this also drops view_bin again, so the cell can be re-run safely
train = train.select_dtypes(include='number').resample('D').mean()
# create a categorical feature: viewership quartiles
train['view_bin'] = pd.qcut(train.viewership, 4, labels=['cold', 'cool', 'warm', 'hot'])
train.groupby('view_bin').mean()


In [None]:
# share of rows falling in each viewership bin, month by month
(train.groupby('view_bin')
 .resample('M')
 .size()  # number of rows per (bin, month)
 .unstack(0)  # bins become columns, one row per month
 .apply(lambda row: row / row.sum(), axis=1)  # scale each month so its bins sum to 1
 .plot.area()
)
plt.ylabel('% of days in the month')


In [None]:
# stacked bars: number of rows in each viewership bin for every month
ax = (train
 .groupby('view_bin')
 .resample('M')
 .size()
 .unstack(0)
 .plot.bar(stacked=True, width=.9, ec='black')
)
# re-parse each tick label as a date and show only its full month name
labels = [pd.to_datetime(t.get_text()).strftime('%B') for t in ax.get_xticklabels()]
ax.set_xticklabels(labels)


In [None]:
# group on 'MM-Mon' keys so the bars sort in calendar order,
# then strip the 'MM-' prefix so only the month abbreviation is shown
month_mean = y.groupby(y.index.strftime('%m-%b')).mean()
ax = month_mean.plot.bar()
short_names = [tick.get_text()[3:] for tick in ax.get_xticklabels()]
ax.set_xticklabels(short_names, rotation=0);


In [None]:
from matplotlib.dates import DateFormatter

# weekly mean viewership drawn on an explicit figure/axes pair
weekly = y.resample('W').mean()

fig, ax = plt.subplots()
ax.plot(weekly.index, weekly)

# tick labels as abbreviated month plus two-digit year
ax.xaxis.set_major_formatter(DateFormatter('%b %y'))


# Modeling

In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from datetime import datetime
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.plotting import register_matplotlib_converters

import statsmodels.api as sm
from statsmodels.tsa.api import Holt

In [None]:
# dropping columns that are not needed for modeling purposes
unused_columns = [
    'episode_title', 'episode', 'season', 'about', 'director', 'writers',
    'color', 'year', 'month', 'day', 'weekday', 'votes', 'duration',
]
df = df.drop(columns=unused_columns)

In [None]:
df.head() # check_yo_head

In [None]:
# splitting the data: first 50% of rows train, next 30% validate, remainder test
n_rows = len(df)
train_size = int(n_rows * .5)
validate_size = int(n_rows * .3)
test_size = int(n_rows - train_size - validate_size)
validate_end_index = train_size + validate_size

# positional slices keep the three pieces contiguous and in date order
train = df.iloc[:train_size]
validate = df.iloc[train_size:validate_end_index]
test = df.iloc[validate_end_index:]

In [None]:
# checking to make sure split worked properly
print(len(train) + len(validate) + len(test) == len(df))

In [None]:
# comparing the train data to the dataframe data
print(df.head(1) == train.head(1))

In [None]:
# looking at the end of test and df data 
pd.concat([test.tail(1), df.tail(1)])

In [None]:
# one figure per column, with train, validate and test drawn in that order
for col in train.columns:
    plt.figure(figsize=(12,4))
    for split in (train, validate, test):
        plt.plot(split[col])
    plt.ylabel(col)
    plt.title(col)
    plt.show()


In [None]:
# evaluation function to compute rmse

def evaluate(target_var):
    '''
    Return the root mean squared error between validate[target_var]
    and yhat_df[target_var] (both are notebook-level frames).
    '''
    actual = validate[target_var]
    predicted = yhat_df[target_var]
    return sqrt(mean_squared_error(actual, predicted))


In [None]:
# plot and evaluate 
def plot_and_eval(target_var):
    '''
    Plot the train and validate series for target_var together with the
    current predictions in yhat_df, then print the validation RMSE.

    target_var: name of a column present in train, validate and yhat_df.
    Relies on the notebook-level train, validate and yhat_df frames and on
    evaluate() for the RMSE.
    '''
    plt.figure(figsize = (12,4))
    plt.plot(train[target_var], label = 'Train', linewidth = 1)
    plt.plot(validate[target_var], label = 'Validate', linewidth = 1)
    plt.plot(yhat_df[target_var], label = 'Prediction')
    plt.title(target_var)
    plt.legend()
    rmse = evaluate(target_var)
    # the ratings and viewership values shown above are small numbers,
    # so a zero-decimal RMSE would be rounded down to almost nothing
    print(target_var, '-- RMSE: {:.2f}'.format(rmse))
    plt.show()


In [None]:
# Create the empty dataframe
eval_df = pd.DataFrame(columns=['model_type', 'target_var', 'rmse'])

# function to store rmse for comparison purposes
def append_eval_df(model_type, target_var):
    '''
    Return a copy of eval_df with one extra row holding model_type,
    target_var and the RMSE from evaluate(target_var).

    The notebook-level eval_df is not modified; callers reassign the result.
    '''
    rmse = evaluate(target_var)
    new_row = pd.DataFrame(
        {'model_type': [model_type], 'target_var': [target_var], 'rmse': [rmse]}
    )
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    return pd.concat([eval_df, new_row], ignore_index = True)


In [None]:
eval_df

In [None]:
# Forecast
# Last observed value
# .iloc[-1] takes the final row by position; a plain [0] on a date-indexed
# Series only works through pandas' deprecated positional fallback
views = train['viewership'].iloc[-1]
ratings = round(train['ratings'].iloc[-1], 2)

# each one-element list supplies the single value used for every date in validate.index
yhat_df = pd.DataFrame({'viewership': [views], 'ratings': [ratings]}, 
                       index = validate.index)

yhat_df.head(2)


In [None]:
for col in train.columns:
    plot_and_eval(col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type = 'last_observed_value', 
                             target_var = col)


In [None]:
eval_df

In [None]:
# simple average

views = round(train['viewership'].mean(), 2)
ratings = round(train['ratings'].mean(), 2)

def make_predictions():
    '''
    Build a prediction frame on validate's index whose viewership and
    ratings columns hold the current notebook-level views and ratings values.
    '''
    constant_forecast = {'viewership': [views], 'ratings': [ratings]}
    predictions = pd.DataFrame(constant_forecast, index = validate.index)
    return predictions

yhat_df = make_predictions()


In [None]:
yhat_df.head(2)

In [None]:
for col in train.columns:
    plot_and_eval(col)

In [None]:
# evaluate
for col in train.columns:
    eval_df = append_eval_df(model_type='simple_average', 
                             target_var = col)

In [None]:
eval_df

In [None]:
# moving average
# compute a rolling average over the last 30 rows of train
# (an integer window counts observations -- episodes here -- not calendar days),
# then use that most recent rolling value to predict forward.

period = 30

# .iloc[-1] is the rolling mean ending on the final row of train
views = round(train['viewership'].rolling(period).mean().iloc[-1], 2)
ratings = round(train['ratings'].rolling(period).mean().iloc[-1], 2)

# make_predictions() reads the views and ratings values set just above
yhat_df = make_predictions()
yhat_df.head(3)

In [None]:
for col in train.columns:
    plot_and_eval(col)

In [None]:
for col in train.columns:
    eval_df = append_eval_df(model_type='30d moving average', 
                             target_var = col)

In [None]:
eval_df

In [None]:
# rolling-window sizes to compare (each window counts rows of train)
periods = [1, 7, 14, 21]

for p in periods:
    # last rolling mean of each target becomes the constant forecast
    views = round(train['viewership'].rolling(p).mean().iloc[-1], 2)
    ratings = round(train['ratings'].rolling(p).mean().iloc[-1], 2)
    yhat_df = make_predictions()
    model_type = f'{p}d moving average'
    # record the RMSE of this window for both targets
    for target in ('viewership', 'ratings'):
        eval_df = append_eval_df(model_type = model_type,
                                 target_var = target)


In [None]:
eval_df

In [None]:
# get the min rmse for each variable
# look the minima up by target name: positional [0] / [1] on this
# label-indexed Series is deprecated and depends on the group sort order
min_rmse_by_target = eval_df.groupby('target_var')['rmse'].min()
min_rmse_ratings = min_rmse_by_target['ratings']
min_rmse_views = min_rmse_by_target['viewership']

# filter only the rows that match the minimum of their own target to find
# out which models are best thus far (an rmse is never compared against
# the other target's minimum)
eval_df[eval_df.rmse == eval_df.target_var.map(min_rmse_by_target)]


In [None]:
for col in train.columns:
    plot_and_eval(target_var = col)


In [None]:
eval_df