# The Office Scratchpad

In [1]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Acquire

In [4]:
# functions to set data to be used in my visualizations 
def set_color(ratings):
    '''
    Map a single rating to the color name used in the visualizations.

    below 7.4   -> 'red'
    [7.4, 8.2)  -> 'yellow'
    [8.2, 9.0)  -> 'lightgreen'
    9.0 and up  -> 'darkgreen'

    A value for which every comparison is False (e.g. NaN) returns None.
    '''
    # bands are checked from the lowest upwards, so each test only needs
    # the upper edge of its band
    if ratings < 7.4:
        return 'red'
    if ratings < 8.2:
        return 'yellow'
    if ratings < 9.0:
        return 'lightgreen'
    if ratings >= 9.0:
        return 'darkgreen'
    return None

In [5]:
def the_office():
    '''
    Load the raw office series data and return a tidied dataframe.

    Steps, in order:
    - read 'the_office_series.csv'
    - drop the GuestStars column (too many null values)
    - rename "Unnamed: 0" to "Episode" and "EpisodeTitle" to "Episode_Title"
    - lowercase every column name
    - add a 'color' column computed from 'ratings' with set_color
    - save the result to 'the_office.csv'
    '''
    readable_names = {"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title"}

    # read, trim and rename in one chain; the second rename lowercases the names
    office = (
        pd.read_csv('the_office_series.csv')
        .drop(columns='GuestStars')
        .rename(columns=readable_names)
        .rename(columns=str.lower)
    )
    # color band for each episode, used by the plots
    office['color'] = office['ratings'].apply(set_color)
    # keep a copy of the tidied data on disk
    office.to_csv('the_office.csv')

    return office

In [6]:
# getting the office data using the office function 
df = the_office()
df.head()

Unnamed: 0,episode,season,episode_title,about,ratings,votes,viewership,duration,date,director,writers,color
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,24 March 2005,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels,yellow
1,1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.0,23,29 March 2005,Ken Kwapis,B. J. Novak,lightgreen
2,2,1,Health Care,Michael leaves Dwight in charge of picking the...,7.8,4024,5.8,22,5 April 2005,Ken Whittingham,Paul Lieberstein,yellow
3,3,1,The Alliance,"Just for a laugh, Jim agrees to an alliance wi...",8.1,3915,5.4,23,12 April 2005,Bryan Gordon,Michael Schur,yellow
4,4,1,Basketball,Michael and his staff challenge the warehouse ...,8.4,4294,5.0,23,19 April 2005,Greg Daniels,Greg Daniels,lightgreen


In [None]:
# reading the office series csv file
df = pd.read_csv('the_office_series.csv')
df.head() # check_yo_head

In [None]:
df.info()

In [None]:
# convert column names to lowercase, replace '.' in column names with '_'
df.columns = [col.lower().replace('.', '_') for col in df]

In [None]:
df.head()

In [None]:
# looking at the shape of the data
df.shape

In [None]:
# looking at the data info
df.info()

In [None]:
# checking for nulls in the data
df.isna().sum()

It looks like the only nulls are in the GuestStars column.
I think I will make a separate guest-star df and then drop that column from the original df.

In [None]:
def guest_office():
    '''
    Build a dataframe of only the episodes that have guest stars.

    Steps, in order:
    - read 'the_office_series.csv'
    - keep the rows where GuestStars is not null
    - rename "Unnamed: 0", "EpisodeTitle" and "GuestStars" for readability
    - lowercase every column name
    - add a 'color' column computed from 'ratings' with set_color
    - save the result to 'the_office_guest.csv'
    '''
    raw = pd.read_csv('the_office_series.csv')

    # rows that actually list a guest star
    has_guest = raw.GuestStars.notna()
    guests = raw[has_guest].rename(
        columns={"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title", "GuestStars": "Guest_Stars"}
    )
    guests.columns = [name.lower() for name in guests.columns]

    # color band for each episode, used by the plots
    guests['color'] = guests['ratings'].apply(set_color)
    guests.to_csv('the_office_guest.csv')

    return guests
    

In [None]:
guest_df = guest_office()
guest_df.head()

In [None]:
guest_df.shape

In [None]:
df.head()

In [None]:
# drop the guest-star column; the name is matched case-insensitively because an
# earlier cell lowercases the column names, after which 'GuestStars' no longer exists
df = df.drop(columns=[col for col in df.columns if col.lower() == 'gueststars'])

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# convert column names to lowercase, replace '.' in column names with '_'
# (assign to df.columns only: `df = df.columns = [...]` rebinds df to a plain list)
df.columns = [col.lower().replace('.', '_') for col in df]

In [None]:
# looking at the episode with the most views
# (named most_viewed so it does not share a name with the max_views() function)
most_viewed = df.loc[df.viewership.idxmax()]
most_viewed

In [None]:
# making a function for max views

def max_views():
    '''
    Return the row of the office data for the episode
    with the highest viewership.
    '''
    office = the_office()
    # index label of the largest viewership value
    top_idx = office.viewership.idxmax()

    return office.loc[top_idx]

In [None]:
max_views()

In [None]:
# checking to see the most viewed episode on the guest df, looks like they are the same
guest_max = guest_df.loc[guest_df.viewership.idxmax()]
guest_max

In [None]:
plt.scatter(df.episode, df.viewership)
plt.xlabel("Episode")
plt.ylabel("Viewership")
plt.title("Episode Popularity of The Office")

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# functions to set data to be used in my visualizations 
def set_color(ratings):
    '''
    Map a single rating to the color name used in the visualizations.

    below 7.4 -> 'red', [7.4, 8.2) -> 'yellow',
    [8.2, 9.0) -> 'lightgreen', 9.0 and up -> 'darkgreen'.
    A value for which every comparison is False (e.g. NaN) returns None.

    NOTE: this cell re-defines set_color with the same thresholds as the
    definition near the top of the notebook.
    '''
    # (exclusive upper edge, color) for every band except the top one
    bands = ((7.4, 'red'), (8.2, 'yellow'), (9.0, 'lightgreen'))
    for upper_edge, color in bands:
        if ratings < upper_edge:
            return color
    if ratings >= 9.0:
        return 'darkgreen'
    return None

In [None]:
# setting data required in visualizations
df['color'] = df['ratings'].apply(set_color)
df.head()

In [None]:
# ratings per episode (the y axis is ratings, so label it as such)
plt.scatter(df.episode, df.ratings)
plt.xlabel("Episode")
plt.ylabel("Ratings")
plt.title("Episode Popularity of The Office")

In [None]:
fig, ax = plt.subplots()

ax.scatter(x=df.index, 
           y=df.viewership,
           c=df.color,
          )
plt.xlabel("Episode")
plt.ylabel("Viewership")
plt.title("Episode Popularity of The Office")

In [None]:
import plotly.express as px

# dataframe with the average rating of each season
avg_season = df.groupby(df.season)[['ratings']].mean().reset_index()

# scatter of the per-season average rating with a lowess trendline;
# marker size is driven by the season number
fig = px.scatter(avg_season, x = 'season', y = 'ratings',trendline = 'lowess',size = 'season',
                 title = '<b>Ratings over each Season</b>')
fig.show()

In [None]:
import plotly.express as px
avg_season = df.groupby(df.season)[['ratings']].mean().reset_index()
fig = px.scatter(avg_season, x="season", y="ratings", color="ratings", marginal_y="violin",
           marginal_x="box", trendline="ols", template="simple_white", title = '<b>Ratings over each Season</b>')
fig.show()

In [None]:
import plotly.express as px
fig = px.bar(x=["a", "b", "c"], y=[1, 3, 2])
fig.write_html('first_figure.html', auto_open=True)

In [7]:
df_episodes = df.groupby('season').size().reset_index(name='episodes')
df_episodes

Unnamed: 0,season,episodes
0,1,6
1,2,22
2,3,23
3,4,14
4,5,26
5,6,26
6,7,24
7,8,24
8,9,23


In [8]:
def season_episodes():
    '''
    Return a dataframe with one row per season and the
    number of episodes in that season.

    Columns: 'season', 'episodes'.
    '''
    office = the_office()
    # row count per season
    per_season = office.groupby('season').size()

    return per_season.reset_index(name='episodes')

In [9]:
season_episodes()

Unnamed: 0,season,episodes
0,1,6
1,2,22
2,3,23
3,4,14
4,5,26
5,6,26
6,7,24
7,8,24
8,9,23


# Setting up for Clustering

In [10]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np

# Exploring
import scipy.stats as stats

# Visualizing
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# default pandas decimal number display format
pd.options.display.float_format = '{:20,.2f}'.format

from wrangle import the_office

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Wrangling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Statistical Tests
import scipy.stats as stats

# Visualizing
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.model_selection import learning_curve

pd.options.display.float_format = '{:20,.2f}'.format


In [None]:
# getting the office data again
df = the_office()

In [None]:
df.head() # check_yo_head

In [None]:
# histogram of the columns in the data
df.hist(figsize=(24, 10), bins=20)

In [None]:
# outliers functions
def get_upper_outliers(s, k):
    '''
    Given a series and a cutoff value, k, returns the upper outliers for the
    series.

    The upper bound is q3 + k * iqr, where q1/q3 are the 25th/75th percentiles
    of the series and iqr = q3 - q1.

    The values returned will be either 0 (if the point is not an outlier), or a
    number that indicates how far away from the upper bound the observation is.
    '''
    q1, q3 = s.quantile([.25, .75])
    iqr = q3 - q1
    upper_bound = q3 + k * iqr
    # distance above the bound, floored at 0; vectorized instead of a per-row lambda
    return (s - upper_bound).clip(lower=0)

def add_upper_outlier_columns(df, k):
    '''
    Add a column with the suffix _outliers for all the numeric columns
    in the given dataframe.

    The dataframe is modified in place and is also returned. Columns whose
    name already ends in _outliers are skipped, so calling this again on the
    same dataframe does not add _outliers_outliers columns.
    '''
    suffix = '_outliers'
    # numeric columns that are not themselves outlier columns from a previous call
    source_cols = [col for col in df.select_dtypes('number')
                   if not str(col).endswith(suffix)]

    for col in source_cols:
        df[col + suffix] = get_upper_outliers(df[col], k)

    return df

add_upper_outlier_columns(df, k=1.5)

df.head()

In [None]:
# taking a look at what the outliers look like
outlier_cols = [col for col in df if col.endswith('_outliers')]
for col in outlier_cols:
    print('~~~\n' + col)
    data = df[col][df[col] > 0]
    print(data.describe())

- Votes: 8, std = 2,175
- Viewership: 1, std = 10.83
- Duration: 6, std = 7.63

In [None]:
df.head() # check_yo_head

In [None]:
# episode, season, ratings, votes, viewership, duration, 
df = the_office()

df.drop(columns=['episode_title', 'about', 'date', 'director', 'writers', 'color', 'votes', 'duration', 'viewership'], inplace=True)
df.head()

In [12]:
def cluster_df():
    '''
    Build the dataframe used for clustering: the office data
    with the unneeded columns dropped and every remaining
    column cast to integer.
    '''
    # columns that are not needed for clustering
    not_needed = ['episode_title', 'about', 'date', 'director', 'writers', 'color', 'votes', 'duration', 'viewership', 'season']

    # NOTE(review): astype(int) truncates fractional values (7.5 becomes 7) -
    # confirm that integer ratings are what the clustering should use
    return the_office().drop(columns=not_needed).astype(int)

In [14]:
cluster_df().head()

Unnamed: 0,episode,ratings
0,0,7
1,1,8
2,2,7
3,3,8
4,4,8


In [None]:
# checking the info to see if the df is ready to explore
df.info()

In [None]:
# using a describe to see some df stats
df.describe()

In [None]:
# changing the df type to integer 
df = df.astype(int)

In [None]:
# double checking the df info after changing to integer
df.info()

In [None]:
# train validate test split
# split test off, 20% of original df size. 
train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=42)

# split validate off, 30% of what remains (24% of original df size)
# thus train will be 56% of original df size. 
train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=42)

# DataFrame.size is rows * columns; len() is the number of observations (rows)
print("train observations: ", len(train))
print("validate observations: ", len(validate))
print("test observations: ", len(test))

In [None]:
# what is the distribution of each variable
for col in train.columns:
    plt.figure(figsize=(4,2))
    plt.hist(train[col])
    plt.title(col)
    plt.show()

In [None]:
# ratings by season; x and y are passed by keyword because current seaborn
# accepts only `data` positionally
sns.boxplot(x='season', y='ratings', data=train)

plt.show()

In [None]:
# episode numbers by rating; x and y are passed by keyword because current
# seaborn accepts only `data` positionally
sns.boxplot(x='ratings', y='episode', data=train)

plt.show()

In [None]:
# ratings by season (same plot as the earlier season/ratings boxplot cell);
# x and y are passed by keyword because current seaborn accepts only `data` positionally
sns.boxplot(x='season', y='ratings', data=train)

plt.show()

In [None]:
sns.jointplot(x="episode", y="ratings", data=train)
plt.xlabel("Episode")
plt.ylabel("Rating")
plt.show()

In [None]:
# joint distribution of season and rating (x axis is season, so label it as such)
sns.jointplot(x="season", y="ratings", data=train)
plt.xlabel("Season")
plt.ylabel("Rating")
plt.show()

In [None]:
# plot ratings by episode
plt.scatter(train.episode, train.ratings, color='green')
plt.xlabel("Episode")
plt.ylabel("Ratings")
plt.title("Is there a relationship\nbetween episodes and ratings?")
plt.show()

In [None]:
sns.scatterplot(x='episode', y='ratings',
                data=train, color='green')
plt.show()

In [None]:
sns.boxplot(x='season', y='ratings',
                data=train, 
                color='purple')

plt.show()

In [None]:
sns.pairplot(train, hue='season')
plt.show()

In [None]:
sns.pairplot(train, hue='ratings')
plt.show()

# K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# drop 'season' before clustering; errors='ignore' keeps this cell re-runnable
# once the column is already gone
df = df.drop(columns='season', errors='ignore')
df.head()

In [None]:
# X is another name for the same dataframe object as df (no copy is made),
# so any column later added to df in place also shows up in X
X = df

# fit a 3-cluster k-means model on every column of X
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# cluster label assigned to each row of X
kmeans.predict(X)

In [None]:
train.groupby('ratings').mean().plot.bar()

In [None]:
kmeans.cluster_centers_

In [None]:
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
plt.figure(figsize=(14, 9))

# group the observations by the cluster k-means assigns them to (not by rating),
# so every scatter series and its 'cluster N' legend entry really is one cluster
for cluster, subset in df.groupby(kmeans.predict(X)):
    plt.scatter(subset.episode, subset.ratings, label='cluster ' + str(cluster), alpha=.6)

# overlay the fitted cluster centers
centroids.plot.scatter(y='ratings', x='episode', c='black', marker='x', s=1000, ax=plt.gca(), label='centroid')

plt.legend()
plt.xlabel('episodes')
plt.ylabel('ratings')
plt.title('Visualizing Cluster Centers')

In [None]:
# cluster on the two feature columns only: X is the same object as df, so once
# df['cluster'] exists it would otherwise be fed back into the model as a feature
cluster_features = X[['episode', 'ratings']]

kmeans = KMeans(n_clusters=2)
kmeans.fit(cluster_features)
df['cluster'] = kmeans.predict(cluster_features)

sns.relplot(data=df, x='episode', y='ratings', hue='cluster')

In [None]:
# cluster on the two feature columns only: X is the same object as df and already
# carries the 'cluster' column from the previous cell, which must not be a feature
cluster_features = X[['episode', 'ratings']]

kmeans = KMeans(n_clusters=5).fit(cluster_features)
df['cluster'] = kmeans.predict(cluster_features)
# human-readable, 1-based cluster names for the legend
df.cluster = 'cluster_' + (df.cluster + 1).astype('str')
sns.relplot(data=df, x='episode', y='ratings', hue='cluster')

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df = df.drop(columns='cluster')
df.head()

In [None]:
X = X.drop(columns='cluster')
X.head()

In [None]:
# choosing k with inertia: elbow method
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*';
# use the new name when this matplotlib has it, otherwise the original one
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'

with plt.style.context(whitegrid_style):
    plt.figure(figsize=(9, 6))
    # inertia of a k-means fit for every k from 1 to 11
    pd.Series({k: KMeans(k).fit(X).inertia_ for k in range(1, 12)}).plot(marker='x')
    plt.xticks(range(1, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)

for ax, k in zip(axs.ravel(), range(2, 6)):
    clusters = KMeans(k).fit(X).predict(X)
    ax.scatter(X.episode, X.ratings, c=clusters)
    ax.set(title='k = {}'.format(k), xlabel='episodes', ylabel='ratings')


In [None]:
for col in df.columns:
    sns.boxplot(df[col])
    plt.title(col)
    plt.show()

In [None]:
# list of variables I will cluster on. 
cluster_vars = ['episode', 'ratings']
cluster_name = 'area_cluster'
k_range = range(2,20)

In [None]:
def find_k(X_train, cluster_vars, k_range):
    '''
    Fit a k-means model on X_train[cluster_vars] for every k in k_range,
    plot how the inertia (SSE) changes from one k to the next and
    return the comparison table.

    The returned dataframe has one row per k except the last one in k_range:
    - k: number of clusters
    - sse: inertia of the model with k clusters
    - delta: drop in SSE when moving to the next k, rounded to 0 decimals
    - pct_delta: that drop as a percentage of the SSE at k, rounded to 1 decimal

    Three line plots are shown: SSE, percent change and absolute change, each against k.
    '''
    # inertia: sum of squared distances of samples to their closest cluster center
    inertias = [KMeans(n_clusters=n).fit(X_train[cluster_vars]).inertia_
                for n in k_range]

    # (sse at k, sse at the next k) for every consecutive pair
    steps = list(zip(inertias[:-1], inertias[1:]))
    abs_change = [round(current - following, 0) for current, following in steps]
    pct_change = [round(((current - following) / current) * 100, 1)
                  for current, following in steps]

    # all metrics side by side; the last k has no "next" value so it is left out
    k_comparisons_df = pd.DataFrame(dict(k=k_range[0:-1],
                                         sse=inertias[0:-1],
                                         delta=abs_change,
                                         pct_delta=pct_change))

    # (column to plot, y label, title), in display order
    panels = (
        ('sse', 'SSE',
         'The Elbow Method to find the optimal k\nFor which k values do we see large decreases in SSE?'),
        ('pct_delta', 'Percent Change',
         'For which k values are we seeing increased changes (%) in SSE?'),
        ('delta', 'Absolute Change in SSE',
         'For which k values are we seeing increased changes (absolute) in SSE?'),
    )
    for column, y_label, heading in panels:
        plt.plot(k_comparisons_df.k, k_comparisons_df[column], 'bx-')
        plt.xlabel('k')
        plt.ylabel(y_label)
        plt.title(heading)
        plt.show()

    return k_comparisons_df

In [None]:
find_k(X, cluster_vars, k_range)

In [None]:
k = 4
# as reminders: 
cluster_vars = ['episode', 'ratings']
cluster_name = 'area_cluster'

In [None]:
def create_clusters(X_train, k, cluster_vars):
    '''
    Fit a k-means model with k clusters (random_state fixed at 13)
    on the cluster_vars columns of X_train and return the fitted model.
    '''
    # fit() returns the fitted estimator itself
    return KMeans(n_clusters=k, random_state = 13).fit(X_train[cluster_vars])

In [None]:
kmeans = create_clusters(X, k, cluster_vars)

In [None]:
# use the seeded model from create_clusters with the k and cluster_vars chosen in
# the cells above, so the assignments are reproducible and only the clustering
# columns are used as features
kmeans = create_clusters(X, k, cluster_vars)
df['cluster'] = kmeans.predict(X[cluster_vars])
# human-readable, 1-based cluster names for the legend
df.cluster = 'cluster_' + (df.cluster + 1).astype('str')
sns.relplot(data=df, x='episode', y='ratings', hue='cluster')

In [None]:
df.head()