# The Office Scratchpad

In [1]:
# imports

import pandas as pd

# Acquire

In [2]:
def the_office(source_path='the_office_series.csv', output_path='the_office.csv'):
    '''
    Read the office series csv file, tidy it up and save the result.

    The GuestStars column is dropped entirely (too many null values),
    "Unnamed: 0" and "EpisodeTitle" are renamed to "Episode" and
    "Episode_Title", and every column name is converted to lowercase.

    Parameters
    ----------
    source_path : str or path-like, default 'the_office_series.csv'
        csv file holding the raw series data.
    output_path : str or path-like, default 'the_office.csv'
        csv file the cleaned data is written to.

    Returns
    -------
    pandas.DataFrame
        The cleaned data that was written to output_path.
    '''
    df = (
        # reading the office series data from a csv file
        pd.read_csv(source_path)
        # dropping the GuestStars column, too many null values
        .drop(columns='GuestStars')
        # renaming the columns for readability
        .rename(columns={"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title"})
        # convert column names to lowercase
        .rename(columns=str.lower)
    )
    # saving the office data to a csv
    # NOTE: the row index is written as well (pandas default), so reading this
    # file back yields an extra unnamed first column unless index_col=0 is used
    df.to_csv(output_path)

    return df

In [4]:
# Build the cleaned episode frame with the_office() (which also writes
# the_office.csv) and preview the first rows
df = the_office()
df.head()

Unnamed: 0,episode,season,episode_title,about,ratings,votes,viewership,duration,date,director,writers
0,0,1,Pilot,The premiere episode introduces the boss and s...,7.5,4936,11.2,23,24 March 2005,Ken Kwapis,Ricky Gervais |Stephen Merchant and Greg Daniels
1,1,1,Diversity Day,Michael's off color remark puts a sensitivity ...,8.3,4801,6.0,23,29 March 2005,Ken Kwapis,B. J. Novak
2,2,1,Health Care,Michael leaves Dwight in charge of picking the...,7.8,4024,5.8,22,5 April 2005,Ken Whittingham,Paul Lieberstein
3,3,1,The Alliance,"Just for a laugh, Jim agrees to an alliance wi...",8.1,3915,5.4,23,12 April 2005,Bryan Gordon,Michael Schur
4,4,1,Basketball,Michael and his staff challenge the warehouse ...,8.4,4294,5.0,23,19 April 2005,Greg Daniels,Greg Daniels


In [None]:
# reading the raw office series csv file directly for exploration
# NOTE: this rebinds df, replacing the cleaned frame returned by the_office()
df = pd.read_csv('the_office_series.csv')
df.head() # preview the first rows

In [5]:
# column names, non-null counts and dtypes at a glance
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   episode        188 non-null    int64  
 1   season         188 non-null    int64  
 2   episode_title  188 non-null    object 
 3   about          188 non-null    object 
 4   ratings        188 non-null    float64
 5   votes          188 non-null    int64  
 6   viewership     188 non-null    float64
 7   duration       188 non-null    int64  
 8   date           188 non-null    object 
 9   director       188 non-null    object 
 10  writers        188 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 16.3+ KB


In [None]:
# Normalise the headers in place: every '.' becomes '_' and all letters are lower-cased
df.columns = [name.replace('.', '_').lower() for name in df.columns]

In [None]:
# preview the frame after the column-name cleanup
df.head()

In [None]:
# looking at the shape of the data: (number of rows, number of columns)
df.shape

In [None]:
# looking at the data info: dtype and non-null count for each column
df.info()

In [None]:
# Tally the missing values in each column
df.isnull().sum()

It looks like the only nulls are in the `GuestStars` column.
I will build a separate guest-star df from the episodes that have guest stars, and then drop that column from the original df.

In [6]:
def guest_office(source_path='the_office_series.csv', output_path='the_office_guest.csv'):
    '''
    Build a df of only the episodes that have guest stars and save it.

    Rows whose GuestStars value is null are removed (the remaining rows keep
    their original index labels), "Unnamed: 0", "EpisodeTitle" and
    "GuestStars" are renamed to "Episode", "Episode_Title" and "Guest_Stars",
    and every column name is converted to lowercase.

    Parameters
    ----------
    source_path : str or path-like, default 'the_office_series.csv'
        csv file holding the raw series data.
    output_path : str or path-like, default 'the_office_guest.csv'
        csv file the guest-star data is written to.

    Returns
    -------
    pandas.DataFrame
        The guest-star rows that were written to output_path.
    '''
    # reading the office series data from a csv file
    df = pd.read_csv(source_path)
    # keeping only the rows that have a value in the GuestStars column
    df = df[df['GuestStars'].notna()]
    df = (
        # renaming the columns for readability
        df.rename(columns={"Unnamed: 0": "Episode", "EpisodeTitle": "Episode_Title", "GuestStars": "Guest_Stars"})
        # convert column names to lowercase
        .rename(columns=str.lower)
    )
    # saving the guest star data to a csv
    # NOTE: the row index is written as well (pandas default), so reading this
    # file back yields an extra unnamed first column unless index_col=0 is used
    df.to_csv(output_path)

    return df
    

In [7]:
# Build the guest-star subset with guest_office() (which also writes
# the_office_guest.csv) and preview the first rows
guest_df = guest_office()
guest_df.head()

Unnamed: 0,episode,season,episode_title,about,ratings,votes,viewership,duration,date,guest_stars,director,writers
5,5,1,Hot Girl,Michael is just one of the many male staff who...,7.7,3854,4.8,23,26 April 2005,Amy Adams,Amy Heckerling,Mindy Kaling
8,8,2,Office Olympics,"Ready to finalize his deal for a new condo, Mi...",8.4,3665,8.3,22,4 October 2005,Nancy Carell,Paul Feig,Michael Schur
9,9,2,The Fire,A fire in the kitchen relegates the staff to t...,8.4,3607,7.6,22,11 October 2005,Amy Adams,Ken Kwapis,B. J. Novak
12,12,2,The Client,With Michael and Jan Levinson-Gould away from ...,8.6,3533,7.5,22,8 November 2005,Tim Meadows,Greg Daniels,Paul Lieberstein
14,14,2,E-Mail Surveillance,Michael decides to set up e-mail surveillance ...,8.4,3338,8.1,23,22 November 2005,Ken Jeong,Paul Feig,Jennifer Celotta


In [None]:
# (number of rows, number of columns) of the guest-star subset
guest_df.shape

In [None]:
# preview df before the guest-star column is dropped
df.head()

In [None]:
# Drop the guest-star column from the main df.
# The match is case-insensitive because an earlier cell lower-cases the column
# names, after which the exact label 'GuestStars' no longer exists and
# drop() would raise KeyError. An empty match also makes the cell safe to re-run.
df = df.drop(columns=[col for col in df.columns if str(col).lower() == 'gueststars'])

In [None]:
# preview df again to check the result of the drop
df.head()

In [None]:
# look at the last rows of df as well
df.tail()

In [None]:
# convert column names to lowercase, replace '.' in column names with '_'
# Assign to df.columns only: the chained form `df = df.columns = [...]` binds
# targets left to right, so it replaced df with the plain list of names and
# then failed trying to set .columns on that list.
df.columns = [col.lower().replace('.', '_') for col in df]

In [12]:
# Fetch the full row of the most-watched episode: idxmax() returns the index
# label of the largest viewership value and .loc pulls that row out
max_views = df.loc[df['viewership'].idxmax()]
max_views

episode                                                         77
season                                                           5
episode_title                                        Stress Relief
about            Dwight's too-realistic fire alarm gives Stanle...
ratings                                                        9.7
votes                                                         8170
viewership                                                   22.91
duration                                                        60
date                                               1 February 2009
director                                             Jeffrey Blitz
writers                                           Paul Lieberstein
Name: 77, dtype: object