# Feature Engineering for Standup Scripts

## Imports

In [222]:
import pickle
import numpy as np
import pandas as pd
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import re

In [223]:
#transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')
transcripts_df = pd.read_pickle('../data/transcripts_df_2022-05-26.pickle')

In [224]:
# with open(f'../data/imdb_title_results_2022-05-23.pickle', 'rb') as file:
#     show_meta = pickle.load(file)

with open(f"../data/imdb_full_title_results_2022-05-26", 'rb') as file:
    show_meta = pickle.load(file)

## add metadata to transcripts_df

In [225]:
meta = pd.concat([pd.DataFrame.from_dict({index: show}, orient = 'index') for index, show in enumerate(show_meta)])

In [226]:
metascripts_all = pd.concat([transcripts_df, meta], axis = 1)
metascripts = metascripts_all.drop(metascripts_all.index[325:380])

In [227]:
def extract_artist(Series):
    colonoscopy = re.search(r"^.+:", Series['title'])
    commoscopy = re.search(r"^.+,", Series['writers'])
    if colonoscopy:
        return Series['title'][:colonoscopy.end()-1]
    elif commoscopy:
        return Series['writers'][:commoscopy.end()-1]
    else:
        return Series['writers']


In [228]:
problem_rows = []
for ind, row in metascripts.iterrows():
    try:
        extract_artist(row)
    except:
        problem_rows.append(ind)
problem_rows

[73, 114, 126, 135, 192, 200, 201, 202, 259, 282]

In [229]:
metascripts = metascripts.drop(metascripts.index[problem_rows])

In [230]:
metascripts['artist'] = [extract_artist(row) for ind, row in  metascripts.iterrows()]

In [231]:
# Manually fill the one missing runtime
metascripts.loc[metascripts['runtimeMins'].isnull(), 'runtimeMins'] = '60'

In [232]:
metascripts = metascripts[metascripts.columns[metascripts.isnull().sum() < 200]]
def extract_mins(string):
    if 'H' in string:
        return int(re.search('\d+', string).group(0))*60
    else:
        return int(re.search('\d+', string).group(0))
metascripts['runtimeMins'] = metascripts['runtimeMins'].apply(extract_mins)

In [233]:
keepcols = ['description', 'link', 'transcript', 'script characters',
       'id', 'artist', 'title', 'fullTitle', 'year', 'image', 'releaseDate', 'runtimeMins',
       'runtimeStr', 'awards', 'genres', 'genreList', 'companies',
       'companyList', 'contentRating', 'imDbRating', 'imDbRatingVotes',
       'similars', 'languages', 'languageList']
       
metascripts = metascripts[keepcols]

In [234]:
metascripts['year'] = metascripts['year'].astype(int)
metascripts['releaseDate'] = pd.to_datetime(metascripts['releaseDate'])
metascripts['imDbRating'] = metascripts['imDbRating'].astype(float)
metascripts['imDbRatingVotes'] = metascripts['imDbRatingVotes'].fillna('0').astype(int)

In [235]:
metascripts.head()

Unnamed: 0,description,link,transcript,script characters,id,artist,title,fullTitle,year,image,...,genres,genreList,companies,companyList,contentRating,imDbRating,imDbRatingVotes,similars,languages,languageList
0,Jim Gaffigan: Comedy Monster (2021) | Transcript,https://scrapsfromtheloft.com/comedy/jim-gaffi...,"Thank you! Thank you! Oh, my gosh. Thank you s...",49799,tt15907298,Jim Gaffigan,Jim Gaffigan: Comedy Monster,Jim Gaffigan: Comedy Monster (2021),2021,https://imdb-api.com/images/original/MV5BMDcyN...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",The Nacelle Company,"[{'id': 'co0649705', 'name': 'The Nacelle Comp...",TV-14,6.8,1618,"[{'id': 'tt6090102', 'title': 'Jim Gaffigan: C...",English,"[{'key': 'English', 'value': 'English'}]"
1,Louis C. K.: Sorry (2021) | Transcript,https://scrapsfromtheloft.com/comedy/louis-c-k...,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,44669,tt16491756,Louis C.K.,Sorry,Sorry (2021),2021,https://imdb-api.com/images/original/MV5BOWNkN...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",,[],,7.7,2363,"[{'id': 'tt12087624', 'title': 'Sincerely Loui...",English,"[{'key': 'English', 'value': 'English'}]"
2,Drew Michael: Drew Michael (2018) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,“This is the latest I’ve stayed up in a long t...,40006,tt8563704,Drew Michael,Drew Michael: Drew Michael,Drew Michael: Drew Michael (2018),2018,https://imdb-api.com/images/original/MV5BMDkyZ...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",A24 Television,"[{'id': 'co0702684', 'name': 'A24 Television'}]",TV-MA,5.4,368,"[{'id': 'tt16153658', 'title': 'Drew Michael: ...",English,"[{'key': 'English', 'value': 'English'}]"
3,Drew Michael: Red Blue Green (2021) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,(EMOTIONAL MUSIC PLAYING) (MUSIC ENDS) DREW MI...,50422,tt16153658,Drew Michael,Drew Michael: Red Blue Green,Drew Michael: Red Blue Green (2021),2021,https://imdb-api.com/images/original/MV5BNTcxM...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]","Rotten Science, HBO Films","[{'id': 'co0602462', 'name': 'Rotten Science'}...",TV-MA,6.9,261,"[{'id': 'tt8563704', 'title': 'Drew Michael: D...",English,"[{'key': 'English', 'value': 'English'}]"
4,Mo Amer: Mohammed in Texas (2021) | Transcript,https://scrapsfromtheloft.com/comedy/mo-amer-m...,[quirky flute music playing] [single note pian...,58020,tt15845288,Mo Amer,Mo Amer: Mohammed in Texas,Mo Amer: Mohammed in Texas (2021),2021,https://imdb-api.com/images/original/MV5BMDI1M...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",A24,"[{'id': 'co0390816', 'name': 'A24'}]",TV-MA,6.5,615,"[{'id': 'tt9060526', 'title': 'Mo Amer: The Va...",English,"[{'key': 'English', 'value': 'English'}]"


In [236]:
metascripts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316 entries, 0 to 380
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   description        316 non-null    object        
 1   link               316 non-null    object        
 2   transcript         316 non-null    object        
 3   script characters  316 non-null    int64         
 4   id                 316 non-null    object        
 5   artist             316 non-null    object        
 6   title              316 non-null    object        
 7   fullTitle          316 non-null    object        
 8   year               316 non-null    int32         
 9   image              316 non-null    object        
 10  releaseDate        304 non-null    datetime64[ns]
 11  runtimeMins        316 non-null    int64         
 12  runtimeStr         314 non-null    object        
 13  awards             316 non-null    object        
 14  genres    

## Ratings and Rating Counts

In [237]:
px.box(metascripts, 
       x = 'imDbRating', 
       hover_data = ['title'], 
       color_discrete_sequence=['saddlebrown'], 
       points = 'all',
       title = "<b>Boxplot & Swarm of IMDb Ratings: ratings are normal, skewed somewhat left<b>")

In [238]:
px.box(metascripts, 
       x = 'imDbRatingVotes', 
       hover_data = ['title'], 
       color_discrete_sequence=['saddlebrown'], 
       points = 'all',
       title= "<b>Boxplot & Swarm of IMDb Rating Counts: the distribution is heavily right-skewed")

In [239]:
px.scatter(metascripts,
           x = 'imDbRatingVotes',
           y = 'imDbRating',
           #log_x=True,
           hover_data = ['title'],
           color_discrete_sequence=['saddlebrown'],
           title = "<b>IMDb Rating vs Rating Count: ratings appear logarithmically correlated with rating count")

In [240]:
px.scatter(metascripts,
           x = 'imDbRatingVotes',
           y = 'imDbRating',
           log_x=True,
           hover_data = ['title'],
           color_discrete_sequence=['saddlebrown'],
           title = "<b>IMDb Rating vs Rating Count: ratings appear logarithmically correlated with rating count")

Let's check the correlations and p-values between rating, rating votes, and the log of rating votes

In [241]:
metascripts[['imDbRating', 'imDbRatingVotes']].assign(log_imDbRatingVotes = lambda x: np.log(x['imDbRatingVotes'])).corr()


divide by zero encountered in log



Unnamed: 0,imDbRating,imDbRatingVotes,log_imDbRatingVotes
imDbRating,1.0,0.344974,0.385701
imDbRatingVotes,0.344974,1.0,0.7327
log_imDbRatingVotes,0.385701,0.7327,1.0


In [247]:
from scipy.stats import pearsonr

def calculate_pvalues(df):
    df = df.dropna()._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            pvalues[r][c] = round(pearsonr(df[r], df[c])[1], 4)
    return pvalues

calculate_pvalues(metascripts[['imDbRating', 'imDbRatingVotes']].assign(log_imDbRatingVotes = lambda x: np.log(x['imDbRatingVotes']+1)))

Unnamed: 0,imDbRating,imDbRatingVotes,log_imDbRatingVotes
imDbRating,0.0,0.0,0.0
imDbRatingVotes,0.0,0.0,0.0
log_imDbRatingVotes,0.0,0.0,0.0


## Number of Shows
- By decade
- by artist

In [243]:
fig = px.histogram(metascripts, x = 'year', color_discrete_sequence=['saddlebrown'], title = '<b>Number of Shows by Decade: the vast majority of shows are in the 2010s<b>')
fig.update_traces(xbins_size=10)

In [244]:
artist_counts = metascripts['artist'].value_counts().reset_index().rename(columns = {'index':'artist', 'artist':'count'})
px.box(artist_counts, 
       x = 'count', 
       hover_data=['artist'], 
       color_discrete_sequence=['saddlebrown'], 
       points = 'all',
       title = '<b>Boxplot & Swarm of Number of Shows by Artist: some major outliers, but almost everyone is 3 or under<b>')

## Runtime in Minutes

In [245]:
px.histogram(metascripts, x = 'runtimeMins', color_discrete_sequence=['saddlebrown'], title = "<b>Histogram of Runtimes in Minutes<b>")

In [246]:
px.box(metascripts, 
       x = 'runtimeMins', 
       hover_data=['artist', 'title'], 
       color_discrete_sequence=['saddlebrown'], 
       points = 'all',
       title = '<b>Boxplot & Swarm of Runtime in Minutes: right-skewed with some wildly long shows<b>')