# Feature Engineering for Standup Scripts

## Imports

In [308]:
import pickle
import numpy as np
import pandas as pd
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import re
from collections import Counter, defaultdict
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary
import spacy

In [309]:
# English stop-word list from NLTK's stopwords corpus.
sw = stopwords.words("english")

In [310]:
# Load the raw transcripts table; later cells read its 'description' and
# 'transcript' columns and append the engineered feature columns to it.
transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')

In [311]:
# Load the pickled IMDb title results (one record per show).
# NOTE: pickle.load can execute arbitrary code while unpickling, so this must
# only ever be pointed at files produced by this project.
with open('../data/imdb_title_results_2022-05-23.pickle', 'rb') as file:
    show_meta = pickle.load(file)

## add metadata to transcripts_df

Drop:
- type
- fullTitle
- languages
- languageList
- boxOffice
- keywords
- keywordList
- plot
- plotLocal
- plotLocalIsRtl
- stars

Separate Table:
- similars

In [312]:
# Build one single-row frame per show (row label = its position in show_meta)
# and stack them into a single metadata table.
meta = pd.concat(
    pd.DataFrame.from_dict({position: record}, orient='index')
    for position, record in enumerate(show_meta)
)

In [313]:
def extract_artist(Series):
    """Return the performer's name for one row of show metadata.

    The name is the text before the FIRST colon in ``Series['title']``
    (``"Hannah Gadsby: Nanette"`` -> ``"Hannah Gadsby"``).  When the title
    has no colon, the first comma-separated entry of ``Series['writers']``
    is used, or the whole ``writers`` value when it contains no comma.

    Parameters:
        Series: any object indexable by ``'title'`` and ``'writers'``
            (a DataFrame row or a plain dict).  ``'writers'`` is only read
            when the title does not contain a colon.

    Returns:
        The extracted name as a string.
    """
    # Non-greedy so a subtitle that itself contains a colon
    # ("Artist: Part One: Live") does not leak into the artist name.
    title_prefix = re.match(r"(.+?):", Series['title'])
    if title_prefix:
        return title_prefix.group(1)

    # Only fall back to the writers field when the title gave nothing, so a
    # missing writers value cannot break rows that never needed it.
    writers = Series['writers']
    first_writer = re.match(r"(.+?),", writers)
    if first_writer:
        return first_writer.group(1)
    return writers

In [314]:
# Derive the artist for every show; the row label from iterrows() is not needed.
meta['artist'] = [extract_artist(record) for _, record in meta.iterrows()]

In [315]:
# Manually fill the missing runtime with 60 minutes. Note this fills EVERY
# null in the column, not just one row. The value is kept as a string because
# extract_mins is applied to this column afterwards and runs a regex on each value.
meta.loc[meta['runtimeMins'].isnull(), 'runtimeMins'] = '60'

In [316]:
# Keep only the columns that have fewer than 200 missing values.
meta = meta.loc[:, meta.isna().sum() < 200]
def extract_mins(string):
    """Convert a runtime string to a whole number of minutes.

    A string containing an upper-case ``'H'`` is read as hours, optionally
    followed by minutes: the first number is the hour count and the second
    number, if present, is added as minutes (``'1H 12M'`` -> 72,
    ``'2H'`` -> 120).  Any other string is read as plain minutes and its
    first number is returned (``'72'`` -> 72).

    Parameters:
        string: the runtime text to parse.

    Returns:
        The runtime in minutes as an ``int``.

    Raises:
        ValueError: if ``string`` contains no digits at all.
    """
    numbers = [int(digits) for digits in re.findall(r'\d+', string)]
    if not numbers:
        raise ValueError(f"no number found in runtime string: {string!r}")
    if 'H' in string:
        hours = numbers[0]
        # Keep the minutes part instead of silently truncating to whole hours.
        minutes = numbers[1] if len(numbers) > 1 else 0
        return hours * 60 + minutes
    return numbers[0]
# Parse every runtime string into an integer number of minutes.
meta['runtimeMins'] = meta['runtimeMins'].map(extract_mins)

In [317]:
# dropcols = [
# 'type',
# 'originalTitle',
# 'fullTitle',
# 'languages',
# 'languageList',
# 'boxOffice',
# 'keywords',
# 'keywordList',
# 'plot',
# 'plotLocal',
# 'plotLocalIsRtl',
# 'directors', 
# 'directorList', 
# 'writers',
# 'writerList', 
# 'stars', 
# 'starList', 
# 'actorList',
# 'countries',
# 'countryList'
# ]

# meta = meta.drop(columns=dropcols)

In [318]:
# Columns carried forward into the analysis; everything else is discarded.
keepcols = [
    'id', 'title', 'year', 'image', 'releaseDate', 'runtimeMins',
    'runtimeStr', 'awards', 'genres', 'genreList', 'companies',
    'companyList', 'contentRating', 'imDbRating', 'imDbRatingVotes',
    'similars', 'artist',
]
meta = meta.loc[:, keepcols]

In [325]:
# Cast the numeric columns in one pass, then parse the release date.
meta = meta.astype({'year': int, 'imDbRating': float, 'imDbRatingVotes': int})
meta['releaseDate'] = pd.to_datetime(meta['releaseDate'])

In [324]:
# Display the columns retained in meta.
meta.columns

Index(['id', 'title', 'year', 'image', 'releaseDate', 'runtimeMins',
       'runtimeStr', 'awards', 'genres', 'genreList', 'companies',
       'companyList', 'contentRating', 'imDbRating', 'imDbRatingVotes',
       'similars', 'artist'],
      dtype='object')

In [326]:
# Strip plot of the vote counts, one point per row of meta; hovering shows the title.
px.strip(meta, x = 'imDbRatingVotes', hover_data = ['title'])

In [327]:
# Rating plotted against number of votes, one point per row of meta;
# hovering shows the title.
px.scatter(meta,
           x = 'imDbRatingVotes',
           y = 'imDbRating',
           hover_data = ['title'])

In [328]:
# Spot check of the artist extraction: rows whose artist contains 'Hannah'.
meta[meta['artist'].str.contains('Hannah')]

Unnamed: 0,id,title,year,image,releaseDate,runtimeMins,runtimeStr,awards,genres,genreList,companies,companyList,contentRating,imDbRating,imDbRatingVotes,similars,artist
43,tt10332256,Hannah Gadsby: Douglas,2020,https://imdb-api.com/images/original/MV5BZTJiO...,2020-05-26,72,1h 12min,"Nominated for 2 Primetime Emmys, 1 win & 5 nom...",Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",Irwin Entertainment,"[{'id': 'co0193199', 'name': 'Irwin Entertainm...",TV-MA,7.6,3938,[],Hannah Gadsby
139,tt8465676,Hannah Gadsby: Nanette,2018,https://imdb-api.com/images/original/MV5BY2I3M...,2018-06-19,69,1h 9min,"Won 1 Primetime Emmy, 3 wins & 5 nominations t...","Documentary, Comedy","[{'key': 'Documentary', 'value': 'Documentary'...",Guesswork Television,"[{'id': 'co0396457', 'name': 'Guesswork Televi...",TV-MA,8.1,12185,"[{'id': 'tt10332256', 'title': 'Hannah Gadsby:...",Hannah Gadsby


## word lengths, sentence lengths, distinct words
- word lengths (letters per word)
    * tokenize words (allow apostrophes and dashes but not numbers)
    * do not lemmatize
    * do not remove stopwords
- sentence lengths (words per sentence)
    * tokenize sentences and then count whitespaces
    * do not remove stopwords
    * get arrays so we can do mean, median, boxplot values, standard deviation
- distinct words per total words
    * tokenize and lemmatize words (allow apostrophes and dashes but not numbers)

In [13]:
# Split each description on ':' and collect the ones that contain no colon
# at all, paired with their position in the column (not the DataFrame label).
splits = transcripts_df['description'].str.split(':')
weird_labels = [pair for pair in enumerate(splits) if len(pair[1]) == 1]

In [18]:
# Map each description to its transcript, in row order.
# NOTE(review): duplicate descriptions would collapse into one key, leaving
# fewer entries than transcripts_df has rows - confirm descriptions are unique.
transcripts_dict = dict(zip(transcripts_df['description'], transcripts_df['transcript']))

In [31]:
# Descriptions in the same order as the transcripts in transcripts_dict.
descriptions = list(transcripts_dict)

### word lengths

In [45]:
# Runs of ASCII letters per transcript, original casing preserved. With this
# pattern digits, apostrophes and hyphens are not part of any token.
bow_cased = [regexp_tokenize(transcript, r"[a-zA-Z]+") for transcript in transcripts_dict.values()]

# Lower-case each token once and reuse the result for both the per-script
# word counts and the gensim dictionary/corpus.
tokenized_list = [[word.lower() for word in script_words] for script_words in bow_cased]
bow_counter = [Counter(script_tokens) for script_tokens in tokenized_list]

dictionary = Dictionary(tokenized_list)
corpus = [dictionary.doc2bow(script) for script in tokenized_list]

In [52]:
# Letters per word, one list of lengths per transcript.
word_lengths = [list(map(len, script_tokens)) for script_tokens in tokenized_list]

In [59]:
# Per-transcript summary statistics of word length.
transcripts_df['mean word length'] = list(map(np.mean, word_lengths))
transcripts_df['std word length'] = list(map(np.std, word_lengths))

# quantile/0.25 is a float, so the columns are named 'Q1.0 ...', 'Q2.0 ...', 'Q3.0 ...'.
for quantile in (0.25, 0.50, 0.75):
    transcripts_df[f'Q{quantile/0.25} word length'] = [np.quantile(lengths, quantile) for lengths in word_lengths]

transcripts_df['max word length'] = list(map(np.max, word_lengths))

### sentence lengths

In [99]:
# Sentences per transcript; only the transcript text is needed, not its description.
sent_tokenized_list = [sent_tokenize(transcript) for transcript in transcripts_dict.values()]
# Words per sentence: runs of word characters, apostrophes (straight or curly) and hyphens.
sent_words_tokenized_list = [[regexp_tokenize(sent, r"[’'\-\w]+") for sent in sent_script] for sent_script in sent_tokenized_list]
# Sentence length in words, and number of sentences, per transcript.
sent_lengths = [[len(sent) for sent in script] for script in sent_words_tokenized_list]
sent_counts = [len(script) for script in sent_tokenized_list]

In [78]:
# Per-transcript summary statistics of sentence length (words per sentence).
transcripts_df['mean sentence length'] = list(map(np.mean, sent_lengths))
transcripts_df['std sentence length'] = list(map(np.std, sent_lengths))

# quantile/0.25 is a float, so the columns are named 'Q1.0 ...', 'Q2.0 ...', 'Q3.0 ...'.
for quantile in (0.25, 0.50, 0.75):
    transcripts_df[f'Q{quantile/0.25} sentence length'] = [np.quantile(lengths, quantile) for lengths in sent_lengths]

transcripts_df['max sentence length'] = list(map(np.max, sent_lengths))

## distinct words and distinct words per total words

In [82]:
from nltk.stem.wordnet import WordNetLemmatizer

In [83]:
from nltk.stem.wordnet import WordNetLemmatizer

# Count lemmas per transcript: lower-case each token, lemmatize it with the
# WordNet lemmatizer's default settings, then tally.
lemmatizer = WordNetLemmatizer()
lem_counter = [
    Counter(map(lemmatizer.lemmatize, map(str.lower, script_words)))
    for script_words in bow_cased
]

In [102]:
# Distinct lemmas and total tokens per transcript.
unique_word_counts = list(map(len, lem_counter))
total_word_counts = [np.sum(list(script_lem_counts.values())) for script_lem_counts in lem_counter]
# Vocabulary richness: distinct lemmas per token and per sentence.
unique_total_ratio = [n_unique/n_total for n_unique, n_total in zip(unique_word_counts, total_word_counts)]
unique_per_sent = [n_unique/n_sents for n_unique, n_sents in zip(unique_word_counts, sent_counts)]

In [107]:
# Attach the vocabulary features to the transcripts table. The lists are
# assigned positionally, so they must be in the same order as its rows.
transcripts_df['unique words'] = unique_word_counts
transcripts_df['total words'] = total_word_counts
transcripts_df['proportion unique words'] = unique_total_ratio
transcripts_df['unique words per sentence'] = unique_per_sent