# Feature Engineering for Standup Scripts

## Imports

In [171]:
import pickle
import numpy as np
import pandas as pd
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import re
from collections import Counter, defaultdict
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary
import spacy

In [172]:
# English stopword list from NLTK; used further down for stopword filtering.
sw = stopwords.words("english")

In [173]:
# Load the pickled raw transcripts (presumably a DataFrame, going by the name).
# NOTE(review): transcripts_df is not referenced again in the visible cells; confirm it is still needed.
transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')

In [174]:
# Load the pickled IMDb title results.
# Plain string literal: the previous f-string had no placeholders.
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
with open('../data/imdb_title_results_2022-05-23.pickle', 'rb') as file:
    show_meta = pickle.load(file)

In [175]:
# Load the pickled transcript + metadata table that the rest of the notebook works on.
# Plain string literal: the previous f-string had no placeholders.
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
with open('../data/metascripts_df_2022-05-28.pickle', 'rb') as file:
    metascripts = pickle.load(file)

In [176]:
# Preview the first rows (transcript text alongside the title metadata columns).
metascripts.head()

Unnamed: 0,description,link,transcript,script characters,id,artist,title,fullTitle,year,image,...,genres,genreList,companies,companyList,contentRating,imDbRating,imDbRatingVotes,similars,languages,languageList
0,Jim Gaffigan: Comedy Monster (2021) | Transcript,https://scrapsfromtheloft.com/comedy/jim-gaffi...,"Thank you! Thank you! Oh, my gosh. Thank you s...",49799,tt15907298,Jim Gaffigan,Jim Gaffigan: Comedy Monster,Jim Gaffigan: Comedy Monster (2021),2021,https://imdb-api.com/images/original/MV5BMDcyN...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",The Nacelle Company,"[{'id': 'co0649705', 'name': 'The Nacelle Comp...",TV-14,6.8,1618,"[{'id': 'tt6090102', 'title': 'Jim Gaffigan: C...",English,"[{'key': 'English', 'value': 'English'}]"
1,Louis C. K.: Sorry (2021) | Transcript,https://scrapsfromtheloft.com/comedy/louis-c-k...,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,44669,tt16491756,Louis C.K.,Sorry,Sorry (2021),2021,https://imdb-api.com/images/original/MV5BOWNkN...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",,[],,7.7,2363,"[{'id': 'tt12087624', 'title': 'Sincerely Loui...",English,"[{'key': 'English', 'value': 'English'}]"
2,Drew Michael: Drew Michael (2018) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,“This is the latest I’ve stayed up in a long t...,40006,tt8563704,Drew Michael,Drew Michael: Drew Michael,Drew Michael: Drew Michael (2018),2018,https://imdb-api.com/images/original/MV5BMDkyZ...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",A24 Television,"[{'id': 'co0702684', 'name': 'A24 Television'}]",TV-MA,5.4,368,"[{'id': 'tt16153658', 'title': 'Drew Michael: ...",English,"[{'key': 'English', 'value': 'English'}]"
3,Drew Michael: Red Blue Green (2021) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,(EMOTIONAL MUSIC PLAYING) (MUSIC ENDS) DREW MI...,50422,tt16153658,Drew Michael,Drew Michael: Red Blue Green,Drew Michael: Red Blue Green (2021),2021,https://imdb-api.com/images/original/MV5BNTcxM...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]","Rotten Science, HBO Films","[{'id': 'co0602462', 'name': 'Rotten Science'}...",TV-MA,6.9,261,"[{'id': 'tt8563704', 'title': 'Drew Michael: D...",English,"[{'key': 'English', 'value': 'English'}]"
4,Mo Amer: Mohammed in Texas (2021) | Transcript,https://scrapsfromtheloft.com/comedy/mo-amer-m...,[quirky flute music playing] [single note pian...,58020,tt15845288,Mo Amer,Mo Amer: Mohammed in Texas,Mo Amer: Mohammed in Texas (2021),2021,https://imdb-api.com/images/original/MV5BMDI1M...,...,Comedy,"[{'key': 'Comedy', 'value': 'Comedy'}]",A24,"[{'id': 'co0390816', 'name': 'A24'}]",TV-MA,6.5,615,"[{'id': 'tt9060526', 'title': 'Mo Amer: The Va...",English,"[{'key': 'English', 'value': 'English'}]"


In [177]:
# (rows, columns) of metascripts.
metascripts.shape

(316, 24)

## word lengths, sentence lengths, distinct words
- word lengths (letters per word)
    * tokenize words (allow apostrophes and dashes but not numbers)
    * do not lemmatize
    * do not remove stopwords
- sentence lengths (words per sentence)
    * tokenize sentences and then count the word tokens in each sentence
    * do not remove stopwords
    * get arrays so we can do mean, median, boxplot values, standard deviation
- distinct words per total words
    * tokenize and lemmatize words (allow apostrophes and dashes but not numbers)

In [178]:
# Remove bracketed and parenthetical annotations (e.g. "[quirky flute music playing]",
# "(MUSIC ENDS)") from the scripts.
# The pattern is a raw string: in a plain literal "\[" and "\(" are invalid escape
# sequences, which newer Python versions warn about.
# "." does not match newlines, so an annotation that spans several lines is left in place.
metascripts['transcript'] = metascripts['transcript'].replace(r"\[.+?\]|\(.+?\)", "", regex=True)

In [179]:
# Map each description to its cleaned transcript.
# If two rows share a description, the later row overwrites the earlier one.
transcripts_dict = {
    description: transcript
    for description, transcript in zip(
        metascripts['description'].values,
        metascripts['transcript'].values,
    )
}

In [182]:
# Parallel lists of descriptions and script texts, in dictionary insertion order.
descriptions = [*transcripts_dict]
scripts = [*transcripts_dict.values()]

In [183]:
# Sanity check: look for parentheticals that are still present in the scripts.
# parens is a list, not a generator: a generator would be exhausted by the first
# pass over it, leaving parenscripts permanently empty.
parens = [re.findall(r"\(.+?\)", script) for script in scripts]

# Scripts that still contain at least one parenthetical (lazy, as before).
parenscripts = (scripts[ind] for ind, matches in enumerate(parens) if matches)

# (script index, number of remaining parentheticals); last expression so the cell displays it.
[(ind, len(matches)) for ind, matches in enumerate(parens) if matches]

### word lengths

In [184]:
# Alphabetic-only tokens per script, original casing kept.
# NOTE(review): [a-zA-Z]+ splits contractions and hyphenated words ("don't" -> "don", "t"),
# while the section notes say apostrophes and dashes should be allowed; confirm which is intended.
bow_cased = [regexp_tokenize(script_text, r"[a-zA-Z]+") for script_text in transcripts_dict.values()]

# Lower-cased tokens per script, and their per-script frequency counts.
tokenized_list = [[token.lower() for token in tokens] for tokens in bow_cased]
bow_counter = [Counter(tokens) for tokens in tokenized_list]

# gensim vocabulary over all scripts and the bag-of-words encoding of each script.
dictionary = Dictionary(tokenized_list)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_list]

In [185]:
# Character count of every token, one list per script.
word_lengths = [list(map(len, tokens)) for tokens in tokenized_list]

In [186]:
# Per-script summary statistics of word length, stored as new metascripts columns.
metascripts['mean word length'] = list(map(np.mean, word_lengths))
metascripts['std word length'] = list(map(np.std, word_lengths))

# Quartiles. quantile/0.25 is a float, so the columns are named
# 'Q1.0 word length', 'Q2.0 word length' and 'Q3.0 word length'.
for quantile in (0.25, 0.50, 0.75):
    metascripts[f'Q{quantile/0.25} word length'] = [np.quantile(lengths, quantile) for lengths in word_lengths]

metascripts['max word length'] = list(map(np.max, word_lengths))

### sentence lengths

In [187]:
# Split every script into sentences.
sent_tokenized_list = [sent_tokenize(script_text) for script_text in transcripts_dict.values()]

# Word tokens of each sentence; the pattern keeps apostrophes (straight and curly),
# hyphens and \w characters, so digits and underscores count as word characters here.
sent_words_tokenized_list = [
    [regexp_tokenize(sentence, r"[’'\-\w]+") for sentence in sentences]
    for sentences in sent_tokenized_list
]

# Tokens per sentence, and sentences per script.
sent_lengths = [list(map(len, sentences)) for sentences in sent_words_tokenized_list]
sent_counts = list(map(len, sent_tokenized_list))

In [188]:
# Per-script summary statistics of sentence length (tokens per sentence), stored as new metascripts columns.
metascripts['mean sentence length'] = list(map(np.mean, sent_lengths))
metascripts['std sentence length'] = list(map(np.std, sent_lengths))

# Quartiles. quantile/0.25 is a float, so the columns are named
# 'Q1.0 sentence length', 'Q2.0 sentence length' and 'Q3.0 sentence length'.
for quantile in (0.25, 0.50, 0.75):
    metascripts[f'Q{quantile/0.25} sentence length'] = [np.quantile(lengths, quantile) for lengths in sent_lengths]

metascripts['max sentence length'] = list(map(np.max, sent_lengths))

## distinct words and distinct words per total words

In [189]:
from nltk.stem.wordnet import WordNetLemmatizer

In [190]:
# WordNet lemmatizer; lemmatize() is called with its default part of speech.
lemmatizer = WordNetLemmatizer()

# Frequency of each lower-cased, lemmatized token, one Counter per script.
lem_counter = [
    Counter(map(lemmatizer.lemmatize, map(str.lower, tokens)))
    for tokens in bow_cased
]

In [191]:
# Number of distinct lemmas and total token count for each script.
unique_word_counts = list(map(len, lem_counter))
total_word_counts = [np.sum(list(lemma_counts.values())) for lemma_counts in lem_counter]

# Lexical diversity: distinct lemmas per token, and distinct lemmas per sentence.
unique_total_ratio = [n_unique/n_total for n_unique, n_total in zip(unique_word_counts, total_word_counts)]
unique_per_sent = [n_unique/n_sents for n_unique, n_sents in zip(unique_word_counts, sent_counts)]

In [192]:
# Store the lexical-diversity features as metascripts columns.
# The lists are assigned positionally, so they must be in the same order as the metascripts rows.
metascripts['unique words'] = unique_word_counts
metascripts['total words'] = total_word_counts
metascripts['proportion unique words'] = unique_total_ratio
metascripts['unique words per sentence'] = unique_per_sent

## words per minute and sentences per minute

## repetition and phrases

In [193]:
from gensim.models import Phrases

In [194]:
# Tokenize each script, keeping apostrophes (straight and curly), hyphens and \w characters.
tok_scripts = [regexp_tokenize(transcript, r"[’'\-\w]+") for transcript in transcripts_dict.values()]

# Drop stopwords, then lemmatize.
# Tokens are lower-cased BEFORE the stopword test: the stopword list is lower-case, so
# testing the cased token let "The", "I", "And", ... slip through.
# Curly apostrophes are normalised for the comparison only, so "don’t" matches "don't";
# the token that is kept is unchanged.
# A set makes each membership test O(1) instead of a scan of the list.
sw_set = set(sw)
docs = [
    [
        lemmatizer.lemmatize(tok)
        for tok in map(str.lower, transcript)
        if tok.replace("’", "'") not in sw_set
    ]
    for transcript in tok_scripts
]

# Append detected bigrams to each document (min_count=5 ignores words and pairs seen
# fewer than 5 times). A single Phrases pass only joins pairs; trigrams would need a
# second pass over the bigram-transformed documents.
ngram = Phrases(docs, min_count=5)
for doc in docs:
    # Phrases joins the two words with "_". Build the list first so the document is
    # not extended while it is being transformed.
    # NOTE(review): \w also matches "_", so a raw token containing an underscore would
    # be appended here as well.
    bigrams = [token for token in ngram[doc] if '_' in token]
    doc.extend(bigrams)

## profanity

In [195]:
from profanityfilter import ProfanityFilter

In [196]:
# Profanity filter instance with default settings; not used yet in the visible cells.
pf = ProfanityFilter()

# Playground

### Remove brackets and parentheticals, as well as a check to ensure we don't accidentally remove too much
I'd also like to remove intro and exit music programmatically, but that's more fraught. Some shows deliberately contain music as content, and some scripts use an odd number of music signs, which makes it tough to single out lyrics.

In [197]:
# Toy line containing a bracketed audience annotation, used to try out the regex.
fake_tok = (
    "Thank you, thank you. [applause, laughter] "
    "Have you heard what Florida man's up to?"
)
# Greedy match from the first "[" to the last "]" on the line.
re.compile(r"\[.+\]").search(fake_tok)

<re.Match object; span=(22, 42), match='[applause, laughter]'>

In [198]:
# Remove the bracketed annotation, then trim leading/trailing whitespace
# (the double space left inside the sentence stays).
re.compile(r"\[.+\]").sub("", fake_tok).strip()

"Thank you, thank you.  Have you heard what Florida man's up to?"

In [199]:
# Prints only when the string is not a stopword AND contains a bracketed annotation.
if not (fake_tok in sw or re.search(r"\[.+\]", fake_tok) is None):
    print("Yup, that's true")

Yup, that's true


In [200]:
# Bracket-stripped version of one transcript; the result is not assigned or displayed
# because it is not the last expression in the cell.
re.sub(r"\[.+?\]", "", transcripts_dict['Tom Papa: Human Mule (2016) – Transcript'])
# First span enclosed by a pair of music-note symbols (None if there is no such span).
re.search(r"♪.+?♪", transcripts_dict['Tom Papa: Human Mule (2016) – Transcript'])

In [201]:
# One-shot generator over the script texts, for stepping through them one at a time.
gen = (script for script in transcripts_dict.values())

In [202]:
# Applied inside-out: first remove spans between double music notes (♪♪...♪♪), then spans
# between single music notes (♪...♪), then bracketed annotations.
re.sub(r"\[.+?\]", "", re.sub(r"♪.+?♪", "", re.sub(r"♪♪.+?♪♪", "", transcripts_dict['Dave Chappelle: The Closer (2021) | Transcript'])))

'     \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n Thank you.  Everybody have a seat, be comfortable, relax. I got to tell you… let’s go.  Thank you. I need you guys to know something. And I’m gonna tell you the truth, and don’t get freaked out. This is going to be my last special for a minute.  It is all good. Listen to me. I did it in Detroit for that reason.  That’s right. You wanna know why? ‘Cause I talked so much shit about Detroit in the first special I figured, I might as well, do the last special here. Sorry about that, by the way.  First of all, before I even start, I’m gonna say that “I’m rich and famous.”  And the only reason I say that is ’cause the last 17 months were hell, and I cannot imagine what everybody went through. Well, I’m happy to see you and I’m happy you’re well and I hope everyone you love is okay.  I don’t want you to worry about me, I’m… vaccinated, I…  got the Johnson & Johnson vaccine.  I got to admit, that’s probably the most n*ggaish decision I’ve made in a long t

In [203]:
# Check that bracket/parenthetical stripping does not remove too much of any script:
# modprop = ('script characters' - length of the stripped transcript) / 'script characters'.
# NOTE(review): assumes 'script characters' is the character count of the raw transcript; confirm.
# The pattern is a raw string: "\[" and "\(" are invalid escape sequences in a plain literal.
# The lambda parameter is named df so it no longer shadows the global metascripts, and
# .str.len() yields NaN for a missing transcript instead of raising.
mm = metascripts.assign(
    modprop = lambda df: (
        df['script characters']
        - df['transcript'].replace(r"\[.+?\]|\(.+?\)", "", regex=True).str.len()
    ) / df['script characters']
)

# Box plot of the removed proportion; hovering a point shows its row index and description.
px.box(mm, x = 'modprop', hover_data = [mm.index, 'description'])

In [204]:
# Inspect the transcript with index label 32.
metascripts['transcript'][32]



In [205]:
# The same transcript after stripping bracketed and parenthetical annotations.
# The pattern is a raw string: "\[" and "\(" are invalid escape sequences in a plain literal.
metascripts['transcript'].replace(r"\[.+?\]|\(.+?\)", "", regex=True)[32]

