# Feature Engineering for Standup Scripts

## Imports

In [49]:
import pickle
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from collections import Counter, defaultdict
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary
import spacy

In [43]:
# English stopword list taken from NLTK's stopwords corpus.
# NOTE(review): presumably requires the corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
sw = stopwords.words("english")

In [5]:
# Load the raw transcripts DataFrame (columns used below: 'description', 'transcript')
# from a pickle file addressed relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')

## word lengths, sentence lengths, distinct words
- word lengths (letters per word)
    * tokenize words (allow apostrophes and dashes but not numbers)
    * do not lemmatize
    * do not remove stopwords
- sentence lengths (words per sentence)
    * tokenize sentences and then count whitespaces
    * do not remove stopwords
    * get arrays so we can do mean, median, boxplot values, standard deviation
- distinct words per total words
    * tokenize and lemmatize words (allow apostrophes and dashes but not numbers)

In [13]:
# Break every description on ':' and collect the ones that contain no colon at all
# (their split has a single piece). Each entry is (positional index, split pieces).
splits = transcripts_df['description'].str.split(pat=':')
weird_labels = [
    numbered_split
    for numbered_split in enumerate(splits)
    if len(numbered_split[1]) == 1
]

In [18]:
# Map each description to its transcript text. Keys are the 'description' values,
# so rows that share a description collapse into one entry (the last row wins).
transcripts_dict = {
    description: transcript
    for description, transcript in zip(
        transcripts_df['description'].values,
        transcripts_df['transcript'].values,
    )
}

In [31]:
# All descriptions, in the dict's insertion order.
descriptions = [*transcripts_dict]

### word lengths

In [45]:
# Tokenize every transcript into runs of ASCII letters: one token list per DataFrame row.
# The 'transcript' column is iterated directly (rather than transcripts_dict) so the
# result stays row-aligned with transcripts_df even when two rows share a description;
# the dict collapses such rows, and later cells assign per-script lists back as
# DataFrame columns, which requires exactly one list per row.
# NOTE(review): this pattern splits on apostrophes and dashes ("don't" -> "don", "t"),
# whereas the plan in the section notes says to allow them — confirm which is intended.
bow_cased = [regexp_tokenize(transcript, r"[a-zA-Z]+") for transcript in transcripts_df['transcript']]

# Lower-case each script once and reuse the result for both the counters and the corpus.
tokenized_list = [[word.lower() for word in script_words] for script_words in bow_cased]

# Per-script word frequencies (lower-cased tokens).
bow_counter = [Counter(script_tokens) for script_tokens in tokenized_list]

# gensim vocabulary over all scripts, and each script as a bag-of-words vector.
dictionary = Dictionary(tokenized_list)
corpus = [dictionary.doc2bow(script) for script in tokenized_list]

In [52]:
# Letters per word: for each script, the length of every token, in token order.
word_lengths = [list(map(len, script_tokens)) for script_tokens in tokenized_list]

In [59]:
def _stat_or_nan(stat, lengths, *args):
    """Apply ``stat`` to one script's word lengths; return NaN if the script has no words.

    np.max raises ValueError on an empty sequence, which would abort the cell
    after some columns had already been written. Empty scripts are therefore
    mapped to NaN explicitly for every statistic.
    """
    return stat(lengths, *args) if len(lengths) > 0 else np.nan


# Per-transcript summary statistics of word length (letters per word).
transcripts_df['mean word length'] = [_stat_or_nan(np.mean, lengths) for lengths in word_lengths]
transcripts_df['std word length'] = [_stat_or_nan(np.std, lengths) for lengths in word_lengths]

# quantile / 0.25 is a float, so the resulting column names are
# 'Q1.0 word length', 'Q2.0 word length' and 'Q3.0 word length'.
# The names are kept unchanged so anything referencing them keeps working.
for quantile in (0.25, 0.50, 0.75):
    transcripts_df[f'Q{quantile/0.25} word length'] = [
        _stat_or_nan(np.quantile, lengths, quantile) for lengths in word_lengths
    ]

transcripts_df['max word length'] = [_stat_or_nan(np.max, lengths) for lengths in word_lengths]

In [60]:
# Display the DataFrame, now including the word-length summary columns added above.
transcripts_df

Unnamed: 0,description,link,transcript,mean word length,std word length,Q1.0 word length,Q2.0 word length,Q3.0 word length,max word length
0,Jim Gaffigan: Comedy Monster (2021) | Transcript,https://scrapsfromtheloft.com/comedy/jim-gaffi...,"Thank you! Thank you! Oh, my gosh. Thank you s...",3.805854,2.103520,2.0,4.0,5.0,14
1,Louis C. K.: Sorry (2021) | Transcript,https://scrapsfromtheloft.com/comedy/louis-c-k...,♪♪ [“Like a Rolling Stone” by Bob Dylan playin...,3.672118,2.024954,2.0,3.0,5.0,14
2,Drew Michael: Drew Michael (2018) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,“This is the latest I’ve stayed up in a long t...,3.586754,1.980245,2.0,3.0,4.0,16
3,Drew Michael: Red Blue Green (2021) | Transcript,https://scrapsfromtheloft.com/comedy/drew-mich...,(EMOTIONAL MUSIC PLAYING) (MUSIC ENDS) DREW MI...,3.982294,2.302709,2.0,4.0,5.0,16
4,Mo Amer: Mohammed in Texas (2021) | Transcript,https://scrapsfromtheloft.com/comedy/mo-amer-m...,[quirky flute music playing] [single note pian...,3.895520,2.171687,2.0,4.0,5.0,20
...,...,...,...,...,...,...,...,...,...
376,JIM JEFFERIES ON GUN CONTROL [FULL TRANSCRIPT],https://scrapsfromtheloft.com/comedy/jim-jeffe...,by Jim Jefferies I’m gonna talk about somethin...,3.840161,2.076874,2.0,4.0,5.0,14
377,Reggie Watts: Spatial (2016) – Full Transcript,https://scrapsfromtheloft.com/comedy/reggie-wa...,"Hello, I’m Thomas. I’m so glad to meet you Mum...",3.650907,1.929205,2.0,3.0,5.0,14
378,GEORGE CARLIN: COMPLAINTS AND GRIEVANCES (2001...,https://scrapsfromtheloft.com/comedy/george-ca...,Complaints and Grievances is a HBO stand-up sp...,3.986999,2.151815,2.0,4.0,5.0,18
379,GEORGE CARLIN: LIFE IS WORTH LOSING (2006) – T...,https://scrapsfromtheloft.com/comedy/george-ca...,"Recorded on November 5, 2005, Beacon Theater, ...",4.080850,2.213609,3.0,4.0,5.0,17
