# Feature Engineering for Standup Scripts

## Imports

In [2]:
import pickle
import numpy as np
import pandas as pd
from datetime import date

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import re
from collections import Counter, defaultdict
import itertools
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
import gensim
from gensim.corpora.dictionary import Dictionary
import spacy

In [3]:
# English stopword list from the NLTK stopwords corpus.
sw = stopwords.words("english")

In [4]:
# Load the raw transcripts DataFrame; the cells below read its
# 'description' and 'transcript' columns.
transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')

In [5]:
# Load the pickled IMDb title results (snapshot dated 2022-05-23, per the
# file name). The path has no placeholders, so a plain string is used.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('../data/imdb_title_results_2022-05-23.pickle', 'rb') as file:
    show_meta = pickle.load(file)

## word lengths, sentence lengths, distinct words
- word lengths (letters per word)
    * tokenize words (allow apostrophes and dashes but not numbers)
    * do not lemmatize
    * do not remove stopwords
- sentence lengths (words per sentence)
    * tokenize sentences and then count whitespaces
    * do not remove stopwords
    * get arrays so we can do mean, median, boxplot values, standard deviation
- distinct words per total words
    * tokenize and lemmatize words (allow apostrophes and dashes but not numbers)

In [6]:
# Split every description on ':' and keep the (position, parts) pairs whose
# description contains no ':' at all, i.e. splitting produced a single part.
splits = transcripts_df['description'].str.split(':')
weird_labels = [entry for entry in enumerate(splits) if len(entry[1]) == 1]

In [7]:
# Map each description to its transcript text. A description that occurs
# more than once keeps only the transcript of its last occurrence.
transcripts_dict = {
    description: transcript
    for description, transcript in zip(
        transcripts_df['description'].values,
        transcripts_df['transcript'].values,
    )
}

In [8]:
# Descriptions in dictionary (insertion) order.
descriptions = list(transcripts_dict)

### word lengths

In [9]:
# Tokenize straight from the DataFrame column so every derived list lines up
# row-for-row with transcripts_df. Going through transcripts_dict would drop
# rows whenever two shows share a description, and the per-row column
# assignments further down would then fail on a length mismatch.
# Only runs of ASCII letters count as words here (no digits, apostrophes or
# hyphens).
bow_cased = [regexp_tokenize(transcript, r"[a-zA-Z]+") for transcript in transcripts_df['transcript']]

# Lower-case once and reuse the result for both the counters and the corpus.
tokenized_list = [[word.lower() for word in script_words] for script_words in bow_cased]
bow_counter = [Counter(script_tokens) for script_tokens in tokenized_list]

# gensim dictionary (token -> id) and bag-of-words corpus, one entry per transcript.
dictionary = Dictionary(tokenized_list)
corpus = [dictionary.doc2bow(script) for script in tokenized_list]

In [10]:
# Letters per word, one list of lengths per transcript.
word_lengths = [list(map(len, script_tokens)) for script_tokens in tokenized_list]

In [11]:
# Per-transcript summary statistics of word length, stored as new columns.
transcripts_df['mean word length'] = list(map(np.mean, word_lengths))
transcripts_df['std word length'] = list(map(np.std, word_lengths))

# NOTE: quantile/0.25 is float division, so the resulting column names are
# 'Q1.0 word length', 'Q2.0 word length' and 'Q3.0 word length'.
for quantile in (0.25, 0.50, 0.75):
    quartile_values = [np.quantile(lengths, quantile) for lengths in word_lengths]
    transcripts_df[f'Q{quantile/0.25} word length'] = quartile_values

transcripts_df['max word length'] = list(map(np.max, word_lengths))

### sentence lengths

In [12]:
# Sentence-level tokenization, taken straight from the DataFrame column so the
# resulting lists line up row-for-row with transcripts_df (going through
# transcripts_dict would drop rows whenever two shows share a description).
sent_tokenized_list = [sent_tokenize(transcript) for transcript in transcripts_df['transcript']]

# Word tokens of each sentence; a token is a run of word characters,
# apostrophes (straight or curly) and hyphens.
sent_words_tokenized_list = [
    [regexp_tokenize(sent, r"[’'\-\w]+") for sent in sent_script]
    for sent_script in sent_tokenized_list
]

# Words per sentence, and number of sentences per transcript.
sent_lengths = [[len(sent) for sent in script] for script in sent_words_tokenized_list]
sent_counts = [len(script) for script in sent_tokenized_list]

In [13]:
# Per-transcript summary statistics of sentence length (words per sentence),
# stored as new columns.
transcripts_df['mean sentence length'] = list(map(np.mean, sent_lengths))
transcripts_df['std sentence length'] = list(map(np.std, sent_lengths))

# NOTE: quantile/0.25 is float division, so the resulting column names are
# 'Q1.0 sentence length', 'Q2.0 sentence length' and 'Q3.0 sentence length'.
for quantile in (0.25, 0.50, 0.75):
    quartile_values = [np.quantile(lengths, quantile) for lengths in sent_lengths]
    transcripts_df[f'Q{quantile/0.25} sentence length'] = quartile_values

transcripts_df['max sentence length'] = list(map(np.max, sent_lengths))

## distinct words and distinct words per total words

In [14]:
from nltk.stem.wordnet import WordNetLemmatizer

In [15]:
# Count lemmas per transcript: each token is lower-cased, then lemmatized
# with the WordNet lemmatizer (default part of speech).
lemmatizer = WordNetLemmatizer()
lem_counter = [
    Counter(map(lemmatizer.lemmatize, map(str.lower, script_words)))
    for script_words in bow_cased
]

In [16]:
# Vocabulary size (distinct lemmas) and total lemma occurrences per transcript.
unique_word_counts = list(map(len, lem_counter))
total_word_counts = [np.sum(list(lemma_counts.values())) for lemma_counts in lem_counter]

# Distinct lemmas relative to total words and to the number of sentences.
unique_total_ratio = [
    n_unique / n_total
    for n_unique, n_total in zip(unique_word_counts, total_word_counts)
]
unique_per_sent = [
    n_unique / n_sentences
    for n_unique, n_sentences in zip(unique_word_counts, sent_counts)
]

In [17]:
# Attach the vocabulary features as new columns, in the order listed.
vocabulary_features = {
    'unique words': unique_word_counts,
    'total words': total_word_counts,
    'proportion unique words': unique_total_ratio,
    'unique words per sentence': unique_per_sent,
}
for column_name, column_values in vocabulary_features.items():
    transcripts_df[column_name] = column_values

## words per minute and sentences per minute

## repetition and phrases

In [18]:
from gensim.models import Phrases

In [19]:
# Tokenize each transcript (runs of word characters, apostrophes and hyphens)
# and lemmatize the lower-cased tokens.
tok_scripts = [regexp_tokenize(transcript, r"[’'\-\w]+") for transcript in transcripts_dict.values()]
docs = [[lemmatizer.lemmatize(tok.lower()) for tok in script] for script in tok_scripts]

# Learn phrases over all documents (min_count=5) and append every token that
# contains '_' to the end of its document.
# NOTE(review): a single Phrases pass merges word pairs only; the earlier
# comment mentioned trigrams -- confirm whether a second pass was intended.
# Since \w also matches '_', original tokens that already contain an
# underscore are appended as well.
ngram = Phrases(docs, min_count=5)
for doc in docs:
    merged_tokens = [token for token in ngram[doc] if '_' in token]
    doc.extend(merged_tokens)

## profanity

In [None]:
from profanityfilter import ProfanityFilter

In [None]:
# Profanity filter created with no arguments, i.e. the library's defaults.
pf = ProfanityFilter()