# Feature Engineering: Words & Sentences

## imports

In [1]:
import pickle
import numpy as np
import pandas as pd
from datetime import date
import json
from tqdm.notebook import tqdm

import re
from collections import Counter, defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases

In [2]:
# NLTK's English stop-word list. The feature sections in this notebook
# explicitly do not remove stopwords, so no later cell shown here reads it.
sw = stopwords.words("english")

In [3]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so only point this at trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw transcripts frame (read through pandas' own unpickler).
transcripts_df = pd.read_pickle('../data/transcripts_raw_df.pickle')

# Dated snapshots: IMDb title results and the combined metadata + transcripts frame.
show_meta = _load_pickle('../data/imdb_title_results_2022-05-23.pickle')
metascripts = _load_pickle('../data/metascripts_df_2022-05-28.pickle')

# Pattern / replacement pairs used by the profanity clean-up cell below.
to_replace = _load_pickle("../data/profane_to_replace.pickle")
replace_with = _load_pickle("../data/profane_replace_with.pickle")

## prepare the data

In [4]:
# Descriptions of transcripts that are Italian or Spanish translations;
# the next cell filters these rows out of metascripts.
nonenglish_shows = [
    'BILL BURR: WHY DO I DO THIS (2008) – Testo italiano completo',
    'DOUG STANHOPE: NO REFUNDS (2007) – Trascrizione italiana',
    'GEORGE CARLIN: JAMMING IN NEW YORK (1992) – Testo italiano completo',
    'GEORGE CARLIN: YOU ARE ALL DISEASED (1999) – Testo italiano completo',
    'GEORGE CARLIN: IT’S BAD FOR YA! (2008) – Testo italiano completo',
    'DAVE CHAPPELLE: THE BIRD REVELATION (2017) – Transcripción completa',
]

In [5]:
# Drop the non-English transcripts. The explicit .copy() makes the result an
# independent DataFrame, so the column assignments in later cells write to this
# frame directly instead of to a filtered slice (which raises pandas'
# SettingWithCopyWarning and can leave the assignment's target ambiguous).
metascripts = metascripts[~metascripts['description'].isin(nonenglish_shows)].copy()

In [6]:
# Clean the transcripts in one chained pass:
#   1. remove bracketed [...] and parenthetical (...) content (non-greedy; '.' does
#      not cross newlines, so only spans that open and close on one line are removed)
#   2. normalize curly single and double quotes to their straight ASCII forms
#   3. apply the two loaded pattern/replacement pairs, which replace censored
#      profanity with the actual profanity
# The patterns are raw strings so the regex backslashes are not parsed as
# (invalid) Python string escapes; the compiled patterns are unchanged.
metascripts['transcript'] = (metascripts['transcript']
                                 .replace(r"\[.+?\]|\(.+?\)", "", regex = True)
                                 .replace(r"\’|\‘", "'", regex = True)
                                 .replace(r"\“|\”", '"', regex = True)
                                 .replace(to_replace[0], replace_with[0], regex = True)
                                 .replace(to_replace[1], replace_with[1], regex = True)
                            )

# Fill censored words to clean up our profanity detection.
# The context manager closes the JSON file as soon as it has been parsed.
with open('../data/profanity_fill.json') as fill_file:
    profanity_fill = json.load(fill_file)

# Literal (non-regex) substring replacement, applied one mapping entry at a time.
for key, value in profanity_fill.items():
    metascripts['transcript'] = metascripts['transcript'].str.replace(key, value, regex = False)

## prepare lists and dictionaries for streamlined work

In [7]:
# Map each show description to its cleaned transcript. Dict insertion order
# follows the row order of metascripts, and every per-show list built later in
# the notebook is assigned back to metascripts by position.
transcripts_dict = dict(zip(metascripts['description'].values, metascripts['transcript'].values))

# Duplicate descriptions would collapse into a single dict entry, leaving fewer
# transcripts than rows; the later positional column assignments would then fail
# with an opaque length-mismatch error. Fail here with a clear message instead.
if len(transcripts_dict) != len(metascripts):
    raise ValueError(
        f"{len(metascripts) - len(transcripts_dict)} duplicate description(s) in metascripts; "
        "per-show features cannot be aligned with rows by position"
    )

descriptions = list(transcripts_dict.keys())
scripts = list(transcripts_dict.values())

## word lengths
word lengths are calculated as letters per word

   * tokenize words as runs of ASCII letters only (apostrophes, dashes, and digits are dropped and split a word, e.g. "don't" becomes "don" and "t")
   * do not lemmatize
   * do not remove stopwords

In [8]:
# Per-show word tokens: maximal runs of ASCII letters, original casing kept.
bow_cased = [regexp_tokenize(transcript, r"[a-zA-Z]+") for transcript in transcripts_dict.values()]

# Lower-cased copy of the tokens, and a per-show frequency table built from it.
tokenized_list = [list(map(str.lower, script_words)) for script_words in bow_cased]
bow_counter = [Counter(tokens) for tokens in tokenized_list]

# gensim vocabulary over all shows, plus each show's bag-of-words vector.
dictionary = Dictionary(tokenized_list)
corpus = list(map(dictionary.doc2bow, tokenized_list))

In [9]:
# Letters per token, one list of lengths per show.
word_lengths = [list(map(len, tokens)) for tokens in tokenized_list]

In [10]:
# Summary statistics of each show's word-length distribution, one column per statistic.
metascripts['mean word length'] = list(map(np.mean, word_lengths))
metascripts['std word length'] = list(map(np.std, word_lengths))

# quantile/0.25 is a float, so the quartile columns are named
# 'Q1.0 word length', 'Q2.0 word length' and 'Q3.0 word length'.
for quantile in (0.25, 0.50, 0.75):
    quartile_column = f'Q{quantile/0.25} word length'
    metascripts[quartile_column] = [np.quantile(lengths, quantile) for lengths in word_lengths]

metascripts['max word length'] = list(map(np.max, word_lengths))

## sentence lengths
sentence lengths are calculated as words per sentence

   * tokenize sentences, then count the word tokens in each sentence (a token is a run of letters, digits, underscores, apostrophes, or dashes)
   * do not remove stopwords
   * get arrays so we can do mean, median, boxplot values, standard deviation

In [11]:
# Split every transcript into sentences.
sent_tokenized_list = [sent_tokenize(transcript) for transcript in transcripts_dict.values()]

# Word tokens inside each sentence: runs of word characters, apostrophes and dashes.
sent_words_tokenized_list = [
    [regexp_tokenize(sentence, r"['\-\w]+") for sentence in sentences]
    for sentences in sent_tokenized_list
]

# Words per sentence for each show, and the number of sentences per show.
sent_lengths = [list(map(len, sentences)) for sentences in sent_words_tokenized_list]
sent_counts = list(map(len, sent_tokenized_list))

In [12]:
# Summary statistics of each show's sentence-length distribution, one column per statistic.
metascripts['mean sentence length'] = list(map(np.mean, sent_lengths))
metascripts['std sentence length'] = list(map(np.std, sent_lengths))

# quantile/0.25 is a float, so the quartile columns are named
# 'Q1.0 sentence length', 'Q2.0 sentence length' and 'Q3.0 sentence length'.
for quantile in (0.25, 0.50, 0.75):
    quartile_column = f'Q{quantile/0.25} sentence length'
    metascripts[quartile_column] = [np.quantile(lengths, quantile) for lengths in sent_lengths]

metascripts['max sentence length'] = list(map(np.max, sent_lengths))

## distinct words
count distinct words in each show and normalize by determining the proportion of distinct words and distinct words per sentence

   * tokenize: runs of ASCII letters only (apostrophes, dashes, and numbers are excluded), then lowercase
   * lemmatize

In [13]:
from nltk.stem.wordnet import WordNetLemmatizer

In [14]:
# Count lemmas per show: each letters-only token is lower-cased and then
# passed through the WordNet lemmatizer with its default arguments.
lemmatizer = WordNetLemmatizer()
lem_counter = [
    Counter(map(lemmatizer.lemmatize, map(str.lower, script_words)))
    for script_words in bow_cased
]

In [15]:
# Distinct lemmas and total token count for each show.
unique_word_counts = list(map(len, lem_counter))
total_word_counts = [np.sum(list(lemma_counts.values())) for lemma_counts in lem_counter]

# Normalized vocabulary measures: distinct lemmas per token and per sentence.
unique_total_ratio = [
    n_unique/n_total
    for n_unique, n_total in zip(unique_word_counts, total_word_counts)
]
unique_per_sent = [
    n_unique/n_sentences
    for n_unique, n_sentences in zip(unique_word_counts, sent_counts)
]

In [16]:
# Attach the vocabulary features to the frame, in this column order.
vocabulary_features = {
    'unique words': unique_word_counts,
    'total words': total_word_counts,
    'proportion unique words': unique_total_ratio,
    'unique words per sentence': unique_per_sent,
}
for column, values in vocabulary_features.items():
    metascripts[column] = values

## words per minute and sentences per minute

In [17]:
# Word tokens for the pacing metrics: runs of word characters, apostrophes and
# hyphens. NOTE(review): this tokenizer differs from the letters-only one behind
# 'total words' -- confirm the difference is intentional.
word_tok_scripts = [regexp_tokenize(script, r"[\w'-]+") for script in scripts]
runtime_minutes = metascripts['runtimeMins'].values
words_per_minute = [len(script_words)/minutes for script_words, minutes in zip(word_tok_scripts, runtime_minutes)]

# Reuse the sentence split computed in the sentence-length section rather than
# running sent_tokenize over every transcript a second time. `scripts` and
# `sent_tokenized_list` are both derived from transcripts_dict in the same
# order, so the result is identical.
sent_tok_scripts = sent_tokenized_list
sent_per_minute = [len(script_sentences)/minutes for script_sentences, minutes in zip(sent_tok_scripts, runtime_minutes)]

In [18]:
# Attach the pacing features to the frame, in this column order.
pacing_features = {
    'words per minute': words_per_minute,
    'sentences per minute': sent_per_minute,
}
for column, values in pacing_features.items():
    metascripts[column] = values

## pickle updated metascripts_df

In [20]:
# Persist the frame, now carrying the word and sentence feature columns.
with open('../data/metascript_df_ws.pickle', 'wb') as out_file:
    pickle.dump(metascripts, out_file)

## quick facts

Total Words: 3,011,640

In [22]:
# Corpus-wide token count: the sum of the per-show 'total words' column.
metascripts['total words'].sum()

3011640

Unique Words (distinct lowercased tokens across all shows, before lemmatization): 38,783

In [24]:
# Size of the gensim vocabulary built from the lower-cased, unlemmatized tokens.
len(dictionary)

38783

In [31]:
import plotly.express as px

# Distribution of per-show word totals; every show is drawn as a point and
# its description appears in the hover tooltip.
fig = px.box(
    metascripts,
    x='total words',
    hover_data=['description'],
    points='all',
)
fig