In [1]:
import os
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from profanity_check import predict, predict_prob
from itertools import compress

pickled_data = '/Users/johnpapaioannou/Desktop/insight/project/nlp_models/full_transcripts.pkl'
specials = pd.read_pickle(pickled_data)



In [2]:
# collection of annoying punctuation specific to these comedy transcripts
def init_clean(single_special):
    single_special = re.sub("[\(\[].*?[\)\]]", "", single_special)
    single_special = re.sub("-", " ", single_special)
    single_special = re.sub("♪", "", single_special)
    return single_special

In [64]:
# clean sentences before word embedding / tf-idf
# NOTE: scrub of profanity FIRST, then preprocess!

stemmer = SnowballStemmer('english')
#comedy_cliche = ['like', 'know', 'say', 'look', 'come', 'right', 'go', 'think']

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            #if token not in comedy_cliche:
            result.append(token)
    return result

def profane_filter(text):
    cleaned_up = predict(text)
    cleaned_up = 1 - cleaned_up
    clean_list = list(compress(text, cleaned_up))
    cleaned_up_final = " ".join(clean_list)
    return cleaned_up_final

In [140]:
from nltk.tokenize import sent_tokenize, word_tokenize

# test everything on Ali Wong's transcript
special = init_clean(specials.iloc[4,1])
comedian = specials.iloc[4,0]
spec_sents = sent_tokenize(special)
print(spec_sents)

['\nLadies and gentlemen, please welcome to the stage Ali Wong!', 'What y’all thought Y’all wasn’t gon’ see me?', 'I’m the Osirus of this shit\n Wu Tang is here forever, motherfuckers\n It’s like this ninety seven \n Aight my niggas and my niggarettes\n Let’s do it like this\n I’ma rub your ass in the moonshine\n Let’s take it back to seventy nine\n I bomb atomically\n Socrates’ philosophies and hypotheses\n Can’t define How I be droppin’ these mockeries\n Lyrically perform armed robbery \n Flee with the lottery Possibly they spotted me\n Battle scarred shogun\n Explosion when my pen hits \nOh, my goodness!', 'I heard a rumor that all of the Asians in this city… Have congregated in this theater tonight.', 'Yeah.', 'Thank you for coming with your white boyfriends.', 'I really… Appreciate it, from the bottom of my heart.', 'I’m so excited to be here.', 'I have not been performing that much at all, in the past two years, because two years ago, I gave birth to a baby girl.', 'And when I fi

In [141]:
# filter out sentences that are 2 or less words!
def short_sents_filter(text):
    for sentence in text:
        words = sentence.split(" ")
        if len(words) < 3:
            text.remove(sentence)
    return text

In [142]:
print('before filtering out short sentences: ', len(spec_sents))
spec_sents = short_sents_filter(spec_sents)
print('after filtering out short sentences: ', len(spec_sents))


before filtering out short sentences:  1015
after filtering out short sentences:  941


In [162]:
# remove ellipsis ( why was this so difficult!!!!)
import string
for i in range(len(spec_sents)):
    spec_sents[i].translate(str.maketrans('', '', string.punctuation))
    spec_sents[i] = spec_sents[i].replace('\u2026', '')
    spec_sents[i] = spec_sents[i].replace('”', '')
    spec_sents[i] = spec_sents[i].replace('“', '')

    print(spec_sents[i])


Ladies and gentlemen please welcome to the stage Ali Wong
What y’all thought Y’all wasn’t gon’ see me
I’m the Osirus of this shit
 Wu Tang is here forever motherfuckers
 It’s like this ninety seven 
 Aight my niggas and my niggarettes
 Let’s do it like this
 I’ma rub your ass in the moonshine
 Let’s take it back to seventy nine
 I bomb atomically
 Socrates’ philosophies and hypotheses
 Can’t define How I be droppin’ these mockeries
 Lyrically perform armed robbery 
 Flee with the lottery Possibly they spotted me
 Battle scarred shogun
 Explosion when my pen hits 
Oh my goodness
I heard a rumor that all of the Asians in this city Have congregated in this theater tonight
Thank you for coming with your white boyfriends
I really Appreciate it from the bottom of my heart
I’m so excited to be here
I have not been performing that much at all in the past two years because two years ago I gave birth to a baby girl
And when I first started to come back out to do stand up the other stand up comi

I’m– I’m very very scared of childbirth
That’s why I’m going to hire a doula
You know what that is
You know what a doula is
That’s a white hippie witch that blows quinoa into your pussy to Keyser Söze all the pain away
A lot of women tried to freak me out
They tried to freak me out about childbirth by saying Ali did you know that you’re gonna poop on the table I was like Yeah I look forward to it I’m all backed up from holding in my shit at work
I can’t wait to cleanse
It makes sense like that you– that that happens because when you’re in labor you push you push you push and your husband will be asked to assist in the labor by lifting up your leg which subsequently turns into a soft serve lever
You just shit on the floor in front of the love of your life
And just when you think that’s enough to make him finally leave you boom a baby comes out and he gotta stay
That’s the real miracle of life right there
I can already see how a child can really take its toll on a marriage because the ba

In [42]:
# tokenize into words, clean out profanity

from nltk.tokenize.treebank import TreebankWordDetokenizer
from sacremoses import MosesTokenizer, MosesDetokenizer

spec_words = [word_tokenize(sentences) for sentences in spec_sents]
print(spec_words)

[['Ladies', 'and', 'gentlemen', ',', 'please', 'welcome', 'to', 'the', 'stage', 'Ali', 'Wong', '!'], ['What', 'y', '’', 'all', 'thought', 'Y', '’', 'all', 'wasn', '’', 't', 'gon', '’', 'see', 'me', '?'], ['I', '’', 'm', 'the', 'Osirus', 'of', 'this', 'shit', 'Wu', 'Tang', 'is', 'here', 'forever', ',', 'motherfuckers', 'It', '’', 's', 'like', 'this', 'ninety', 'seven', 'Aight', 'my', 'niggas', 'and', 'my', 'niggarettes', 'Let', '’', 's', 'do', 'it', 'like', 'this', 'I', '’', 'ma', 'rub', 'your', 'ass', 'in', 'the', 'moonshine', 'Let', '’', 's', 'take', 'it', 'back', 'to', 'seventy', 'nine', 'I', 'bomb', 'atomically', 'Socrates', '’', 'philosophies', 'and', 'hypotheses', 'Can', '’', 't', 'define', 'How', 'I', 'be', 'droppin', '’', 'these', 'mockeries', 'Lyrically', 'perform', 'armed', 'robbery', 'Flee', 'with', 'the', 'lottery', 'Possibly', 'they', 'spotted', 'me', 'Battle', 'scarred', 'shogun', 'Explosion', 'when', 'my', 'pen', 'hits', 'Oh', ',', 'my', 'goodness', '!'], ['I', 'heard', '

In [8]:
# try soft-cosine similarity stuff
import gensim.downloader as api

word2vec_model300 = api.load('word2vec-google-news-300')




In [61]:
w2v_model = word2vec_model300


In [36]:
# playing around with w2v
word_vecs = w2v_model.wv
del w2v_model

  


In [167]:
w2v_model.similarity('Elon', 'Musk')

0.05558063