In [290]:
# import everything
import os
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from profanity_check import predict, predict_prob
from itertools import compress
from nltk.tokenize import sent_tokenize, word_tokenize
import string


# Location of the pickled transcripts DataFrame. The TRANSCRIPTS_PKL
# environment variable overrides it so the notebook can run on other machines;
# when unset, the original hard-coded path is used.
pickled_data = os.environ.get(
    'TRANSCRIPTS_PKL',
    '/Users/johnpapaioannou/Desktop/insight/project/nlp_models/full_transcripts.pkl',
)
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
specials = pd.read_pickle(pickled_data)

In [198]:
# load in pre-trained word2vec model
import gensim.downloader as api
# NOTE(review): w2v_path is not used anywhere in the visible cells; it looks
# like the location where gensim's downloader caches the model -- confirm.
w2v_path = '~/gensim-data/word2vec-google-news-300'
# Load the 'word2vec-google-news-300' vectors by name through gensim's downloader.
w2c_model = api.load('word2vec-google-news-300')

In [200]:
from gensim.test.utils import get_tmpfile
# w2v_model is a second name for the same object as w2c_model (no copy is made);
# later cells use the w2v_model name.
w2v_model = w2c_model
# Write the loaded vectors to the path returned by get_tmpfile('w2v').
# NOTE(review): temp_path is not read back in the visible cells.
temp_path = get_tmpfile('w2v')
w2v_model.save(temp_path)

# Helper functions for various tasks (clean-up, preprocessing, profanity filtering)


In [239]:

# collection of annoying punctuation specific to these comedy transcripts
def init_clean(single_special):
    """Strip transcript-specific markup from one raw transcript string.

    Removes parenthesised / bracketed spans that sit on a single line
    (e.g. "(laughs)", "[applause]"), turns hyphens into spaces and drops
    the musical-note character.

    Parameters
    ----------
    single_special : str
        Raw transcript text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string keeps the exact same pattern while avoiding the invalid
    # "\(" / "\[" escape sequences that newer Python versions warn about.
    # ".*?" is non-greedy and does not cross newlines.
    single_special = re.sub(r"[\(\[].*?[\)\]]", "", single_special)
    # Plain literal substitutions do not need the regex engine.
    single_special = single_special.replace("-", " ")
    return single_special.replace("♪", "")

# filter out sentences that have 2 or fewer words
def short_sents_filter(text):
    """Drop every sentence with fewer than three words.

    The previous version called ``text.remove`` while iterating over ``text``,
    which skips the element following each removal, so runs of short
    sentences were only partially filtered.

    Parameters
    ----------
    text : list of str
        Sentences to filter. The list is updated in place.

    Returns
    -------
    list of str
        The same list object, now holding only sentences of three or more
        whitespace-separated words.
    """
    # split() with no argument ignores repeated whitespace, so empty strings
    # produced by double spaces are not counted as words.
    text[:] = [sentence for sentence in text if len(sentence.split()) >= 3]
    return text


# lemmatizing + stemming helpers
stemmer = SnowballStemmer('english')
#comedy_cliche = ['like', 'know', 'say', 'look', 'come', 'right', 'go', 'think']

def lemmatize_stemming(text):
    """Lemmatize `text` as a verb with WordNet, then Snowball-stem the lemma."""
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize `text` with gensim's simple_preprocess and keep useful tokens.

    A token is kept when it is longer than two characters and is not in
    gensim's STOPWORDS set. Returns the kept tokens as a list, in order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    # (the comedy_cliche filter is intentionally disabled)
    return [
        tok
        for tok in gensim.utils.simple_preprocess(text)
        if len(tok) > 2 and tok not in stop_words
    ]

def profane_filter(text):
    """Join the entries of `text` that the profanity classifier does not flag.

    `predict` is called on the whole sequence; each entry is kept when
    ``1 - prediction`` is truthy. The kept entries are joined with single
    spaces and returned as one string.
    """
    keep_mask = 1 - predict(text)
    kept = [entry for entry, keep in zip(text, keep_mask) if keep]
    return " ".join(kept)


# strip punctuation (including the unicode ellipsis) but keep apostrophes
def punctuation_clean_up(spec_sents):
    """Remove punctuation from every sentence, keeping apostrophes.

    The previous version called ``str.translate`` but discarded its result,
    so only the handful of characters replaced explicitly were removed and
    ASCII punctuation such as ``;``, ``:`` or ``"`` survived.

    Parameters
    ----------
    spec_sents : list of str
        Sentences to clean. The list is updated in place.

    Returns
    -------
    list of str
        The same list object with, for every sentence: all ASCII punctuation
        except the apostrophe removed; the ellipsis, curly double quotes and
        en dash removed; and newlines turned into spaces.
    """
    # Apostrophes are kept so contractions stay intact for the word embeddings.
    ascii_punct = string.punctuation.replace("'", "")
    # Non-ASCII characters that were removed explicitly before.
    unicode_punct = '\u2026' + '”' + '“' + '–'
    # One table: newline -> space, everything in the third argument deleted.
    table = str.maketrans('\n', ' ', ascii_punct + unicode_punct)

    for i, sentence in enumerate(spec_sents):
        spec_sents[i] = sentence.translate(table)

    return spec_sents

In [451]:

# Row 1 of `specials`: column 0 is read as the comedian, column 1 as the
# transcript text (presumably -- verify against the pickled frame's columns).
comedian = specials.iloc[1, 0]

# initial cleaning of transcript-specific markup and unicode characters
special = init_clean(specials.iloc[1, 1])
print(len(special))

# gensim's string-level filters work on raw text, so no tokenization is needed
# first; the character count is printed after each step to show what was removed.
cleanup_steps = (
    remove_stopwords,
    gensim.parsing.preprocessing.strip_numeric,
    lambda raw: gensim.parsing.preprocessing.strip_short(raw, minsize=3),
)
for step in cleanup_steps:
    special = step(special)
    print(len(special))

# sentence tokenizing happens BEFORE punctuation is stripped, then each
# sentence is passed through punctuation_clean_up
spec_sents = punctuation_clean_up(sent_tokenize(special))
print(spec_sents)

50406
35676
35599
34051
['Okay ready and Take cue Adam', 'And action Dan', 'perfect day You sweetest smile You way Yeah style walkin’ talkin’ And jammin’ jawin’ world Then got hit electric car ‘Cause didn’t hear comin’ Fuck cars', 'You should one', 'electric car', 'But decency I’m driving head window go Here comes Sandman Sandman coming', 'Watch out How Tesla’s driving themselves', 'Those Teslas that’s pretty cool', 'They drive themselves', 'Holy shit man', 'got accident Tesla there didn’t know exchange information with started screaming Hey fucker', 'What fuck And Tesla’s know fucking windshield wipers flappin’ about', 'And like What fuck mean', 'You it And fucking lights going off', 'Like it’s saying Okay', 'All right And uh Then court I’m like I’m going fucking destroy thing And Tesla shows neck brace say Oh fucking break', 'What liar', 'He’s fucking liar Oh Grandma’s roommate Why talk much', 'Don’t know came nursing home grandma said hello polite Now you’re telling That grandma rud

In [452]:
# now word tokenize to prepare for tf-idf and word embedding
# Build a NEW list of token lists. The previous `spec_sents2 = spec_sents`
# only aliased the list, so the loop overwrote the sentence strings in
# `spec_sents` and the cell failed when run a second time.
spec_sents2 = [word_tokenize(sentence) for sentence in spec_sents]

# total number of tokens across all sentences
tot_words = sum(len(tokens) for tokens in spec_sents2)
print(tot_words)

6876


# word embedding -> sentence embedding


In [404]:

def word2sent_vec(w2v, special_sentences):
    """Average word vectors into one vector per tokenized sentence.

    Parameters
    ----------
    w2v : mapping-like
        Word-vector lookup supporting ``w2v[token]`` and raising ``KeyError``
        for unknown tokens (e.g. a gensim ``KeyedVectors``).
    special_sentences : sequence of sequence of str
        Tokenized sentences.

    Returns
    -------
    list of numpy.ndarray
        One float32 vector per sentence: the mean of the vectors of its
        in-vocabulary tokens. A sentence with no in-vocabulary tokens gets an
        all-zero vector (previously this was 0/0, i.e. a NaN vector).

    Notes
    -----
    Prints the number of out-of-vocabulary tokens that were skipped.
    """
    # Prefer the model's declared dimensionality; fall back to probing the
    # vector of 'word' for lookups that do not expose ``vector_size``.
    word_vec_dim = getattr(w2v, 'vector_size', None)
    if word_vec_dim is None:
        word_vec_dim = len(w2v['word'])

    sentence_vec = []
    errors = 0
    for tokens in special_sentences:
        sent_vec = np.zeros(word_vec_dim, dtype=np.float32)
        no_of_words = 0

        for token in tokens:
            try:
                word_vec = w2v[token]
            except KeyError:
                # Out-of-vocabulary token: count it and move on. Any other
                # exception is a real problem and is allowed to propagate.
                errors += 1
                continue
            sent_vec += word_vec
            no_of_words += 1

        # Only average when something was accumulated, avoiding 0/0 -> NaN.
        if no_of_words:
            sent_vec = sent_vec / no_of_words
        sentence_vec.append(sent_vec)

    print('errors: ', errors)
    return sentence_vec


# Playing around with tf-idf

In [327]:
# use gensim's tools to get tf-idf scores for words, treating each sentence as one document
from gensim import corpora, models

# build the token -> integer id dictionary from the tokenized sentences
dictionary = corpora.Dictionary(spec_sents2)
print(dictionary)
# No pruning (e.g. Dictionary.filter_extremes) is applied here: every token is kept.
# Convert every sentence into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, spec_sents2))


Dictionary(2575 unique tokens: ['Ali', 'Ladies', 'Wong', 'gentlemen', 'stage']...)


In [330]:
# fit a tf-idf model on the bag-of-words corpus and apply it to the same corpus
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

# show each tokenized sentence followed by its (token id, tf-idf weight) pairs
for i, tokens in enumerate(spec_sents2):
    print(tokens)
    print(corpus_tfidf[i])
    print('\n\n')


['Ladies', 'gentlemen', 'welcome', 'stage', 'Ali', 'Wong']
[(0, 0.2920157204642617), (1, 0.4387261467169068), (2, 0.38646683846203317), (3, 0.4387261467169068), (4, 0.40021283455813894), (5, 0.46929588235963543)]



['What', 'y', '’', 'all', 'thought', 'Y', '’', 'all', 'wasn', '’', 't', 'gon', '’', 'me']
[(6, 0.24098326132076706), (7, 0.3896210086773813), (8, 0.597947565635207), (9, 0.1944228325164284), (10, 0.1764867897106934), (11, 0.1125164252922164), (12, 0.2658682929076722), (13, 0.35058137344519075), (14, 0.35058137344519075), (15, 0.18566886496568982)]



['I', '’', 'm', 'Osirus', 'shit', 'Tang', 'forever', 'motherfuckers', 'It', '’', 's', 'like', 'ninety', 'seven', 'Aight', 'niggas', 'niggarettes', 'Let', '’', 's', 'like', 'I', '’', 'ma', 'rub', 'ass', 'moonshine', 'Let', '’', 's', 'seventy', 'bomb', 'atomically', 'Socrates', '’', 'philosophies', 'hypotheses', 'Can', '’', 't', 'define', 'How', 'droppin', '’', 'mockeries', 'Lyrically', 'perform', 'armed', 'robbery', 'Flee', 'lot

['line', 'holding', 'What']
[(6, 0.4570191727268126), (443, 0.6648694535347108), (705, 0.590831689668517)]



['peoples', 'Where', 'fuck', 'at']
[(116, 0.34924257863754815), (810, 0.46220345605982405), (811, 0.5247041171328989), (1534, 0.6237653212040049)]



['Niggas', 'strapped', 'And', 'trying', 'twist', 'beer', 'cap', 'It', '’', 's', 'court', 'adjourned', 'For', 'bad', 'seed', 'bad', 'sperm', 'Herb', 'got', 'wig', 'fried', 'like', 'bad', 'perm', 'What', 'blood', 'clot', 'smoke', 'pot', 'And', 'blow', 'spots', 'You', 'want', 'think', 'twice', 'think', 'The', 'Iron', 'Lung', 'ain', '’', 't', 'got', 'tell', 'Where', 'it', '’', 's', 'coming', 'Guns', 'Navarone', 'Tearing', 'battle', 'zone', 'Rip', 'slums', 'twist', 'darts', 'heart', 'Tried', 'true', 'Loot', 'voice', 'team', 'slang', 'rocks', 'Certified', 'chatterbox', 'Vocabulary', '‘', 'Donna', 'talking', 'Tell', 'story', 'walking', 'Take', 'cover', 'kid', 'what']
[(6, 0.0791867794330478), (11, 0.036972747830605966), (15, 0.0457579067


['want', 'lie', 'fuck', 'down']
[(116, 0.4127935806335448), (655, 0.5298807619557874), (760, 0.5895231270637016), (848, 0.448653898404913)]



['think', 'feminism', 'worst', 'thing', 'happened', 'women']
[(125, 0.3034188821928431), (286, 0.29977457968601473), (355, 0.3540746045944531), (677, 0.40155829947895966), (1700, 0.48762180274705913), (2008, 0.5419218276554975)]



['Our', 'job', 'job']
[(163, 0.8547802929685676), (920, 0.5189900295309819)]



['good']
[(199, 1.0)]



['smart', 'thing', 'continue', 'playing', 'dumb', 'century', 'like', 'We', '’', 're', 'dumb', 'women']
[(15, 0.04190469738288278), (41, 0.09000962978624681), (122, 0.1387127467842535), (286, 0.19457370231492757), (355, 0.22981804122216196), (706, 0.22981804122216196), (924, 0.6329981654171655), (982, 0.28125474380134846), (2009, 0.31649908270858274), (2010, 0.3517434216158171), (2011, 0.3517434216158171)]



['don', '’', 't', 'know', 'anything']
[(11, 0.29682586650463494), (15, 0.12245172556638582), (115, 0.378625

# routine for sentence similarity 'autocorrelation'


In [459]:
# Build one averaged word-vector per tokenized sentence in spec_sents2.
sent_vecs = word2sent_vec(w2v_model, spec_sents2)

def cos_sim(sent1, sent2):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    sent1, sent2 : array-like
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(sent1, sent2) / (|sent1| * |sent2|)``. When either vector has
        zero length the similarity is undefined; 0.0 is returned instead of
        dividing by zero (which previously produced NaN and a warning).
    """
    norm = np.linalg.norm(sent1) * np.linalg.norm(sent2)
    if norm == 0:
        return 0.0
    return np.dot(sent1, sent2) / norm


errors:  618
[ 0.17626953  0.09503174  0.08703613 -0.10314941 -0.01245117 -0.13623047
  0.07421875 -0.20605469  0.22558594  0.18481445  0.03173828 -0.01074219
 -0.05670166 -0.13232422  0.02966309  0.05444336 -0.01359558  0.06640625
  0.2265625   0.18920898  0.14404297  0.04663086 -0.11798096  0.09448242
 -0.04492188  0.15319824 -0.23486328  0.2006836   0.16351318  0.03601074
 -0.16040039 -0.01345825 -0.10290527  0.22509766  0.02441406 -0.13452148
  0.12915039  0.03851318 -0.08831787  0.11376953 -0.07458496 -0.01345062
  0.13208008  0.12039852 -0.21484375  0.0916748  -0.00611115  0.02709961
  0.16419983  0.15124512 -0.05786133  0.38378906 -0.0222168   0.20458984
  0.04443359  0.12182617 -0.09223175  0.18554688  0.09051514  0.01553345
 -0.0637207  -0.1706543  -0.2890625  -0.01391602  0.05932617  0.06689453
 -0.30688477  0.0980835  -0.11120605  0.08883667  0.06994629 -0.10693359
 -0.14550781 -0.1015625  -0.2397461  -0.01660156 -0.06658936  0.01281738
 -0.08776855 -0.35498047  0.3359375  -



In [446]:
# similarity adjacency routine / "auto-correlation" / joke coherence length detector

def sentence_sim_dist(sent_vecs, chunk_size):
    """Accumulate cosine similarity by sentence offset and normalise it.

    Every sentence that has `chunk_size` neighbours on both sides is compared
    (via `cos_sim`) with each sentence at offsets -chunk_size .. +chunk_size,
    itself included. The per-offset sums are divided by the sum at offset 0.

    Returns a numpy array of length ``2*chunk_size + 1``; index `chunk_size`
    is the zero-offset slot.
    """
    window = 2 * chunk_size + 1
    totals = np.zeros(window)
    n_sents = len(sent_vecs)

    # visit every sentence that has a full window around it
    for centre in range(chunk_size, n_sents - chunk_size):
        anchor = sent_vecs[centre]
        for offset in range(-chunk_size, chunk_size + 1):
            totals[offset + chunk_size] += cos_sim(anchor, sent_vecs[centre + offset])

    # normalise by the accumulated self-similarity (the middle slot)
    return totals / totals[chunk_size]

# compare each sentence with its 8 neighbours on either side
chunk_size = 8
sim_dist = sentence_sim_dist(sent_vecs, chunk_size)

# print slot index and normalised similarity; slot `chunk_size` is offset 0
for i, similarity in enumerate(sim_dist):
    print(i, similarity)

0 0.4210972731739153
1 0.4222640256896749
2 0.42419274690943076
3 0.42815181875359964
4 0.4315037802516595
5 0.44128574252743635
6 0.4545895643773861
7 0.48914211404191144
8 1.0
9 0.4893452214841378
10 0.45447602289708344
11 0.4410195282184755
12 0.43122311917712436
13 0.42795565810072805
14 0.42383711427097465
15 0.421975395027481
16 0.4204784464249993
