In [8]:
# import everything
import os
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from profanity_check import predict, predict_prob
from itertools import compress
from nltk.tokenize import sent_tokenize, word_tokenize
import string


In [5]:
#load in pre-trained word2vec model
import gensim.downloader as api
w2v_path = '~/gensim-data/word2vec-google-news-300'
w2v_model = api.load('word2vec-google-news-300')

In [113]:
# collection of annoying punctuation specific to these comedy transcripts
def init_clean(single_special):
    single_special = re.sub("[\(\[].*?[\)\]]", "", single_special)
    single_special = re.sub("-", " ", single_special)
    single_special = re.sub("♪", "", single_special)
    return single_special

# filter out sentences that are 2 or less words!
def short_sents_filter(text):
    for sentence in text:
        words = sentence.split(" ")
        if len(words) < 3:
            text.remove(sentence)
    return text


# lemmatizing
stemmer = SnowballStemmer('english')
#comedy_cliche = ['like', 'know', 'say', 'look', 'come', 'right', 'go', 'think']

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            #if token not in comedy_cliche:
            result.append(token)
    return result

def profane_filter(text):
    cleaned_up = predict(text)
    cleaned_up = 1 - cleaned_up
    clean_list = list(compress(text, cleaned_up))
    cleaned_up_final = " ".join(clean_list)
    return cleaned_up_final


# remove ellipsis ( why was this so difficult!!!!)
def punctuation_clean_up(spec_sents):
    for i in range(len(spec_sents)):
        spec_sents[i].translate(str.maketrans('', '', string.punctuation))
        spec_sents[i] = spec_sents[i].replace('\u2026', '')
        spec_sents[i] = spec_sents[i].replace('”', '')
        spec_sents[i] = spec_sents[i].replace('“', '')
        spec_sents[i] = spec_sents[i].replace('–', '')
        spec_sents[i] = spec_sents[i].replace('!', '')
        spec_sents[i] = spec_sents[i].replace('.', '')
        spec_sents[i] = spec_sents[i].replace('?', '')
        spec_sents[i] = spec_sents[i].replace(',', '')
        spec_sents[i] = spec_sents[i].replace('\n', ' ')
        spec_sents[i] = spec_sents[i].replace("'", '')
        spec_sents[i] = spec_sents[i].replace("’", '')        

    return spec_sents

def gensim_preprocess(transcript):
    # remove stop words here using gensim, which doesn't require tokenization first
    print('pre clean length: ', len(transcript))
    
    transcript = init_clean(transcript)
    print('after init clean: ', len(transcript))

    transcript = remove_stopwords(transcript)
    print('after stop word removal: ', len(transcript))

    transcript = gensim.parsing.preprocessing.strip_numeric(transcript)
    print('after removing numbers: ', len(transcript))

    transcript = gensim.parsing.preprocessing.strip_short(transcript, minsize=3)
    print('after stripping short sentences: ', len(transcript))

    
    #sentence tokenizing BEFORE stripping punctuation
    transcript_sentences = sent_tokenize(transcript)

    # clean up all punctuation, replace apostrophes with blank space
    transcript_sentences = punctuation_clean_up(transcript_sentences)
    
    t_sents = transcript_sentences
    for i in range(len(transcript_sentences)):
        t_sents[i] = word_tokenize(transcript_sentences[i])
        
    return t_sents

In [186]:
def word2sent_vec(w2v, special_sentences):
    
    no_of_sentences = len(special_sentences)
    word_vec_dim = len(w2v['word'])
    sentence_vec = np.zeros(word_vec_dim, dtype=np.float32)
    sentence_vec = list(np.zeros(no_of_sentences))
    errors = 0
    for i in range(no_of_sentences):
        no_of_words = 0
        sent_vec = np.zeros(word_vec_dim, dtype=np.float32)
    
        for j in range(len(special_sentences[i])):
            try:
                w2v_sents = w2v[special_sentences[i][j]]
                no_of_words += 1
                sent_vec += w2v_sents
            except:
                errors += 1
                pass
        if no_of_words == 0:
            ind_of_err = i
            
        else:
            sent_vec = sent_vec/no_of_words
            sentence_vec[i] = sent_vec
            
    print('errors: ', errors)
    
    # trim out error indices from zero word sentences
    #sentence_vec = np.delete(sentence_vec, no_of_words)
    
    # routine that checks for any zero'd elements and removes them
    print('shape before: ', np.shape(sentence_vec))
    
    err_flag = 0
    err_list = []
    for i in range(no_of_sentences):
        if np.shape(sentence_vec[i]) == ():
            err_flag = 1
            err_list.append(i)
    
    if err_flag == 1:
        err_array = np.asarray(err_list, dtype=int)
        sentence_vec_fix = np.delete(sentence_vec, err_array)
        return sentence_vec_fix
    else:
        return sentence_vec

def cos_sim(sent1, sent2):
    norm = np.linalg.norm(sent1) * np.linalg.norm(sent2)
    cosine_similarity = np.dot(sent1, sent2) / norm
    return cosine_similarity

def sentence_sim_dist(sent_vecs, chunk_size):

    sim_dist = np.zeros(chunk_size)
    end_index = len(sent_vecs)
    # scan through all sentences!

    for i in range(end_index - chunk_size):
        sent_mid = sent_vecs[i]
        #print('i: ',i)
        for j in range(chunk_size):
            k = i + j
            #print('j: ', j, ' k: ', k)
            #print('cos_sim: ', cos_sim(sent_mid, sent_vecs[k]))
            sim_dist[j] += cos_sim(sent_mid, sent_vecs[k])
    norm = sim_dist[0]
    sim_dist = sim_dist/norm
    return sim_dist

import math
# check for NaN's in w2v embedding, meaning I didn't catch something in preprocessing
# def check_for_nans(sentence_vecs):
#     for i in range(len(sentence_vecs)):
#         for j in range(np.shape(sentence_vecs[0])[0]):
#             if math.isnan(sentence_vecs[i][j]):
#                 #print('i: ', i)

In [202]:
pickled_data = '/Users/johnpapaioannou/Desktop/insight/project/nlp_models/full_transcripts.pkl'
specials = pd.read_pickle(pickled_data)

no_of_specials = len(specials.iloc[:,1])

print('# of specials: ', no_of_specials)

for i in range(no_of_specials):
    print('i: ', i, 'comedian: ', specials.iloc[i,0])
    
jeselnik = specials.iloc[7,1]
birbigs = specials.iloc[89,1]
hedberg = specials.iloc[91,1]
patrice = specials.iloc[99,1]
bargatze = specials.iloc[93,1]
maron = specials.iloc[82,1]
carr = specials.iloc[65,1]
demetri_martin = specials.iloc[34,1]
amy_schumer = specials.iloc[5,1]
colin_quinn = specials.iloc[24,1]
kevin_smith = specials.iloc[77,1]
stewart_lee = specials.iloc[117,1]
attell = specials.iloc[31,1]

special_sentences = gensim_preprocess(specials.iloc[119,1])

print('# of sentences: ', len(special_sentences))

#print(special_sentences)
sentence_vecs = word2sent_vec(w2v_model, special_sentences)
chunk_size = 6

sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
for j in range(chunk_size):
    print(i, sim_dist[j])
    

# print(np.shape(sentence_vecs))
# print(sentence_vecs[125])
# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])

# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])

# print('\nMaron')
# special_sentences = gensim_preprocess(specials.iloc[82,1])
# sentence_vecs = word2sent_vec(w2v_model, special_sentences)

# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])
    
# print('Jeselnik')
# special_sentences = gensim_preprocess(specials.iloc[7,1])
# sentence_vecs = word2sent_vec(w2v_model, special_sentences)

# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])

# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])

# print('\nCarr')
# special_sentences = gensim_preprocess(specials.iloc[65,1])
# sentence_vecs = word2sent_vec(w2v_model, special_sentences)

# chunk_size = 8
# sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
# for j in range(2*chunk_size + 1):
#     print(i, sim_dist[j])

# of specials:  126
i:  0 comedian:  adam devine
i:  1 comedian:  adam sandler
i:  2 comedian:  adel karam
i:  3 comedian:  al madrigal
i:  4 comedian:  ali wong
i:  5 comedian:  amy schumer
i:  6 comedian:  anjelah johnson
i:  7 comedian:  anthony jeselnik
i:  8 comedian:  ari shaffir
i:  9 comedian:  aziz ansari
i:  10 comedian:  bert kreischer
i:  11 comedian:  big jay oakerson
i:  12 comedian:  bill burr
i:  13 comedian:  bill hicks
i:  14 comedian:  bill maher
i:  15 comedian:  bo burnham
i:  16 comedian:  brad williams
i:  17 comedian:  brent morin
i:  18 comedian:  brian regan
i:  19 comedian:  bridget everett
i:  20 comedian:  cedric the entertainer
i:  21 comedian:  chelsea peretti
i:  22 comedian:  chris rock
i:  23 comedian:  chris tucker
i:  24 comedian:  colin quinn
i:  25 comedian:  craig ferguson
i:  26 comedian:  cristela alonzo
i:  27 comedian:  d l hughley
i:  28 comedian:  dana carvey
i:  29 comedian:  daniel sloss
i:  30 comedian:  daniel tosh
i:  31 comedian:  dave

In [84]:
for i in range(1):
    special_sentences = gensim_preprocess(specials.iloc[i,1])
    sentence_vecs = word2sent_vec(w2v_model, special_sentences)

    chunk_size = 8
    sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
    for j in range(2*chunk_size + 1):
        print(i, sim_dist[j])

pre clean length:  47948
after init clean:  44497
after stop word removal:  32527
after removing numbers:  32430
after stripping short sentences:  31461
errors:  628




KeyboardInterrupt: 