In [23]:
# import everything
import os
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS, remove_stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from profanity_check import predict, predict_prob
from itertools import compress
from nltk.tokenize import sent_tokenize, word_tokenize
import string


In [6]:
#load in pre-trained word2vec model
import gensim.downloader as api
w2v_path = '~/gensim-data/word2vec-google-news-300'
w2v_model = api.load('word2vec-google-news-300')

In [335]:
# collection of annoying punctuation specific to these comedy transcripts
def init_clean(single_special):
    single_special = re.sub("[\(\[].*?[\)\]]", "", single_special)
    single_special = re.sub("-", " ", single_special)
    single_special = re.sub("♪", "", single_special)
    return single_special

# filter out sentences that are 2 or less words!
def short_sents_filter(text):
    for sentence in text:
        words = sentence.split(" ")
        if len(words) < 3:
            text.remove(sentence)
    return text


# lemmatizing
stemmer = SnowballStemmer('english')
#comedy_cliche = ['like', 'know', 'say', 'look', 'come', 'right', 'go', 'think']

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            #if token not in comedy_cliche:
            result.append(token)
    return result

def profane_filter(text):
    cleaned_up = predict(text)
    cleaned_up = 1 - cleaned_up
    clean_list = list(compress(text, cleaned_up))
    cleaned_up_final = " ".join(clean_list)
    return cleaned_up_final


# remove ellipsis ( why was this so difficult!!!!)
def punctuation_clean_up(spec_sents):
    for i in range(len(spec_sents)):
        spec_sents[i].translate(str.maketrans('', '', string.punctuation))
        spec_sents[i] = spec_sents[i].replace('\u2026', '')
        spec_sents[i] = spec_sents[i].replace('”', '')
        spec_sents[i] = spec_sents[i].replace('“', '')
        spec_sents[i] = spec_sents[i].replace('–', '')
        spec_sents[i] = spec_sents[i].replace('!', '')
        spec_sents[i] = spec_sents[i].replace('.', '')
        spec_sents[i] = spec_sents[i].replace('?', '')
        spec_sents[i] = spec_sents[i].replace(',', '')
        spec_sents[i] = spec_sents[i].replace('\n', ' ')
        spec_sents[i] = spec_sents[i].replace("'", '')
        spec_sents[i] = spec_sents[i].replace("’", '')        

    return spec_sents

def gensim_preprocess(transcript):
    # remove stop words here using gensim, which doesn't require tokenization first
    #print('pre clean length: ', len(transcript))
    
    transcript = init_clean(transcript)
    #print('after init clean: ', len(transcript))

    transcript = remove_stopwords(transcript)
    #print('after stop word removal: ', len(transcript))

    transcript = gensim.parsing.preprocessing.strip_numeric(transcript)
    #print('after removing numbers: ', len(transcript))

    transcript = gensim.parsing.preprocessing.strip_short(transcript, minsize=3)
    #print('after stripping short sentences: ', len(transcript))

    
    #sentence tokenizing BEFORE stripping punctuation
    transcript_sentences = sent_tokenize(transcript)

    # clean up all punctuation, replace apostrophes with blank space
    transcript_sentences = punctuation_clean_up(transcript_sentences)
    
    t_sents = transcript_sentences
    for i in range(len(transcript_sentences)):
        t_sents[i] = word_tokenize(transcript_sentences[i])
        
    return t_sents

In [336]:
def word2sent_vec(w2v, special_sentences):
    
    no_of_sentences = len(special_sentences)
    word_vec_dim = len(w2v['word'])
    sentence_vec = np.zeros(word_vec_dim, dtype=np.float32)
    sentence_vec = list(np.zeros(no_of_sentences))
    errors = 0
    for i in range(no_of_sentences):
        no_of_words = 0
        sent_vec = np.zeros(word_vec_dim, dtype=np.float32)
    
        for j in range(len(special_sentences[i])):
            try:
                w2v_sents = w2v[special_sentences[i][j]]
                no_of_words += 1
                sent_vec += w2v_sents
            except:
                errors += 1
                pass
        if no_of_words == 0:
            ind_of_err = i
            
        else:
            sent_vec = sent_vec/no_of_words
            sentence_vec[i] = sent_vec
            
    print('errors: ', errors)
    
    # trim out error indices from zero word sentences
    #sentence_vec = np.delete(sentence_vec, no_of_words)
    
    # routine that checks for any zero'd elements and removes them
    #print('shape before: ', np.shape(sentence_vec))
    
    err_flag = 0
    err_list = []
    for i in range(no_of_sentences):
        if np.shape(sentence_vec[i]) == ():
            err_flag = 1
            err_list.append(i)
    
    if err_flag == 1:
        err_array = np.asarray(err_list, dtype=int)
        sentence_vec_fix = np.delete(sentence_vec, err_array)
        return sentence_vec_fix
    else:
        return sentence_vec

def cos_sim(sent1, sent2):
    norm = np.linalg.norm(sent1) * np.linalg.norm(sent2)
    cosine_similarity = np.dot(sent1, sent2) / norm
    return cosine_similarity

def sentence_sim_dist(sent_vecs, chunk_size):

    sim_dist = np.zeros(chunk_size)
    end_index = len(sent_vecs)
    # scan through all sentences!

    for i in range(end_index - chunk_size):
        sent_mid = sent_vecs[i]
        #print('i: ',i)
        for j in range(chunk_size):
            k = i + j
            #print('j: ', j, ' k: ', k)
            #print('cos_sim: ', cos_sim(sent_mid, sent_vecs[k]))
            sim_dist[j] += cos_sim(sent_mid, sent_vecs[k])
    norm = sim_dist[0]
    sim_dist = sim_dist/norm
    return sim_dist

def shuffler(sentence_vecs):
    sents_total = len(sentence_vecs)
    sent_dim = len(sentence_vecs[0])
    sentence_shuffle = np.zeros((sents_total, sent_dim), dtype=np.float32)
    rand_index = np.arange(sents_total)
    np.random.shuffle(rand_index)
    for i in range(sents_total):
        j = rand_index[i]
        sentence_shuffle[j] = sentence_vecs[i]
    return sentence_shuffle

In [337]:
# calculates the sentence vectors, measures sent2sent similarities and gets the baseline sent2sent similarity
# which is a measure of how the comedian's average style by shuffling the order of the sentences
# returns the array of sentence similarity adjusted by the baseline

def joke_length(special):
    
    chunk_size = 20
    special_sentences = gensim_preprocess(special)
    sentence_vecs = word2sent_vec(w2v_model, special_sentences)
    sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
#     for j in range(chunk_size):
#         print(i, sim_dist[j])

#     sentence_shuffle = shuffler(sentence_vecs)
#     sim_dist_shuffle = sentence_sim_dist(sentence_shuffle, chunk_size)
#     #print('baseline similarity: ', np.mean(sim_dist_shuffle[1:]))

    # ensemble average shuffled sentences to get a low variance baseline sentence2sentence similarity
    sim_dist_shuffle = np.zeros(chunk_size)
    ensemble_size = 10
    for i in range(ensemble_size):
        sentence_shuffle = shuffler(sentence_vecs)
        sim_dist_shuffle += sentence_sim_dist(sentence_shuffle, chunk_size)
    

    baseline_ensembled = np.mean(sim_dist_shuffle[1:])/ensemble_size
    print('baseline similarity: ', baseline_ensembled)
    baseline_sim = np.mean(sim_dist_shuffle[1:])

    sim_dist_adjusted = sim_dist - baseline_ensembled
    
    return sim_dist_adjusted


In [3]:
pickled_data = '/Users/johnpapaioannou/Desktop/insight/project/nlp_models/full_transcripts.pkl'
specials = pd.read_pickle(pickled_data)
print(specials.head())

       comedian                                         transcript
0   adam devine  \n[rock music playing]\n[indistinct chatter]\n...
1  adam sandler  \n[man] Okay, ready, and… Take your own cue, A...
2    adel karam  \nA NETFLIX COMEDY SPECIAL\nRecorded at the Ca...
3   al madrigal  \n[dog barks] [FisherGreen’s Sisters Brothers ...
4      ali wong  \nLadies and gentlemen, please welcome to the ...


In [327]:
#sim_dist_adjusted = joke_length(specials.iloc[7,1], 20)
s1 = joke_length(specials.iloc[7,1])[0]
s20 = joke_length(specials.iloc[7,1])[-1]
print('s1: ', 's20: ', s1, s20)

pre clean length:  94514
after init clean:  92429
after stop word removal:  62361
after removing numbers:  62262
after stripping short sentences:  59809
errors:  135
shape before:  (1785, 300)
0.3777849189333581
pre clean length:  94514
after init clean:  92429
after stop word removal:  62361
after removing numbers:  62262
after stripping short sentences:  59809
errors:  135
shape before:  (1785, 300)
0.37811070520508816
s1:  s20:  0.6222150810666419 0.003249604492868985


In [339]:
sim_dist = specials.transcript.map(joke_length)

errors:  85
baseline similarity:  0.43270611390807484
errors:  68
baseline similarity:  0.45999619088270666
errors:  47
baseline similarity:  0.31275833102821393
errors:  80
baseline similarity:  0.3616686354033089
errors:  91
baseline similarity:  0.3973392133294812
errors:  281
baseline similarity:  0.4218534621445659
errors:  109
baseline similarity:  0.3832003723078523
errors:  135
baseline similarity:  0.3777604131975894
errors:  190
baseline similarity:  0.38473355765521516
errors:  320
baseline similarity:  0.4086941596548284
errors:  74
baseline similarity:  0.4171354192413605
errors:  80
baseline similarity:  0.39846015682379926
errors:  446
baseline similarity:  0.4183855284147274
errors:  362
baseline similarity:  0.4000888384021226
errors:  248
baseline similarity:  0.37507689191878624
errors:  169
baseline similarity:  0.33660574279155353
errors:  71
baseline similarity:  0.44072297824543555
errors:  101
baseline similarity:  0.4038003454108437
errors:  114
baseline simila

In [342]:
specials['sim_dist'] = sim_dist

In [420]:
# use sim_dist to extract the S1 and S20 similarities, as head and tail respectively
sim_head_list = []
sim_tail_list = []
sim_ht_list = []
for i in range(126):
    sim_head_list.append(sim_dist[i][0])
    sim_tail_list.append(abs(sim_dist[i][-1]))
    sim_ht_list.append(abs(sim_dist[i][0]/sim_dist[i][-1]))
    
sim_head = pd.Series(sim_head_list)
sim_tail = pd.Series(sim_tail_list)
sim_ht = pd.Series(sim_ht_list)

In [430]:
# fold in similarity measures into the dataframe
specials['sim_head'] = sim_head
specials['sim_tail'] = sim_tail
specials['sim_ht'] = sim_ht

In [429]:
# exploratory analysis of new features!
i_head_min = sim_head.idxmin()
i_head_max = sim_head.idxmax()
i_tail_min = sim_tail.idxmin()
i_tail_max = sim_tail.idxmax()
i_ht_min = sim_ht.idxmin()
i_ht_max = sim_ht.idxmax()

head_mean = sim_head.mean()
tail_mean = sim_tail.mean()
ht_mean = sim_ht.mean()

print('head mean: ', head_mean)
print('tail mean: ', tail_mean)
print('min head comic: ', specials.iloc[i_head_min,0])
print('max head comic: ', specials.iloc[i_head_max,0])
print('min tail comic: ', specials.iloc[i_tail_min,0])
print('max tail comic: ', specials.iloc[i_tail_max,0])

print('\nH/T mean: ', ht_mean)
print('min H/T comic: ', specials.iloc[i_ht_min,0])
print('max H/T comic: ', specials.iloc[i_ht_max,0])

print('\nsummary statistics for short-range similarity\n')
print('1st third: ', np.quantile(sim_head, 0.33))
print('2nd third: ', np.quantile(sim_head, 0.67))

print('\nmax \ncomic: ', specials.iloc[sim_head.idxmax(),0], '  sim head: ', sim_head.max())
print('\nmin \ncomic: ', specials.iloc[sim_head.idxmin(),0], '  sim head: ', sim_head.min())

print('\nsummary statistics for long-range similarity\n')
print('1st third: ', np.quantile(sim_tail, 0.25))
print('2nd third: ', np.quantile(sim_tail, 0.62))

print('\nmax\\ncomic: ', specials.iloc[sim_tail.idxmax(),0], '  sim head: ', sim_tail.max())
print('\nmin\ncomic: ', specials.iloc[sim_tail.idxmin(),0], '  sim head: ', sim_tail.min())

print('\nsummary statistics for short/tail ratio similarity\n')
print('1st half: ', np.quantile(sim_ht, 0.5))
print('2nd third: ', np.quantile(sim_ht, 0.67))

print('\nmax\\ncomic: ', specials.iloc[sim_ht.idxmax(),0], '  sim head: ', sim_ht.max())
print('\nmin\ncomic: ', specials.iloc[sim_ht.idxmin(),0], '  sim head: ', sim_ht.min())

short_index = []
mid_index = []
long_index = []

form_factor = sim_ht
for i in range(len(sim_tail)):
    if form_factor[i] >= 77 :
        short_index.append(i)
#     if form_factor[i] > 60 and form_factor[i] < 120:
#         mid_index.append(i)
    if form_factor[i] < 77:
        long_index.append(i)
#     if sim[i] <= 0.0125:
#         clean_index.append(i)
short_comics = specials.iloc[short_index,0]
mid_comics = specials.iloc[mid_index,0]
long_comics = specials.iloc[long_index,0]
print('\n', len(short_comics), ' short comics: \n', list(short_comics))
print('\n', len(mid_comics), ' mid_comics: \n', list(mid_comics))
print('\n', len(long_comics), ' long comics: \n', list(long_comics))

head mean:  0.6118347466239409
tail mean:  0.00822266212477755
min head comic:  nikki glaser
max head comic:  judah friedlander
min tail comic:  mitch hedberg
max tail comic:  fred armisen

H/T mean:  219.87358023155969
min H/T comic:  fred armisen
max H/T comic:  mitch hedberg

summary statistics for short-range similarity

1st third:  0.5962200062712717
2nd third:  0.6265849577262826

max 
comic:  judah friedlander   sim head:  0.7195177800037722

min 
comic:  nikki glaser   sim head:  0.49944679293952177

summary statistics for long-range similarity

1st third:  0.005214894354035243
2nd third:  0.00946344501433552

max\ncomic:  fred armisen   sim head:  0.020080784200515678

min
comic:  mitch hedberg   sim head:  9.62989763982347e-05

summary statistics for short/tail ratio similarity

1st half:  77.42426274663654
2nd third:  101.02641496987955

max\ncomic:  mitch hedberg   sim head:  7061.832110319238

min
comic:  fred armisen   sim head:  33.277901838666104

 63  short comics: 
 [

In [1]:
# include new column to distinguish between short and long form comics
# spreading them into 2 evenly distributed groups, very fuzzy line that separates short from long form comics
# ideally, the metric would get fine-tuned for further differentiation into short form, mid form, and long form
short_long_div = 100

specials['joke_form'] = specials.apply(lambda row: 0 if (row.sim_ht > short_long_div) else 1, axis=1)
print(specials.head())

NameError: name 'specials' is not defined

In [440]:
# export dataframe with joke_form feature, will import into separate jupyter notebook for later trimming for deployment

data_path = '/Users/johnpapaioannou/Desktop/insight/project/data/'
file_path = data_path + 'joke_form.pkl'
specials.to_pickle(file_path)


In [320]:
no_of_specials = len(specials.iloc[:,1])
print('# of specials: ', no_of_specials)

# for i in range(no_of_specials):
#     print('i: ', i, 'comedian: ', specials.iloc[i,0])
    
jeselnik = specials.iloc[7,1]
birbigs = specials.iloc[89,1]
hedberg = specials.iloc[91,1]
patrice = specials.iloc[99,1]
bargatze = specials.iloc[93,1]
maron = specials.iloc[82,1]
carr = specials.iloc[65,1]
demetri_martin = specials.iloc[34,1]
amy_schumer = specials.iloc[5,1]
colin_quinn = specials.iloc[24,1]
kevin_smith = specials.iloc[77,1]
stewart_lee = specials.iloc[117,1]
attell = specials.iloc[31,1]


char_tot = 0
for i in range(no_of_specials):
    char_tot += len(specials.iloc[i,1])

print('total: ', char_tot, 'mean: ', char_tot/no_of_specials)

special_sentences = gensim_preprocess(specials.iloc[7,1])

print('# of sentences: ', len(special_sentences))

#print(special_sentences)
sentence_vecs = word2sent_vec(w2v_model, special_sentences)
chunk_size = 20

sim_dist = sentence_sim_dist(sentence_vecs, chunk_size)
for j in range(chunk_size):
    print(i, sim_dist[j])

sim_dist_shuffle = np.zeros(chunk_size)
ensemble_size = 10
for i in range(ensemble_size):
    sentence_shuffle = shuffler(sentence_vecs)
    sim_dist_shuffle += sentence_sim_dist(sentence_shuffle, chunk_size)
    
print('ensemble size: ', ensemble_size)

baseline_ensembled = np.mean(sim_dist_shuffle[1:])/ensemble_size
print(baseline_ensembled)

sim_dist_adjusted = sim_dist - baseline_ensembled

print(sim_dist_adjusted)

# of specials:  126
total:  11627863 mean:  92284.62698412698
pre clean length:  94514
after init clean:  92429
after stop word removal:  62361
after removing numbers:  62262
after stripping short sentences:  59809
# of sentences:  1785
errors:  135
shape before:  (1785, 300)
125 1.0
125 0.4637859738313628
125 0.4191660060992311
125 0.41029533751512604
125 0.4035567961775156
125 0.40087836517417913
125 0.39713082647933706
125 0.39178287376276116
125 0.3888707411193809
125 0.39231606604763625
125 0.39030795471254875
125 0.38910739667302036
125 0.3859207685206093
125 0.38402182368345833
125 0.3872987933615818
125 0.3863475426107288
125 0.38378192709016845
125 0.3830446343444706
125 0.3822209315308542
125 0.38136030969795714
ensemble size:  10
0.3778468022570286
[0.6221532  0.08593917 0.0413192  0.03244854 0.02570999 0.02303156
 0.01928402 0.01393607 0.01102394 0.01446926 0.01246115 0.01126059
 0.00807397 0.00617502 0.00945199 0.00850074 0.00593512 0.00519783
 0.00437413 0.00351351]
