In [2]:
# Basic functionalities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
import pickle

# options
pd.set_option('max_colwidth',150)

In [2]:
# Web crawling
import requests
from bs4 import BeautifulSoup
import pickle

In [5]:
# Text encoding
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Visualization
from wordcloud import WordCloud

In [5]:
# Text Processing
from nltk import word_tokenize, pos_tag
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# nltk.download('wordnet')
# nltk.set_proxy('SYSTEM PROXY')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [6]:
# Sentiment analysis
from textblob import TextBlob as tb

In [7]:
# Topic Modeling
from gensim import matutils, models
from gensim.models import LsiModel
import scipy.sparse



In [8]:
# Keyword Extraction
from summa.summarizer import summarize
from summa import keywords

In [9]:
# Progress bar
from tqdm import tqdm 

# Retrieve Data

## Data from google drive

In [33]:
df = pd.read_csv('transcript_paragraph_needed.csv')
df = df[["comedian","transcript"]]
df.columns = ["Comedian","Transcript"]
df

Unnamed: 0,Comedian,Transcript
0,andy woodhull,she told me she painted some clear cuz it makes her fingernail shiny and she likes them when to be shiny and i support her in that decision i want...
1,andy woodhull,we were on a road trip one time and she fell asleep i said honey you got to wake up im sleepy too and we need to talk to me she said i am not asle...
2,andy woodhull,i mention my girls already theyre my theyre my stepdaughters im a stepdad stepdad took over the lease on a couple of girls few years back thank yo...
3,andy woodhull,coopers you love the old bo you love the old bum forever cuz thats the boat let you know that you enjoy being the captain of a ship but if there w...
4,andy woodhull,my girls are teenagers now and got a couple teenagers at home and theyre really embracing it and every time i tell people i have teenage daughte...
...,...,...
2835,dave chappelle,lets not forget lets not forget ive never met bill cosby so im not defending him lets just remember that he has a valuable legacy that i cant just...
2836,dave chappelle,ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghet...
2837,dave chappelle,ahah ahah in every ghetto ahah revolution ahah ahah ahah in every ghetto one last thing before you go i just wanted to ackn...
2838,dave chappelle,can i kick it yes you can can i kick it yes you can can i kick it yes you can well im gone go on then can i kick...


In [34]:
len(df.Comedian.unique())

40

In [35]:
# get corpus
corpus = df[['Comedian','Transcript']].set_index('Comedian')
corpus

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
andy woodhull,she told me she painted some clear cuz it makes her fingernail shiny and she likes them when to be shiny and i support her in that decision i want...
andy woodhull,we were on a road trip one time and she fell asleep i said honey you got to wake up im sleepy too and we need to talk to me she said i am not asle...
andy woodhull,i mention my girls already theyre my theyre my stepdaughters im a stepdad stepdad took over the lease on a couple of girls few years back thank yo...
andy woodhull,coopers you love the old bo you love the old bum forever cuz thats the boat let you know that you enjoy being the captain of a ship but if there w...
andy woodhull,my girls are teenagers now and got a couple teenagers at home and theyre really embracing it and every time i tell people i have teenage daughte...
...,...
dave chappelle,lets not forget lets not forget ive never met bill cosby so im not defending him lets just remember that he has a valuable legacy that i cant just...
dave chappelle,ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghet...
dave chappelle,ahah ahah in every ghetto ahah revolution ahah ahah ahah in every ghetto one last thing before you go i just wanted to ackn...
dave chappelle,can i kick it yes you can can i kick it yes you can can i kick it yes you can well im gone go on then can i kick...


#### Clean the data

In [36]:
def apply_data_cleansing(text):
    # Lowercase every words
    text = text.lower()
    # Remove every words with [blah blah blah] format
    text = re.sub('\[.*?\]', '', text)
    # Remove every words with (blah blah blah) format
    text = re.sub('\(.*?\)', '', text)
    # Get rid of the punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Get rid of all the numbers or words that contain numbers
    text = re.sub('\w*\d\w*', '', text)
    # Get rid of these specific punctuations
    text = re.sub('[‘’“”…]', '', text)
    # Get rid of '\n'
    text = re.sub('\n', '', text)
    
    # Tokenizes and lemmetizes (or stems) them
    tokenized = word_tokenize(text)
    stemmed = [porter_stemmer.stem(t) for t in tokenized]
#     lemmetized = [wordnet_lemmatizer.lemmatize(t) for t in tokenized]
    return " ".join(lemmetized)

In [39]:
clean_corpus = pd.DataFrame(corpus.Transcript.apply(apply_data_cleansing))
clean_corpus_index = clean_corpus.reset_index()
clean_corpus_index

Unnamed: 0,Comedian,Transcript
0,andy woodhull,she told me she painted some clear cuz it make her fingernail shiny and she like them when to be shiny and i support her in that decision i want t...
1,andy woodhull,we were on a road trip one time and she fell asleep i said honey you got to wake up im sleepy too and we need to talk to me she said i am not asle...
2,andy woodhull,i mention my girl already theyre my theyre my stepdaughter im a stepdad stepdad took over the lease on a couple of girl few year back thank you ve...
3,andy woodhull,cooper you love the old bo you love the old bum forever cuz thats the boat let you know that you enjoy being the captain of a ship but if there wa...
4,andy woodhull,my girl are teenager now and got a couple teenager at home and theyre really embracing it and every time i tell people i have teenage daughter at ...
...,...,...
2835,dave chappelle,let not forget let not forget ive never met bill cosby so im not defending him let just remember that he ha a valuable legacy that i cant just thr...
2836,dave chappelle,ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghetto ahah ahah ahah in every ghetto wait wait wait wait wait i ...
2837,dave chappelle,ahah ahah in every ghetto ahah revolution ahah ahah ahah in every ghetto one last thing before you go i just wanted to acknowledge for the real co...
2838,dave chappelle,can i kick it yes you can can i kick it yes you can can i kick it yes you can well im gone go on then can i kick it to my tribe that flow in layer...


In [40]:
df = pd.DataFrame(clean_corpus_index.Comedian)

In [41]:
# Pickles the result for later usage
clean_corpus_index.to_pickle('clean_corpus_index.pkl')

#### Retrieve only NOUNS of all transcripts

In [42]:
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [43]:
corpus = pd.read_pickle('clean_corpus_index.pkl')

In [44]:
corpus.index = corpus.Comedian
# Apply the nouns function to the transcripts to filter only on nouns
corpus = pd.DataFrame(corpus.Transcript.apply(nouns))
corpus_nouns_index = corpus.reset_index()
corpus_nouns_index.to_pickle('corpus_nouns_index.pkl')

#### Retrieve NOUNS and ADJECTIVES of all transcripts

In [46]:
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [47]:
corpus = pd.read_pickle('clean_corpus_index.pkl')
corpus.index = corpus.Comedian
# Apply the nouns function to the transcripts to filter only on nouns
corpus = pd.DataFrame(corpus.Transcript.apply(nouns_adj))
corpus_na_index = corpus.reset_index()
corpus_na_index.to_pickle('corpus_na_index.pkl')

# Analyize the transcripts

#### Text Encoding

In [9]:
def dtm(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    cv = CountVectorizer(stop_words=stop_words)
    data_cv = cv.fit_transform(corpus.Transcript)
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = corpus.index
    if save_pickle:
        pickle.dump(cv, open(pickle_filename, "wb"))        
    return data_dtm.transpose()

def tf_idf(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    vectorizer = TfidfVectorizer(stop_words='english')
    data_v = vectorizer.fit_transform(corpus.Transcript)
    data_tfidf = pd.DataFrame(data_v.toarray(), columns=vectorizer.get_feature_names())
    data_tfidf.index = corpus.index
    if save_pickle:
        pickle.dump(vectorizer, open(pickle_filename, "wb"))        
    return data_tfidf.transpose()

#### Top words

In [23]:
def get_top_words(matrix_dataframe, number_of_words = 30):
    top_dict = {}
    for c in matrix_dataframe.columns:
        top = matrix_dataframe[c].sort_values(ascending=False).head(number_of_words)
        top_dict[c]= list(zip(top.index, top.values))
    return top_dict

In [24]:
# Get all kinds of corpus
clean_corpus = pd.read_pickle('clean_corpus_index.pkl')
corpus_n = pd.read_pickle('corpus_nouns_index.pkl')
corpus_na = pd.read_pickle('corpus_na_index.pkl')

all_corpuses = [clean_corpus, corpus_n, corpus_na]

In [30]:
df_dtms = []
dtm_tops = []
for corpus in all_corpuses:
    # Document-Term Matrix
    df_dtm = dtm(corpus)
    # Get top words
    dtm_top = get_top_words(df_dtm)
    
    df_dtms.append(df_dtm)
    dtm_tops.append(dtm_top)

In [31]:
# Have a glimps on the top words of each transcript

df = pd.DataFrame(index=df_dtms[0].columns)
for i, corpus in enumerate(all_corpuses):
    dtm_top = dtm_tops[i]
    df[f'corpus{i}_DTM'] = [[w[0] for w in words] for words in dtm_top.values()]
    
df

Unnamed: 0,corpus0_DTM,corpus1_DTM,corpus2_DTM
0,"[shower, im, just, like, dont, know, shiny, said, wife, wa, late, rinsing, told, youre, thats, want, popcorn, netflix, movie, start, support, home...","[shower, wife, im, movie, youre, dont, grate, cuz, screen, colander, bunch, support, wa, id, forshes, sink, napkin, hour, thing, fingernail, pesti...","[shower, im, dont, shiny, wife, youre, popcorn, late, movie, netflix, wa, forshes, experience, extra, sink, water, hour, result, pesticide, finger..."
1,"[youre, gummy, said, just, asleep, bear, mouth, im, away, need, honey, trip, na, trying, road, awake, time, mile, ive, wake, kept, alert, airing, ...","[gummy, mouth, youre, gon, throat, mile, sleepy, bear, eye, road, flavor, time, trip, thats, mind, honey, fingernail, finisce, finn, finito, fin, ...","[youre, mouth, gummy, eye, mile, bear, thats, open, im, mind, asleep, gon, red, yellow, honey, sleepy, gross, road, alert, throat, ive, flavor, tr..."
2,"[think, love, boat, year, im, na, stepdad, dont, gon, girl, course, hand, exactly, theyre, step, just, amazing, wa, wheres, probably, right, best,...","[boat, year, stepdad, girl, step, hand, dont, course, gon, chance, life, parent, care, kid, yeah, lease, way, mention, purpose, stepdaughter, thin...","[boat, year, stepdad, step, course, girl, dont, hand, best, gon, good, parent, thats, human, judgmental, stuff, congratulation, genitals, person, ..."
3,"[im, big, responsibility, emergency, lot, thats, love, deal, number, pressure, list, know, boat, time, line, old, form, walk, terrifying, writing,...","[thats, lot, emergency, responsibility, form, boat, line, list, number, time, deal, pressure, harbor, stuff, stepdad, hurricane, cooper, candidate...","[im, emergency, thats, responsibility, lot, big, time, list, deal, old, boat, pressure, line, number, form, shoulder, contact, stepparent, stuff, ..."
4,"[theyre, just, teenager, na, home, gon, know, like, difficult, really, said, good, cuz, sure, aha, ride, tough, couple, think, luck, talk, languag...","[theyre, home, language, ride, time, couple, teenager, day, girl, mall, somebody, daughter, luck, people, finita, finir, finish, finiscono, finito...","[theyre, home, good, difficult, couple, luck, people, day, girl, mall, language, teenager, daughter, ride, tough, somebody, sure, time, gon, foods..."
...,...,...,...
2835,"[remember, im, black, rape, just, television, let, heard, wa, night, make, cosby, ive, forget, save, clinical, image, luther, college, guy, man, d...","[television, im, night, bowl, shit, cartoon, man, learning, lip, kicker, step, point, let, dream, institution, thousand, pa, number, college, thin...","[television, im, black, night, ive, thank, step, institution, pa, learning, responsible, cartoon, stood, dollar, number, dream, thursday, lincoln,..."
2836,"[ahah, wa, time, wait, ghetto, chris, seeing, like, juice, guy, familiar, tucker, picture, good, fourth, face, thats, admired, long, drinking, mic...","[ahah, time, wait, ghetto, juice, picture, face, wa, tucker, ready, night, thats, story, derby, simpson, michael, jordan, reason, career, room, cr...","[ahah, time, wait, ghetto, juice, face, familiar, good, wa, picture, tucker, chris, fourth, simpson, thats, oj, room, ive, career, crowd, right, h..."
2837,"[ahah, acknowledge, just, fan, phife, thank, everybody, comedy, ghetto, hand, shouting, music, thing, right, man, rap, peace, revolution, sinceres...","[ahah, comedy, ghetto, fan, everybody, hand, phife, fucking, man, garry, dawg, finger, peace, forever, friend, juggernaut, thing, hiphop, building...","[ahah, ghetto, fan, comedy, phife, everybody, hand, peace, good, man, music, im, family, night, fucking, garry, building, week, real, juggernaut, ..."
2838,"[kick, yes, im, rhythm, like, youll, really, tribe, recipe, studio, dinkins, lot, ha, breath, minor, flavor, favor, feel, player, fresh, right, sa...","[tribe, rhythm, favor, drop, garment, track, player, note, breath, studio, game, biatch, instruct, obeyer, right, dinkins, boy, flow, behavior, ma...","[rhythm, tribe, poem, dinkins, doesnt, boy, big, mr, minor, garment, free, rich, biatch, funky, im, flow, savior, mayor, savor, note, quest, game,..."


Since there are too many words that are common among all the transcript, we need to add them to the stop-word list and redo the text encoding again.

#### Extract all common words among the transcript

In [32]:
commons = []
for c in df.columns:
    words = []
    for r in df[c]:
        top = [word for word in r]
        for t in top:
            words.append(t)    
    commons.append(Counter(words).most_common())

In [33]:
common_words_list = []
for common in commons:
    new = [word for word, count in common if count > 6]
    common_words_list.append(np.array(new))
common_words_list

[array(['like', 'dont', 'im', ..., 'hasan', 'est', 'fue'], dtype='<U14'),
 array(['im', 'youre', 'people', ..., 'cuando', 'cil', 'tipo'],
       dtype='<U15'),
 array(['im', 'youre', 'thats', ..., 'quel', 'quiero', 'fue'], dtype='<U16')]

In [34]:
pickle.dump(common_words_list, open('common_words_list_index.pkl', 'wb'))

#### Add the stop words to the original stop word list and redo the text encoding

In [11]:
common_words_list = pickle.load(open('common_words_list_index.pkl','rb'))
corpus_na = pd.read_pickle('corpus_na_index.pkl')

In [12]:
stop_words_list = []
for common_words in common_words_list:
    stop_words = text.ENGLISH_STOP_WORDS.union(common_words)
    stop_words_list.append(stop_words)

In [13]:
df_dtm = dtm(corpus_na, stop_words, f'index_corpus{2}_cv.pkl', save_pickle=True)
df_dtm.to_pickle(f'index_corpus{2}_dtm.pkl')

In [6]:
for i, stop_words in enumerate(stop_words_list):
    if i % 2 == 0:
        # Document-Term Matrix
        df_dtm = dtm(corpus, stop_words, f'./pickles_index/index_corpus{int(i/2)}_cv.pkl', save_pickle=True)
        # Transpose back to comedians as index for topic modeling
        df_dtm.to_pickle(f'./pickles_index/index_corpus{int(i/2)}_dtm.pkl')
    else:
        # Tf-Idf Matrix
        df_tfidf = tf_idf(corpus, stop_words, f'./pickles_index/index_corpus{int((i-1)/2)}_tf_idf.pkl', save_pickle=True)  
        # Transpose back to comedians as index for topic modeling  
        df_tfidf.to_pickle(f'./pickles_index/index_corpus{int((i-1)/2)}_tim.pkl')

### Visualize the top words with WordCloud

In [None]:
# # 
# wc_list = []
# for stop_words in stop_words_list:
#     wc = WordCloud(stopwords=stop_words,
#                    background_color="white", 
#                    colormap="Dark2",
#                    max_font_size=150, 
#                    random_state=42)
#     wc_list.append(wc)

In [None]:
### the index should be 1 < num < 25
# plt.rcParams['figure.figsize'] = [30,10]

# index_now = clean_corpus.index

# for i, wc in enumerate(wc_list):
#     fig = plt.figure()
#     if i == 0 or i == 1:
#         corpus = clean_corpus
#     elif i == 2 or i == 3:
#         corpus = corpus_n
#     elif i == 4 or i == 5:
#         corpus = corpus_na
#     print(f"------------- Stop words {i} -------------")
#     # Create subplots for each transcript
#     for index, comedian in enumerate(index_now):
#         wc.generate(corpus.Transcript[index])
#         plt.subplot(4, 6, index+1)
#         plt.imshow(wc, interpolation="bilinear")
#         plt.axis("off")
#         plt.title(index_now[index])

#     plt.show()

# Sentiment Analysis

In [None]:
corpus_filename = './pickles_index/corpus_na_index.pkl'
clean_corpus = pd.read_pickle(corpus_filename)

In [None]:
pol = lambda x: tb(x).sentiment.polarity
sub = lambda x: tb(x).sentiment.subjectivity

data = pd.DataFrame()
data['polarity'] = clean_corpus['Transcript'].apply(pol)
data['subjectivity'] = clean_corpus['Transcript'].apply(sub)
data

In [None]:
plt.rcParams['figure.figsize'] = [10, 8]

for index, comedian in enumerate(data.index):
    x = data.polarity.loc[comedian]
    y = data.subjectivity.loc[comedian]
    plt.scatter(x, y, color='blue')
    plt.text(x+.001, y+.001, data.index[index], fontsize=10)
#     plt.xlim(-.01, .12) 
    
plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

# Topic Modeling

# Latent Dirichlet Allocation (LDA)

In [30]:
def train_LDA_model(data_matrix, vectorizer, num_topics=4, passes=3000):
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    lda = models.LdaModel(corpus=gensim_corpus, id2word=id2word, num_topics=num_topics, passes=passes)
    
    corpus_transformed = lda[gensim_corpus]
    result = list(zip([max(ct, key=lambda x: x[1])[0] for ct in corpus_transformed], data_matrix.columns))
    return lda, result

In [31]:
# Load all matrices
clean_corpus_dtm = pd.read_pickle('./pickles_index/index_corpus0_dtm.pkl')
clean_corpus_tim = pd.read_pickle('./pickles_index/index_corpus0_tim.pkl')
corpus_noun_dtm = pd.read_pickle('./pickles_index/index_corpus1_dtm.pkl')
corpus_noun_tim = pd.read_pickle('./pickles_index/index_corpus1_tim.pkl')
corpus_na_dtm = pd.read_pickle('./pickles_index/index_corpus2_dtm.pkl')
corpus_na_tim = pd.read_pickle('./pickles_index/index_corpus2_tim.pkl')

In [32]:
# Load all vectorizers
clean_corpus_cv = pickle.load(open('./pickles_index/index_corpus0_cv.pkl', 'rb'))
clean_corpus_tf_idf = pickle.load(open('./pickles_index/index_corpus0_tf_idf.pkl','rb'))
corpus_noun_cv = pickle.load(open('./pickles_index/index_corpus1_cv.pkl','rb'))
corpus_noun_tf_idf = pickle.load(open('./pickles_index/index_corpus1_tf_idf.pkl','rb'))
corpus_na_cv = pickle.load(open('./pickles_index/index_corpus2_cv.pkl','rb'))
corpus_na_tf_idf = pickle.load(open('./pickles_index/index_corpus2_tf_idf.pkl', 'rb'))

In [33]:
clean_corpus_lda, result1 = train_LDA_model(clean_corpus_dtm, clean_corpus_cv)
clean_corpus_lda.print_topics()

[(0,
  '0.002*"randy" + 0.002*"itll" + 0.002*"ryan" + 0.002*"blue" + 0.002*"parasite" + 0.002*"flag" + 0.002*"shrooms" + 0.002*"twin" + 0.002*"elevator" + 0.002*"naga"'),
 (1,
  '0.004*"pill" + 0.003*"ow" + 0.002*"gas" + 0.002*"mandela" + 0.002*"rumor" + 0.002*"cash" + 0.002*"captain" + 0.002*"nelson" + 0.002*"birth" + 0.001*"boot"'),
 (2,
  '0.008*"russian" + 0.003*"taco" + 0.003*"computer" + 0.002*"shitty" + 0.002*"wallet" + 0.002*"death" + 0.002*"doubt" + 0.002*"ci" + 0.001*"ebola" + 0.001*"clap"'),
 (3,
  '0.003*"dennys" + 0.002*"toy" + 0.002*"pm" + 0.002*"pain" + 0.002*"japan" + 0.002*"candy" + 0.002*"muy" + 0.002*"oprah" + 0.002*"yogurt" + 0.002*"peanut"')]

In [34]:
result1

[(1, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (1, 9),
 (3, 10),
 (2, 11),
 (0, 12),
 (2, 13),
 (2, 14),
 (2, 15),
 (2, 16),
 (1, 17),
 (0, 18),
 (0, 19),
 (0, 20),
 (2, 21),
 (1, 22),
 (1, 23),
 (0, 24),
 (2, 25),
 (2, 26),
 (2, 27),
 (1, 28),
 (2, 29),
 (2, 30),
 (2, 31),
 (1, 32),
 (2, 33),
 (1, 34),
 (1, 35),
 (2, 36),
 (2, 37),
 (1, 38),
 (2, 39),
 (1, 40),
 (1, 41),
 (0, 42),
 (3, 43),
 (3, 44),
 (2, 45),
 (2, 46),
 (0, 47),
 (0, 48),
 (1, 49),
 (3, 50),
 (2, 51),
 (0, 52),
 (3, 53),
 (2, 54),
 (3, 55),
 (2, 56),
 (0, 57),
 (2, 58),
 (1, 59),
 (2, 60),
 (1, 61),
 (3, 62),
 (0, 63),
 (2, 64),
 (0, 65),
 (2, 66),
 (2, 67),
 (0, 68),
 (1, 69),
 (3, 70),
 (2, 71),
 (1, 72),
 (0, 73),
 (0, 74),
 (3, 75),
 (0, 76),
 (1, 77),
 (1, 78),
 (2, 79),
 (2, 80),
 (3, 81),
 (1, 82),
 (1, 83),
 (1, 84),
 (0, 85),
 (1, 86),
 (3, 87),
 (0, 88),
 (3, 89),
 (0, 90),
 (2, 91),
 (1, 92),
 (1, 93),
 (1, 94),
 (2, 95),
 (0, 96),
 (2, 97),
 (3, 98),
 (2, 99),
 (0, 100),

In [35]:
corpus_noun_lda, result2 = train_LDA_model(corpus_noun_dtm, corpus_noun_cv)
corpus_noun_lda.print_topics()

[(0,
  '0.020*"little" + 0.009*"big" + 0.009*"white" + 0.009*"new" + 0.008*"bad" + 0.008*"black" + 0.008*"indian" + 0.007*"nice" + 0.007*"great" + 0.007*"different"'),
 (1,
  '0.015*"white" + 0.012*"little" + 0.012*"old" + 0.011*"black" + 0.009*"big" + 0.009*"great" + 0.009*"real" + 0.008*"new" + 0.007*"asian" + 0.006*"american"'),
 (2,
  '0.011*"una" + 0.003*"ci" + 0.002*"cowboy" + 0.002*"io" + 0.002*"twin" + 0.002*"vlad" + 0.002*"poi" + 0.002*"ogni" + 0.002*"prime" + 0.002*"persone"'),
 (3,
  '0.028*"black" + 0.011*"little" + 0.011*"bad" + 0.009*"white" + 0.007*"real" + 0.006*"hard" + 0.005*"new" + 0.005*"better" + 0.004*"big" + 0.004*"great"')]

In [36]:
result2

[(3, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (1, 10),
 (0, 11),
 (0, 12),
 (1, 13),
 (0, 14),
 (3, 15),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 19),
 (1, 20),
 (0, 21),
 (3, 22),
 (3, 23),
 (0, 24),
 (0, 25),
 (3, 26),
 (0, 27),
 (3, 28),
 (3, 29),
 (0, 30),
 (0, 31),
 (0, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (1, 36),
 (0, 37),
 (0, 38),
 (1, 39),
 (0, 40),
 (1, 41),
 (1, 42),
 (1, 43),
 (3, 44),
 (1, 45),
 (3, 46),
 (1, 47),
 (3, 48),
 (1, 49),
 (3, 50),
 (1, 51),
 (0, 52),
 (0, 53),
 (1, 54),
 (0, 55),
 (3, 56),
 (1, 57),
 (1, 58),
 (0, 59),
 (1, 60),
 (1, 61),
 (0, 62),
 (0, 63),
 (0, 64),
 (3, 65),
 (1, 66),
 (2, 67),
 (0, 68),
 (1, 69),
 (1, 70),
 (3, 71),
 (0, 72),
 (0, 73),
 (1, 74),
 (1, 75),
 (0, 76),
 (3, 77),
 (1, 78),
 (1, 79),
 (3, 80),
 (1, 81),
 (2, 82),
 (1, 83),
 (1, 84),
 (0, 85),
 (0, 86),
 (1, 87),
 (0, 88),
 (0, 89),
 (0, 90),
 (0, 91),
 (0, 92),
 (0, 93),
 (0, 94),
 (0, 95),
 (1, 96),
 (1, 97),
 (0, 98),
 (3, 99),
 (1, 100),

In [37]:
corpus_na_lda, result3 = train_LDA_model(corpus_na_dtm, corpus_na_cv)
corpus_na_lda.print_topics()

[(0,
  '0.003*"ow" + 0.002*"doubt" + 0.002*"randy" + 0.002*"bombay" + 0.002*"threat" + 0.002*"imma" + 0.001*"canada" + 0.001*"vince" + 0.001*"breakup" + 0.001*"shrooms"'),
 (1,
  '0.003*"dennys" + 0.002*"trap" + 0.002*"sperm" + 0.002*"wallet" + 0.002*"ebola" + 0.002*"passenger" + 0.002*"dan" + 0.002*"naga" + 0.001*"bien" + 0.001*"cow"'),
 (2,
  '0.003*"rumor" + 0.002*"namaste" + 0.002*"yogurt" + 0.002*"oprah" + 0.001*"golddiggers" + 0.001*"beyoncé" + 0.001*"hut" + 0.001*"clap" + 0.001*"vegetable" + 0.001*"hoodie"'),
 (3,
  '0.003*"mandela" + 0.002*"pull" + 0.002*"cowboy" + 0.002*"nelson" + 0.002*"island" + 0.002*"twin" + 0.002*"peanut" + 0.002*"motto" + 0.001*"barack" + 0.001*"metal"')]

In [38]:
result3

[(1, 0),
 (0, 1),
 (3, 2),
 (1, 3),
 (2, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (2, 9),
 (3, 10),
 (3, 11),
 (2, 12),
 (0, 13),
 (0, 14),
 (3, 15),
 (1, 16),
 (0, 17),
 (2, 18),
 (3, 19),
 (2, 20),
 (3, 21),
 (3, 22),
 (2, 23),
 (1, 24),
 (1, 25),
 (2, 26),
 (1, 27),
 (0, 28),
 (2, 29),
 (2, 30),
 (0, 31),
 (2, 32),
 (0, 33),
 (1, 34),
 (1, 35),
 (0, 36),
 (0, 37),
 (0, 38),
 (2, 39),
 (0, 40),
 (1, 41),
 (0, 42),
 (0, 43),
 (0, 44),
 (1, 45),
 (1, 46),
 (3, 47),
 (1, 48),
 (0, 49),
 (0, 50),
 (3, 51),
 (0, 52),
 (1, 53),
 (0, 54),
 (0, 55),
 (1, 56),
 (0, 57),
 (2, 58),
 (0, 59),
 (0, 60),
 (0, 61),
 (0, 62),
 (2, 63),
 (3, 64),
 (0, 65),
 (0, 66),
 (2, 67),
 (2, 68),
 (2, 69),
 (0, 70),
 (2, 71),
 (3, 72),
 (2, 73),
 (0, 74),
 (0, 75),
 (0, 76),
 (3, 77),
 (2, 78),
 (2, 79),
 (1, 80),
 (0, 81),
 (1, 82),
 (1, 83),
 (0, 84),
 (1, 85),
 (1, 86),
 (2, 87),
 (1, 88),
 (1, 89),
 (0, 90),
 (3, 91),
 (2, 92),
 (1, 93),
 (1, 94),
 (0, 95),
 (1, 96),
 (0, 97),
 (3, 98),
 (2, 99),
 (0, 100),

In [None]:
# clean_corpus_lda, result2 = train_LDA_model(clean_corpus_tim, clean_corpus_tf_idf)
# clean_corpus_lda.print_topics()

In [None]:
# result2

In [None]:
# corpus_noun_lda, result4 = train_LDA_model(corpus_noun_tim, corpus_noun_tf_idf)
# corpus_noun_lda.print_topics()

In [None]:
# result4

In [None]:
# corpus_na_lda, result6 = train_LDA_model(corpus_na_tim, corpus_na_tf_idf)
# corpus_na_lda.print_topics()

In [None]:
# result6

In [39]:
groups = []
group_df = pd.DataFrame(index=[0,1,2,3])
for i, result in enumerate([result1, result2, result3]):
    group = {}
    for r in result:
        if r[0] in group:
            group[r[0]].append(r[1])
        else:
            group[r[0]] = [r[1]]
    group_df[f'result_{i+1}'] = group.values()
    groups.append(group)

In [40]:
group_df.to_pickle('./pickles_index/group.pkl')
group_df.to_csv('./pickles_index/group.csv')

# Latent Semantic Analysis (LSA)

In [None]:
def train_LSA_model(data_matrix, vectorizer, num_topics=5):
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    lsi = LsiModel(gensim_corpus, num_topics=num_topics, id2word=id2word)
    
    corpus_transformed = lsi[gensim_corpus]
    result = list(zip([max(ct, key=lambda x: x[1])[0] for ct in corpus_transformed], data_matrix.columns))
    return lsi, result

In [None]:
corpus_na_lsa, res = train_LSA_model(corpus_na_tim, corpus_na_tf_idf)
corpus_na_lsa.print_topics()

In [None]:
res

# Coherence Value

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

# Vectorize Comedians

In [41]:
# Read the corpus
clean_corpus_index = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
clean_corpus = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
corpus_nouns = pd.read_pickle('./pickles_index/corpus_nouns_index.pkl')
corpus_na = pd.read_pickle('./pickles_index/corpus_na_index.pkl')

In [42]:
comedians = clean_corpus_index.Comedian
comedian_df = pd.DataFrame(comedians)
comedian_df

Unnamed: 0,Comedian
0,ramy youssef
1,ramy youssef
2,ramy youssef
3,ramy youssef
4,ramy youssef
...,...
2705,whitney cummings
2706,whitney cummings
2707,whitney cummings
2708,whitney cummings


In [43]:
def append_comedians(df):
    df['Comedians'] = comedians
    return df

# clean corpus already has the comedian column
# 
corpus_nouns['Comedians'] = comedians
corpus_na['Comedians'] = comedians

In [44]:
# Read the styles for each transcript
styles = pd.read_pickle('./pickles_index/group.pkl')
styles

Unnamed: 0,result_1,result_2,result_3
0,"[0, 9, 17, 22, 23, 28, 32, 34, 35, 38, 40, 41,...","[0, 3, 4, 5, 15, 22, 23, 26, 28, 29, 44, 46, 4...","[0, 3, 16, 24, 25, 27, 34, 35, 41, 45, 46, 48,..."
1,"[1, 2, 3, 11, 13, 14, 15, 16, 21, 25, 26, 27, ...","[1, 10, 13, 20, 36, 39, 41, 42, 43, 45, 47, 49...","[1, 6, 7, 8, 13, 14, 17, 28, 31, 33, 36, 37, 3..."
2,"[4, 5, 10, 43, 44, 50, 53, 55, 62, 70, 75, 81,...","[2, 67, 82, 123, 165, 179, 266, 314, 316, 320,...","[2, 5, 10, 11, 15, 19, 21, 22, 47, 51, 64, 72,..."
3,"[6, 7, 8, 12, 18, 19, 20, 24, 42, 47, 48, 52, ...","[6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19, 21, 2...","[4, 9, 12, 18, 20, 23, 26, 29, 30, 32, 39, 58,..."


In [45]:
def comedian2vec(groups, lookup_table):
    """
    :param groups: group should be a 2D matrix that contains transcript ids for each group.
    :param lookup_table: comedian-transcript_id table to look up which comedian an id is corresponeded to. This should have a "Comedian".
    """
    comedians = list(set(lookup_table.Comedian))
    comedian_vectors = {}
    for g_id, group in enumerate(groups):
        for i in group:
            com = lookup_table.iloc[i][0]
            if com not in comedian_vectors:
                comedian_vectors[com] = [0 for j in range(len(groups))]
            comedian_vectors[com][g_id] += 1
            
    for k, v in comedian_vectors.items():
        comedian_vectors[k] = np.divide(v, sum(v))        
            
    return comedian_vectors   

In [46]:
g1 = list(styles.result_1.values)
comedian2vec(g1, comedian_df)

{'ramy youssef': array([0.25641026, 0.46153846, 0.07692308, 0.20512821]),
 'andy woodhull': array([0.18518519, 0.33333333, 0.22222222, 0.25925926]),
 'amy schumer': array([0.26865672, 0.28358209, 0.1641791 , 0.28358209]),
 'aziz ansari': array([0.27118644, 0.3559322 , 0.18644068, 0.18644068]),
 'bert kreischer': array([0.34285714, 0.24285714, 0.15714286, 0.25714286]),
 'bill burr': array([0.23664122, 0.19847328, 0.2519084 , 0.3129771 ]),
 'chris rock': array([0.23717949, 0.23717949, 0.31410256, 0.21153846]),
 'dave chappelle': array([0.22972973, 0.21171171, 0.36936937, 0.18918919]),
 'eric andre': array([0.25490196, 0.23529412, 0.29411765, 0.21568627]),
 'george lopez': array([0.35714286, 0.17857143, 0.35714286, 0.10714286]),
 'hannah gadsby': array([0.32786885, 0.21311475, 0.13114754, 0.32786885]),
 'hasan minhaj': array([0.19354839, 0.38709677, 0.25806452, 0.16129032]),
 'iliza shlesinger': array([0.34042553, 0.20567376, 0.19858156, 0.25531915]),
 'jack whitehall': array([0.1875  , 0

In [47]:
g2 = list(styles.result_2.values)
comedian2vec(g2, comedian_df)

{'ramy youssef': array([0.25641026, 0.12820513, 0.02564103, 0.58974359]),
 'andy woodhull': array([0.22222222, 0.48148148, 0.        , 0.2962963 ]),
 'amy schumer': array([0.1641791 , 0.35820896, 0.04477612, 0.43283582]),
 'aziz ansari': array([0.10169492, 0.3559322 , 0.03389831, 0.50847458]),
 'bert kreischer': array([0.24285714, 0.34285714, 0.        , 0.41428571]),
 'bill burr': array([0.19847328, 0.35114504, 0.09923664, 0.35114504]),
 'chris rock': array([0.27564103, 0.37820513, 0.04487179, 0.30128205]),
 'dave chappelle': array([0.27477477, 0.18468468, 0.22522523, 0.31531532]),
 'eric andre': array([0.29411765, 0.35294118, 0.11764706, 0.23529412]),
 'george lopez': array([0.17857143, 0.28571429, 0.14285714, 0.39285714]),
 'hannah gadsby': array([0.1147541 , 0.14754098, 0.01639344, 0.72131148]),
 'hasan minhaj': array([0.27419355, 0.20967742, 0.01612903, 0.5       ]),
 'iliza shlesinger': array([0.14893617, 0.23404255, 0.04255319, 0.57446809]),
 'jack whitehall': array([0.125  , 0.

In [48]:
g3 = list(styles.result_3.values)
comedian2vec(g3, comedian_df)

{'ramy youssef': array([0.20512821, 0.33333333, 0.20512821, 0.25641026]),
 'andy woodhull': array([0.22222222, 0.55555556, 0.11111111, 0.11111111]),
 'amy schumer': array([0.31343284, 0.34328358, 0.13432836, 0.20895522]),
 'aziz ansari': array([0.23728814, 0.30508475, 0.3220339 , 0.13559322]),
 'bert kreischer': array([0.21428571, 0.3       , 0.25714286, 0.22857143]),
 'bill burr': array([0.24427481, 0.24427481, 0.29007634, 0.22137405]),
 'chris rock': array([0.25641026, 0.32051282, 0.19871795, 0.22435897]),
 'dave chappelle': array([0.3963964 , 0.18468468, 0.15765766, 0.26126126]),
 'eric andre': array([0.11764706, 0.29411765, 0.33333333, 0.25490196]),
 'george lopez': array([0.28571429, 0.28571429, 0.07142857, 0.35714286]),
 'hannah gadsby': array([0.26229508, 0.40983607, 0.14754098, 0.18032787]),
 'hasan minhaj': array([0.08064516, 0.29032258, 0.25806452, 0.37096774]),
 'iliza shlesinger': array([0.19858156, 0.31914894, 0.25531915, 0.22695035]),
 'jack whitehall': array([0.09375 , 0

# Keyword Extraction

In [144]:
clean_corpus = pd.read_pickle('clean_corpus_index.pkl')
corpus_nouns = pd.read_pickle('corpus_nouns_index.pkl')
corpus_na = pd.read_pickle('corpus_na_index.pkl')

In [145]:
def merge_transcripts(df, filename):
    """
    :param df: transcript/comedians dataframe.
    :param filename: the csv file to save for the comedian/keywords data
    """
    merged_df = df.groupby('Comedian').agg(lambda t: " ".join(t))
    r_df = pd.DataFrame(columns=['Comedian','Keywords'])
    for c in tqdm(merged_df.index, desc="Extracting..."):
        text = merged_df.loc[c][0]
        kw = keywords.keywords(text)
        r_df = r_df.append({'Comedian': c, "Keywords":", ".join(kw.split('\n'))}, ignore_index=True)
    r_df.to_excel(filename)       
    return r_df 


In [146]:
common_words_list = pickle.load(open('common_words_list_index.pkl', 'rb'))
# Additional stop words
common_words_list.append(np.array(['im','youre','theyre','we','hes','shes','yeah','uh','ill','hell',
                          'shell','theyll','well']))
common_words_list.append(np.array([i for i in 'qwertyuiopasdfghjklzxcvbnm']))

In [147]:
common_words_list

[array(['like', 'dont', 'im', ..., 'hasan', 'est', 'fue'], dtype='<U14'),
 array(['im', 'youre', 'people', ..., 'cuando', 'cil', 'tipo'],
       dtype='<U15'),
 array(['im', 'youre', 'thats', ..., 'quel', 'quiero', 'fue'], dtype='<U16'),
 array(['im', 'youre', 'theyre', 'we', 'hes', 'shes', 'yeah', 'uh', 'ill',
        'hell', 'shell', 'theyll', 'well'], dtype='<U6'),
 array(['q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'a', 's', 'd',
        'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm'],
       dtype='<U1')]

In [148]:
stop_words_list = []
for common_words in common_words_list:
    stop_words = text.ENGLISH_STOP_WORDS.union(common_words)
    stop_words_list.extend(list(stop_words))

In [149]:
len(stop_words_list)

6262

In [150]:
def remove_stopwords(text):    
    # split the text
    wd = text.split(' ')
    # remove stop words
    resultwords  = [word for word in wd if word.lower() not in stop_words_list]
    return' '.join(resultwords)

In [151]:
clean_corpus.index = clean_corpus.Comedian
# corpus_nouns.index = corpus_nouns.Comedian
# corpus_na.index = corpus_na.Comedian
clean_corpus = pd.DataFrame(clean_corpus.Transcript.apply(remove_stopwords))
# corpus_nouns = pd.DataFrame(corpus_nouns.Transcript.apply(remove_stopwords))
# corpus_na = pd.DataFrame(corpus_na.Transcript.apply(remove_stopwords))

In [152]:
clean_corpus.reset_index()
# corpus_nouns.reset_index()
# corpus_na.reset_index()

Unnamed: 0,Comedian,Transcript
0,andy woodhull,painted shiny shiny shiny achieved ordering popcorn large popcorn forshes impossible blank rinsing denies leading rinsing colander grate pesticide
1,andy woodhull,sleepy resting airing chewing gummy staying gummy alert awake
2,andy woodhull,mention stepdaughter lease unless admitting judgmental wanting grow loved helped genitals workmanship
3,andy woodhull,cooper bo bum hurricane harbor pickits stepparent knowing multiple writing candidate speaker
4,andy woodhull,embracing aha resist letting
...,...,...
2835,dave chappelle,defending valuable legacy emmy drawn proportionately equivalent thursday partnered clinical psychologist africanamericans africanamerican institut...
2836,dave chappelle,ahah ahah ghetto ahah ahah ahah ghetto ahah ahah ahah ghetto ahah ahah ahah ghetto simpson funniest kentucky derby spectacular hosted jordan admir...
2837,dave chappelle,ahah ahah ghetto ahah revolution ahah ahah ahah ghetto acknowledge acknowledge juggernaut shouting garry shandling sincerest condolence hiphop que...
2838,dave chappelle,layer phife poem sayer conveyor dinkins mayor rhythm quest funky rhythm garment instruct obeyer rhythm recipe savor minor inhale biatch


In [153]:
clean_corpus.Transcript[1]

'sleepy resting airing chewing gummy staying gummy alert awake'

In [154]:
merge_transcripts(clean_corpus, 'keywords_clean.xlsx')


Extracting...:   0%|                                                                            | 0/40 [00:00<?, ?it/s]
Extracting...:   2%|█▋                                                                  | 1/40 [00:02<01:27,  2.25s/it]
Extracting...:   5%|███▍                                                                | 2/40 [00:02<01:01,  1.61s/it]
Extracting...:   8%|█████                                                               | 3/40 [00:02<00:45,  1.23s/it]
Extracting...:  10%|██████▊                                                             | 4/40 [00:05<00:59,  1.65s/it]
Extracting...:  12%|████████▌                                                           | 5/40 [00:06<00:51,  1.47s/it]
Extracting...:  15%|██████████▏                                                         | 6/40 [00:36<05:41, 10.05s/it]
Extracting...:  18%|███████████▉                                                        | 7/40 [00:37<03:57,  7.21s/it]
Extracting...:  20%|█████████████▌     

Unnamed: 0,Comedian,Keywords
0,amy schumer,"grow, grows, arrested, schumers, schumer, buttholes, butthole, growing comforting, uti, popcorn, bleeding, bleed, actress, showed, hyperemesis, sh..."
1,andy woodhull,"toast, maze, punishing, punishment, ordered punish, dressing ate, parenting, dish, floss, tradition crane, rinsing, stepparent, sourced, source, d..."
2,arsenio hall,"wesley, marching, tiger, grow, plus, bernie, ambien, booger, ashy, tyson, dust, peed, nevada, semmi, kim jongun, hoe jose, offends, changing, arse..."
3,aziz ansari,"randy, randys, harris, texted, writing, writes, ja, darwish, perform, performed, performer, performing, performance, exist, existing, existence, p..."
4,bert kreischer,"marshmallow, leeann, ilas, pajama, glock, leeanns alexa, ralphs, stayed, owl, panicking, pouring whiskey, hardy, ralph sampson, deodorant, ringwor..."
5,bill burr,"questa, io, mai, po, tutto, quando, sei, tutti, nel, cose, ed os, giusto, persone colore, quello cui, penso ne, testa lei, freaked, nessuno, visto..."
6,chris d’elia,"anika, anikas, laid, gayer, inspirational, inspiring, inspired, staying peach, stayed, changing delia, invite, invitation, invited, dallas, grow, ..."
7,chris rock,"lm, lts, kobe, lt, pork, cooked, cook, witness, witnessing, tupac, cheated, bout, bully, bullying, loved custody, wealth, realised, toss, tossed, ..."
8,dave chappelle,"esos, unos, mujer, mujeres, blanco, esa, muchos, uno este, dio, dios, ellas, familia estaba, algo, mismo saben amigo, blancos sincero, ir, ese mom..."
9,eric andre,"salvia, bukkake, calvin, calvinism, eric, jessies, licked, licking, pus, jessie texted, snoop, atm, brian, tasted, texting, slap, arrested, scrub,..."
