In [77]:
# Basic functionalities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
import pickle

# options
pd.set_option('max_colwidth',150)

In [2]:
# Web crawling
import requests
from bs4 import BeautifulSoup
import pickle

In [3]:
# Text encoding
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Visualization
from wordcloud import WordCloud

In [5]:
# Text Processing
from nltk import word_tokenize, pos_tag
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
# nltk.download('wordnet')
# nltk.set_proxy('SYSTEM PROXY')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [6]:
# Sentiment analysis
from textblob import TextBlob as tb

In [7]:
# Topic Modeling
from gensim import matutils, models
from gensim.models import LsiModel
import scipy.sparse



In [8]:
# Keyword Extraction
from summa.summarizer import summarize
from summa import keywords

In [54]:
# Progress bar
from tqdm import tqdm 

# Retrieve Data

## Data from google drive

In [9]:
df = pd.read_csv('./scrap_all_transcript/transcript_paragraph_needed.csv')
df = df[["comedian","transcript"]]
df.columns = ["Comedian","Transcript"]
df

Unnamed: 0,Comedian,Transcript
0,ramy youssef,and i feel bad for him i dont know if anybody ...
1,ramy youssef,i think it doesnt matter if he was lying or no...
2,ramy youssef,im not gonna let them do this man they put up ...
3,ramy youssef,thats why i had to watch the whole r kelly doc...
4,ramy youssef,its the music it makes you forget music can do...
...,...,...
2705,whitney cummings,i dont know if in general as a society were ge...
2706,whitney cummings,im worried im worried for the next generation ...
2707,whitney cummings,you guys are getting desensitized and its like...
2708,whitney cummings,thank you guys so much for coming out to the s...


In [10]:
# get corpus
corpus = df[['Comedian','Transcript']].set_index('Comedian')
corpus

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
ramy youssef,and i feel bad for him i dont know if anybody ...
ramy youssef,i think it doesnt matter if he was lying or no...
ramy youssef,im not gonna let them do this man they put up ...
ramy youssef,thats why i had to watch the whole r kelly doc...
ramy youssef,its the music it makes you forget music can do...
...,...
whitney cummings,i dont know if in general as a society were ge...
whitney cummings,im worried im worried for the next generation ...
whitney cummings,you guys are getting desensitized and its like...
whitney cummings,thank you guys so much for coming out to the s...


#### Clean the data

In [11]:
def apply_data_cleansing(text):
    # Lowercase every words
    text = text.lower()
    # Remove every words with [blah blah blah] format
    text = re.sub('\[.*?\]', '', text)
    # Remove every words with (blah blah blah) format
    text = re.sub('\(.*?\)', '', text)
    # Get rid of the punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Get rid of all the numbers or words that contain numbers
    text = re.sub('\w*\d\w*', '', text)
    # Get rid of these specific punctuations
    text = re.sub('[‘’“”…]', '', text)
    # Get rid of '\n'
    text = re.sub('\n', '', text)
    
    # Tokenizes and lemmetizes (or stems) them
    tokenized = word_tokenize(text)
    stemmed = [porter_stemmer.stem(t) for t in tokenized]
    lemmetized = [wordnet_lemmatizer.lemmatize(t) for t in tokenized]
    return " ".join(lemmetized)

In [12]:
clean_corpus = pd.DataFrame(corpus.Transcript.apply(lambda x: apply_data_cleansing(x)))
clean_corpus_index = clean_corpus.reset_index()
clean_corpus_index

Unnamed: 0,Comedian,Transcript
0,ramy youssef,and i feel bad for him i dont know if anybody ...
1,ramy youssef,i think it doesnt matter if he wa lying or not...
2,ramy youssef,im not gon na let them do this man they put up...
3,ramy youssef,thats why i had to watch the whole r kelly doc...
4,ramy youssef,it the music it make you forget music can do c...
...,...,...
2705,whitney cummings,i dont know if in general a a society were get...
2706,whitney cummings,im worried im worried for the next generation ...
2707,whitney cummings,you guy are getting desensitized and it like i...
2708,whitney cummings,thank you guy so much for coming out to the sh...


In [13]:
df = pd.DataFrame(clean_corpus_index.Comedian)

In [14]:
# Pickles the result for later usage
clean_corpus_index.to_pickle('./pickles_index/clean_corpus_index.pkl')

#### Retrieve only NOUNS of all transcripts

In [15]:
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [16]:
corpus = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')

In [17]:
# Apply the nouns function to the transcripts to filter only on nouns
corpus_nouns_index = pd.DataFrame(corpus.Transcript.apply(nouns))
corpus_nouns_index.to_pickle('./pickles_index/corpus_nouns_index.pkl')

#### Retrieve NOUNS and ADJECTIVES of all transcripts

In [18]:
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [19]:
corpus = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
# Apply the nouns function to the transcripts to filter only on nouns
corpus_na_index = pd.DataFrame(corpus.Transcript.apply(nouns_adj))
corpus_na_index.to_pickle('./pickles_index/corpus_na_index.pkl')

# Analyize the transcripts

#### Text Encoding

In [20]:
def dtm(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    cv = CountVectorizer(stop_words=stop_words)
    data_cv = cv.fit_transform(corpus.Transcript)
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = corpus.index
    if save_pickle:
        pickle.dump(cv, open(pickle_filename, "wb"))        
    return data_dtm.transpose()

def tf_idf(corpus, stop_words='english', pickle_filename="", save_pickle=False):
    vectorizer = TfidfVectorizer(stop_words='english')
    data_v = vectorizer.fit_transform(corpus.Transcript)
    data_tfidf = pd.DataFrame(data_v.toarray(), columns=vectorizer.get_feature_names())
    data_tfidf.index = corpus.index
    if save_pickle:
        pickle.dump(vectorizer, open(pickle_filename, "wb"))        
    return data_tfidf.transpose()

#### Top words

In [21]:
def get_top_words(matrix_dataframe, number_of_words = 30):
    top_dict = {}
    for c in matrix_dataframe.columns:
        top = matrix_dataframe[c].sort_values(ascending=False).head(number_of_words)
        top_dict[c]= list(zip(top.index, top.values))
    return top_dict

In [22]:
# Get all kinds of corpus
clean_corpus = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
corpus_n = pd.read_pickle('./pickles_index/corpus_nouns_index.pkl')
corpus_na = pd.read_pickle('./pickles_index/corpus_na_index.pkl')

all_corpuses = [clean_corpus, corpus_n, corpus_na]

In [23]:
df_dtms = []
df_tf_idfs = []
dtm_tops = []
tf_idf_tops = []
for corpus in all_corpuses:
    # Document-Term Matrix
    df_dtm = dtm(corpus)
    # Tf-Idf Matrix
    df_tfidf = tf_idf(corpus)    
    # Get top words
    dtm_top = get_top_words(df_dtm)
    tf_idf_top = get_top_words(df_tfidf)
    
    df_dtms.append(df_dtm)
    df_tf_idfs.append(df_tfidf)
    dtm_tops.append(dtm_top)
    tf_idf_tops.append(tf_idf_top)

In [24]:
# Have a glimps on the top words of each transcript

df = pd.DataFrame(index=df_dtms[0].columns)
for i, corpus in enumerate(all_corpuses):
    dtm_top = dtm_tops[i]
    tf_idf_top = tf_idf_tops[i]
    df[f'corpus{i}_DTM'] = [[w[0] for w in words] for words in dtm_top.values()]
    df[f'corpus{i}_TF_IDF'] = [[w[0] for w in words] for words in tf_idf_top.values()]
    
df

Unnamed: 0,corpus0_DTM,corpus0_TF_IDF,corpus1_DTM,corpus1_TF_IDF,corpus2_DTM,corpus2_TF_IDF
0,"[like, know, think, wa, just, got, mean, thats...","[like, empire, think, wouldve, wish, icecap, k...","[youre, head, empire, thats, wouldve, america,...","[empire, wouldve, icecap, contract, violence, ...","[youre, empire, bad, wouldve, wish, head, that...","[empire, wouldve, wish, icecap, navajo, contra..."
1,"[just, like, did, thing, wa, fest, cause, smol...","[fest, fyre, smollett, lie, weve, jussie, pron...","[thing, lie, smollett, seed, fyre, cause, some...","[smollett, lie, fyre, suis, je, seed, document...","[fyre, fest, smollett, lie, thing, yeah, littl...","[fest, fyre, smollett, lie, suis, je, jussie, ..."
2,"[like, wa, good, just, gon, kelly, na, beat, t...","[ignition, kelly, like, watched, beat, distrus...","[man, thing, beat, im, ignition, random, song,...","[ignition, beat, random, documentary, crime, s...","[good, thing, ignition, beat, im, man, cause, ...","[ignition, beat, distrust, mugshot, spun, used..."
3,"[like, kelly, youre, na, know, prayer, believe...","[prayer, doubt, kelly, quran, devastating, all...","[doubt, youre, prayer, quran, people, gon, cau...","[doubt, quran, akbar, islam, booth, prayer, ep...","[doubt, youre, community, allahu, gon, quran, ...","[doubt, quran, allahu, akbar, islam, booth, pr..."
4,"[like, know, michael, just, dont, got, kid, le...","[michael, like, lebron, jackson, warrior, know...","[michael, jackson, kid, man, music, dont, im, ...","[michael, jackson, warrior, music, kid, supert...","[michael, kid, man, jackson, music, person, fa...","[michael, jackson, warrior, famous, music, kid..."
...,...,...,...,...,...,...
2705,"[im, like, god, sex, thing, squirting, want, j...","[squirting, squirt, sex, god, anal, im, body, ...","[sex, thing, god, thats, year, body, im, time,...","[sex, god, body, nope, thing, tarp, fun, date,...","[im, thing, god, sex, thats, body, year, new, ...","[sex, anal, god, im, body, thing, nope, new, t..."
2706,"[just, like, guy, na, base, day, wa, dick, gon...","[base, squirting, just, guy, dick, generation,...","[base, day, dick, wa, way, sex, woman, generat...","[base, dick, generation, virginity, day, boob,...","[base, day, dick, sex, wa, second, woman, beau...","[base, dick, generation, virginity, sex, day, ..."
2707,"[like, guy, thing, dick, know, gagging, make, ...","[gagging, gag, psychology, like, guy, trick, t...","[thing, youre, dick, whats, deal, psychology, ...","[psychology, thing, deal, kinky, gaggings, gel...","[thing, dick, trick, gon, big, youre, deal, gu...","[psychology, trick, thing, dick, deal, gagging..."
2708,"[thank, guy, selfie, just, special, know, love...","[thank, selfie, guy, special, cuties, dedicate...","[thank, youre, phone, feed, angeles, mode, let...","[cuties, selfies, thank, feed, selfie, mode, i...","[big, thank, special, selfie, angeles, hi, fuc...","[selfie, special, cuties, selfies, hbo, thank,..."


Since there are too many words that are common among all the transcript, we need to add them to the stop-word list and redo the text encoding again.

#### Extract all common words among the transcript

In [25]:
commons = []
for c in df.columns:
    words = []
    for r in df[c]:
        top = [word for word in r]
        for t in top:
            words.append(t)    
    commons.append(Counter(words).most_common())

In [26]:
common_words_list = []
for common in commons:
    new = [word for word, count in common if count > 6]
    common_words_list.append(np.array(new))
common_words_list

[array(['like', 'im', 'just', ..., 'josep', 'australian', 'learning'],
       dtype='<U14'),
 array(['like', 'wa', 'im', ..., 'crenshaw', 'malaysia', 'whitney'],
       dtype='<U14'),
 array(['im', 'youre', 'people', ..., 'kong', 'possibility', 'tiffany'],
       dtype='<U15'),
 array(['im', 'youre', 'people', ..., 'selfies', 'vest', 'pfft'],
       dtype='<U15'),
 array(['im', 'youre', 'thats', ..., 'possibility', 'neal', 'pete'],
       dtype='<U16'),
 array(['im', 'youre', 'people', ..., 'iron', 'melbourne', 'malaysia'],
       dtype='<U16')]

In [27]:
pickle.dump(common_words_list, open('./pickles_index/common_words_list_index.pkl', 'wb'))

#### Add the stop words to the original stop word list and redo the text encoding

In [28]:
stop_words_list = []
for common_words in common_words_list:
    stop_words = text.ENGLISH_STOP_WORDS.union(common_words)
    stop_words_list.append(stop_words)

In [29]:
for i, stop_words in enumerate(stop_words_list):
    if i % 2 == 0:
        # Document-Term Matrix
        df_dtm = dtm(corpus, stop_words, f'./pickles_index/index_corpus{int(i/2)}_cv.pkl', save_pickle=True)
        # Transpose back to comedians as index for topic modeling
        df_dtm.to_pickle(f'./pickles_index/index_corpus{int(i/2)}_dtm.pkl')
    else:
        # Tf-Idf Matrix
        df_tfidf = tf_idf(corpus, stop_words, f'./pickles_index/index_corpus{int((i-1)/2)}_tf_idf.pkl', save_pickle=True)  
        # Transpose back to comedians as index for topic modeling  
        df_tfidf.to_pickle(f'./pickles_index/index_corpus{int((i-1)/2)}_tim.pkl')

### Visualize the top words with WordCloud

In [None]:
# # 
# wc_list = []
# for stop_words in stop_words_list:
#     wc = WordCloud(stopwords=stop_words,
#                    background_color="white", 
#                    colormap="Dark2",
#                    max_font_size=150, 
#                    random_state=42)
#     wc_list.append(wc)

In [None]:
### the index should be 1 < num < 25
# plt.rcParams['figure.figsize'] = [30,10]

# index_now = clean_corpus.index

# for i, wc in enumerate(wc_list):
#     fig = plt.figure()
#     if i == 0 or i == 1:
#         corpus = clean_corpus
#     elif i == 2 or i == 3:
#         corpus = corpus_n
#     elif i == 4 or i == 5:
#         corpus = corpus_na
#     print(f"------------- Stop words {i} -------------")
#     # Create subplots for each transcript
#     for index, comedian in enumerate(index_now):
#         wc.generate(corpus.Transcript[index])
#         plt.subplot(4, 6, index+1)
#         plt.imshow(wc, interpolation="bilinear")
#         plt.axis("off")
#         plt.title(index_now[index])

#     plt.show()

# Sentiment Analysis

In [None]:
corpus_filename = './pickles_index/corpus_na_index.pkl'
clean_corpus = pd.read_pickle(corpus_filename)

In [None]:
pol = lambda x: tb(x).sentiment.polarity
sub = lambda x: tb(x).sentiment.subjectivity

data = pd.DataFrame()
data['polarity'] = clean_corpus['Transcript'].apply(pol)
data['subjectivity'] = clean_corpus['Transcript'].apply(sub)
data

In [None]:
plt.rcParams['figure.figsize'] = [10, 8]

for index, comedian in enumerate(data.index):
    x = data.polarity.loc[comedian]
    y = data.subjectivity.loc[comedian]
    plt.scatter(x, y, color='blue')
    plt.text(x+.001, y+.001, data.index[index], fontsize=10)
#     plt.xlim(-.01, .12) 
    
plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

# Topic Modeling

# Latent Dirichlet Allocation (LDA)

In [30]:
def train_LDA_model(data_matrix, vectorizer, num_topics=4, passes=3000):
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    lda = models.LdaModel(corpus=gensim_corpus, id2word=id2word, num_topics=num_topics, passes=passes)
    
    corpus_transformed = lda[gensim_corpus]
    result = list(zip([max(ct, key=lambda x: x[1])[0] for ct in corpus_transformed], data_matrix.columns))
    return lda, result

In [31]:
# Load all matrices
clean_corpus_dtm = pd.read_pickle('./pickles_index/index_corpus0_dtm.pkl')
clean_corpus_tim = pd.read_pickle('./pickles_index/index_corpus0_tim.pkl')
corpus_noun_dtm = pd.read_pickle('./pickles_index/index_corpus1_dtm.pkl')
corpus_noun_tim = pd.read_pickle('./pickles_index/index_corpus1_tim.pkl')
corpus_na_dtm = pd.read_pickle('./pickles_index/index_corpus2_dtm.pkl')
corpus_na_tim = pd.read_pickle('./pickles_index/index_corpus2_tim.pkl')

In [32]:
# Load all vectorizers
clean_corpus_cv = pickle.load(open('./pickles_index/index_corpus0_cv.pkl', 'rb'))
clean_corpus_tf_idf = pickle.load(open('./pickles_index/index_corpus0_tf_idf.pkl','rb'))
corpus_noun_cv = pickle.load(open('./pickles_index/index_corpus1_cv.pkl','rb'))
corpus_noun_tf_idf = pickle.load(open('./pickles_index/index_corpus1_tf_idf.pkl','rb'))
corpus_na_cv = pickle.load(open('./pickles_index/index_corpus2_cv.pkl','rb'))
corpus_na_tf_idf = pickle.load(open('./pickles_index/index_corpus2_tf_idf.pkl', 'rb'))

In [33]:
clean_corpus_lda, result1 = train_LDA_model(clean_corpus_dtm, clean_corpus_cv)
clean_corpus_lda.print_topics()

[(0,
  '0.002*"randy" + 0.002*"itll" + 0.002*"ryan" + 0.002*"blue" + 0.002*"parasite" + 0.002*"flag" + 0.002*"shrooms" + 0.002*"twin" + 0.002*"elevator" + 0.002*"naga"'),
 (1,
  '0.004*"pill" + 0.003*"ow" + 0.002*"gas" + 0.002*"mandela" + 0.002*"rumor" + 0.002*"cash" + 0.002*"captain" + 0.002*"nelson" + 0.002*"birth" + 0.001*"boot"'),
 (2,
  '0.008*"russian" + 0.003*"taco" + 0.003*"computer" + 0.002*"shitty" + 0.002*"wallet" + 0.002*"death" + 0.002*"doubt" + 0.002*"ci" + 0.001*"ebola" + 0.001*"clap"'),
 (3,
  '0.003*"dennys" + 0.002*"toy" + 0.002*"pm" + 0.002*"pain" + 0.002*"japan" + 0.002*"candy" + 0.002*"muy" + 0.002*"oprah" + 0.002*"yogurt" + 0.002*"peanut"')]

In [34]:
result1

[(1, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (1, 9),
 (3, 10),
 (2, 11),
 (0, 12),
 (2, 13),
 (2, 14),
 (2, 15),
 (2, 16),
 (1, 17),
 (0, 18),
 (0, 19),
 (0, 20),
 (2, 21),
 (1, 22),
 (1, 23),
 (0, 24),
 (2, 25),
 (2, 26),
 (2, 27),
 (1, 28),
 (2, 29),
 (2, 30),
 (2, 31),
 (1, 32),
 (2, 33),
 (1, 34),
 (1, 35),
 (2, 36),
 (2, 37),
 (1, 38),
 (2, 39),
 (1, 40),
 (1, 41),
 (0, 42),
 (3, 43),
 (3, 44),
 (2, 45),
 (2, 46),
 (0, 47),
 (0, 48),
 (1, 49),
 (3, 50),
 (2, 51),
 (0, 52),
 (3, 53),
 (2, 54),
 (3, 55),
 (2, 56),
 (0, 57),
 (2, 58),
 (1, 59),
 (2, 60),
 (1, 61),
 (3, 62),
 (0, 63),
 (2, 64),
 (0, 65),
 (2, 66),
 (2, 67),
 (0, 68),
 (1, 69),
 (3, 70),
 (2, 71),
 (1, 72),
 (0, 73),
 (0, 74),
 (3, 75),
 (0, 76),
 (1, 77),
 (1, 78),
 (2, 79),
 (2, 80),
 (3, 81),
 (1, 82),
 (1, 83),
 (1, 84),
 (0, 85),
 (1, 86),
 (3, 87),
 (0, 88),
 (3, 89),
 (0, 90),
 (2, 91),
 (1, 92),
 (1, 93),
 (1, 94),
 (2, 95),
 (0, 96),
 (2, 97),
 (3, 98),
 (2, 99),
 (0, 100),

In [35]:
corpus_noun_lda, result2 = train_LDA_model(corpus_noun_dtm, corpus_noun_cv)
corpus_noun_lda.print_topics()

[(0,
  '0.020*"little" + 0.009*"big" + 0.009*"white" + 0.009*"new" + 0.008*"bad" + 0.008*"black" + 0.008*"indian" + 0.007*"nice" + 0.007*"great" + 0.007*"different"'),
 (1,
  '0.015*"white" + 0.012*"little" + 0.012*"old" + 0.011*"black" + 0.009*"big" + 0.009*"great" + 0.009*"real" + 0.008*"new" + 0.007*"asian" + 0.006*"american"'),
 (2,
  '0.011*"una" + 0.003*"ci" + 0.002*"cowboy" + 0.002*"io" + 0.002*"twin" + 0.002*"vlad" + 0.002*"poi" + 0.002*"ogni" + 0.002*"prime" + 0.002*"persone"'),
 (3,
  '0.028*"black" + 0.011*"little" + 0.011*"bad" + 0.009*"white" + 0.007*"real" + 0.006*"hard" + 0.005*"new" + 0.005*"better" + 0.004*"big" + 0.004*"great"')]

In [36]:
result2

[(3, 0),
 (1, 1),
 (2, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (1, 10),
 (0, 11),
 (0, 12),
 (1, 13),
 (0, 14),
 (3, 15),
 (0, 16),
 (0, 17),
 (0, 18),
 (0, 19),
 (1, 20),
 (0, 21),
 (3, 22),
 (3, 23),
 (0, 24),
 (0, 25),
 (3, 26),
 (0, 27),
 (3, 28),
 (3, 29),
 (0, 30),
 (0, 31),
 (0, 32),
 (0, 33),
 (0, 34),
 (0, 35),
 (1, 36),
 (0, 37),
 (0, 38),
 (1, 39),
 (0, 40),
 (1, 41),
 (1, 42),
 (1, 43),
 (3, 44),
 (1, 45),
 (3, 46),
 (1, 47),
 (3, 48),
 (1, 49),
 (3, 50),
 (1, 51),
 (0, 52),
 (0, 53),
 (1, 54),
 (0, 55),
 (3, 56),
 (1, 57),
 (1, 58),
 (0, 59),
 (1, 60),
 (1, 61),
 (0, 62),
 (0, 63),
 (0, 64),
 (3, 65),
 (1, 66),
 (2, 67),
 (0, 68),
 (1, 69),
 (1, 70),
 (3, 71),
 (0, 72),
 (0, 73),
 (1, 74),
 (1, 75),
 (0, 76),
 (3, 77),
 (1, 78),
 (1, 79),
 (3, 80),
 (1, 81),
 (2, 82),
 (1, 83),
 (1, 84),
 (0, 85),
 (0, 86),
 (1, 87),
 (0, 88),
 (0, 89),
 (0, 90),
 (0, 91),
 (0, 92),
 (0, 93),
 (0, 94),
 (0, 95),
 (1, 96),
 (1, 97),
 (0, 98),
 (3, 99),
 (1, 100),

In [37]:
corpus_na_lda, result3 = train_LDA_model(corpus_na_dtm, corpus_na_cv)
corpus_na_lda.print_topics()

[(0,
  '0.003*"ow" + 0.002*"doubt" + 0.002*"randy" + 0.002*"bombay" + 0.002*"threat" + 0.002*"imma" + 0.001*"canada" + 0.001*"vince" + 0.001*"breakup" + 0.001*"shrooms"'),
 (1,
  '0.003*"dennys" + 0.002*"trap" + 0.002*"sperm" + 0.002*"wallet" + 0.002*"ebola" + 0.002*"passenger" + 0.002*"dan" + 0.002*"naga" + 0.001*"bien" + 0.001*"cow"'),
 (2,
  '0.003*"rumor" + 0.002*"namaste" + 0.002*"yogurt" + 0.002*"oprah" + 0.001*"golddiggers" + 0.001*"beyoncé" + 0.001*"hut" + 0.001*"clap" + 0.001*"vegetable" + 0.001*"hoodie"'),
 (3,
  '0.003*"mandela" + 0.002*"pull" + 0.002*"cowboy" + 0.002*"nelson" + 0.002*"island" + 0.002*"twin" + 0.002*"peanut" + 0.002*"motto" + 0.001*"barack" + 0.001*"metal"')]

In [38]:
result3

[(1, 0),
 (0, 1),
 (3, 2),
 (1, 3),
 (2, 4),
 (3, 5),
 (0, 6),
 (0, 7),
 (0, 8),
 (2, 9),
 (3, 10),
 (3, 11),
 (2, 12),
 (0, 13),
 (0, 14),
 (3, 15),
 (1, 16),
 (0, 17),
 (2, 18),
 (3, 19),
 (2, 20),
 (3, 21),
 (3, 22),
 (2, 23),
 (1, 24),
 (1, 25),
 (2, 26),
 (1, 27),
 (0, 28),
 (2, 29),
 (2, 30),
 (0, 31),
 (2, 32),
 (0, 33),
 (1, 34),
 (1, 35),
 (0, 36),
 (0, 37),
 (0, 38),
 (2, 39),
 (0, 40),
 (1, 41),
 (0, 42),
 (0, 43),
 (0, 44),
 (1, 45),
 (1, 46),
 (3, 47),
 (1, 48),
 (0, 49),
 (0, 50),
 (3, 51),
 (0, 52),
 (1, 53),
 (0, 54),
 (0, 55),
 (1, 56),
 (0, 57),
 (2, 58),
 (0, 59),
 (0, 60),
 (0, 61),
 (0, 62),
 (2, 63),
 (3, 64),
 (0, 65),
 (0, 66),
 (2, 67),
 (2, 68),
 (2, 69),
 (0, 70),
 (2, 71),
 (3, 72),
 (2, 73),
 (0, 74),
 (0, 75),
 (0, 76),
 (3, 77),
 (2, 78),
 (2, 79),
 (1, 80),
 (0, 81),
 (1, 82),
 (1, 83),
 (0, 84),
 (1, 85),
 (1, 86),
 (2, 87),
 (1, 88),
 (1, 89),
 (0, 90),
 (3, 91),
 (2, 92),
 (1, 93),
 (1, 94),
 (0, 95),
 (1, 96),
 (0, 97),
 (3, 98),
 (2, 99),
 (0, 100),

In [None]:
# clean_corpus_lda, result2 = train_LDA_model(clean_corpus_tim, clean_corpus_tf_idf)
# clean_corpus_lda.print_topics()

In [None]:
# result2

In [None]:
# corpus_noun_lda, result4 = train_LDA_model(corpus_noun_tim, corpus_noun_tf_idf)
# corpus_noun_lda.print_topics()

In [None]:
# result4

In [None]:
# corpus_na_lda, result6 = train_LDA_model(corpus_na_tim, corpus_na_tf_idf)
# corpus_na_lda.print_topics()

In [None]:
# result6

In [39]:
groups = []
group_df = pd.DataFrame(index=[0,1,2,3])
for i, result in enumerate([result1, result2, result3]):
    group = {}
    for r in result:
        if r[0] in group:
            group[r[0]].append(r[1])
        else:
            group[r[0]] = [r[1]]
    group_df[f'result_{i+1}'] = group.values()
    groups.append(group)

In [40]:
group_df.to_pickle('./pickles_index/group.pkl')
group_df.to_csv('./pickles_index/group.csv')

# Latent Semantic Analysis (LSA)

In [None]:
def train_LSA_model(data_matrix, vectorizer, num_topics=5):
    # df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(data_matrix)
    gensim_corpus = matutils.Sparse2Corpus(sparse_counts)
    id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())
    lsi = LsiModel(gensim_corpus, num_topics=num_topics, id2word=id2word)
    
    corpus_transformed = lsi[gensim_corpus]
    result = list(zip([max(ct, key=lambda x: x[1])[0] for ct in corpus_transformed], data_matrix.columns))
    return lsi, result

In [None]:
corpus_na_lsa, res = train_LSA_model(corpus_na_tim, corpus_na_tf_idf)
corpus_na_lsa.print_topics()

In [None]:
res

# Coherence Value

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              corpus : Gensim corpus
              texts : List of input texts
              stop : Max num of topics
    purpose : Compute c_v coherence for various number of topics
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        # generate LSA model
        model = LsiModel(doc_term_matrix, num_topics=number_of_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

# Vectorize Comedians

In [41]:
# Read the corpus
clean_corpus_index = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
clean_corpus = pd.read_pickle('./pickles_index/clean_corpus_index.pkl')
corpus_nouns = pd.read_pickle('./pickles_index/corpus_nouns_index.pkl')
corpus_na = pd.read_pickle('./pickles_index/corpus_na_index.pkl')

In [42]:
comedians = clean_corpus_index.Comedian
comedian_df = pd.DataFrame(comedians)
comedian_df

Unnamed: 0,Comedian
0,ramy youssef
1,ramy youssef
2,ramy youssef
3,ramy youssef
4,ramy youssef
...,...
2705,whitney cummings
2706,whitney cummings
2707,whitney cummings
2708,whitney cummings


In [43]:
def append_comedians(df):
    df['Comedians'] = comedians
    return df

# clean corpus already has the comedian column
# 
corpus_nouns['Comedians'] = comedians
corpus_na['Comedians'] = comedians

In [44]:
# Read the styles for each transcript
styles = pd.read_pickle('./pickles_index/group.pkl')
styles

Unnamed: 0,result_1,result_2,result_3
0,"[0, 9, 17, 22, 23, 28, 32, 34, 35, 38, 40, 41,...","[0, 3, 4, 5, 15, 22, 23, 26, 28, 29, 44, 46, 4...","[0, 3, 16, 24, 25, 27, 34, 35, 41, 45, 46, 48,..."
1,"[1, 2, 3, 11, 13, 14, 15, 16, 21, 25, 26, 27, ...","[1, 10, 13, 20, 36, 39, 41, 42, 43, 45, 47, 49...","[1, 6, 7, 8, 13, 14, 17, 28, 31, 33, 36, 37, 3..."
2,"[4, 5, 10, 43, 44, 50, 53, 55, 62, 70, 75, 81,...","[2, 67, 82, 123, 165, 179, 266, 314, 316, 320,...","[2, 5, 10, 11, 15, 19, 21, 22, 47, 51, 64, 72,..."
3,"[6, 7, 8, 12, 18, 19, 20, 24, 42, 47, 48, 52, ...","[6, 7, 8, 9, 11, 12, 14, 16, 17, 18, 19, 21, 2...","[4, 9, 12, 18, 20, 23, 26, 29, 30, 32, 39, 58,..."


In [45]:
def comedian2vec(groups, lookup_table):
    """
    :param groups: group should be a 2D matrix that contains transcript ids for each group.
    :param lookup_table: comedian-transcript_id table to look up which comedian an id is corresponeded to. This should have a "Comedian".
    """
    comedians = list(set(lookup_table.Comedian))
    comedian_vectors = {}
    for g_id, group in enumerate(groups):
        for i in group:
            com = lookup_table.iloc[i][0]
            if com not in comedian_vectors:
                comedian_vectors[com] = [0 for j in range(len(groups))]
            comedian_vectors[com][g_id] += 1
            
    for k, v in comedian_vectors.items():
        comedian_vectors[k] = np.divide(v, sum(v))        
            
    return comedian_vectors   

In [46]:
g1 = list(styles.result_1.values)
comedian2vec(g1, comedian_df)

{'ramy youssef': array([0.25641026, 0.46153846, 0.07692308, 0.20512821]),
 'andy woodhull': array([0.18518519, 0.33333333, 0.22222222, 0.25925926]),
 'amy schumer': array([0.26865672, 0.28358209, 0.1641791 , 0.28358209]),
 'aziz ansari': array([0.27118644, 0.3559322 , 0.18644068, 0.18644068]),
 'bert kreischer': array([0.34285714, 0.24285714, 0.15714286, 0.25714286]),
 'bill burr': array([0.23664122, 0.19847328, 0.2519084 , 0.3129771 ]),
 'chris rock': array([0.23717949, 0.23717949, 0.31410256, 0.21153846]),
 'dave chappelle': array([0.22972973, 0.21171171, 0.36936937, 0.18918919]),
 'eric andre': array([0.25490196, 0.23529412, 0.29411765, 0.21568627]),
 'george lopez': array([0.35714286, 0.17857143, 0.35714286, 0.10714286]),
 'hannah gadsby': array([0.32786885, 0.21311475, 0.13114754, 0.32786885]),
 'hasan minhaj': array([0.19354839, 0.38709677, 0.25806452, 0.16129032]),
 'iliza shlesinger': array([0.34042553, 0.20567376, 0.19858156, 0.25531915]),
 'jack whitehall': array([0.1875  , 0

In [47]:
g2 = list(styles.result_2.values)
comedian2vec(g2, comedian_df)

{'ramy youssef': array([0.25641026, 0.12820513, 0.02564103, 0.58974359]),
 'andy woodhull': array([0.22222222, 0.48148148, 0.        , 0.2962963 ]),
 'amy schumer': array([0.1641791 , 0.35820896, 0.04477612, 0.43283582]),
 'aziz ansari': array([0.10169492, 0.3559322 , 0.03389831, 0.50847458]),
 'bert kreischer': array([0.24285714, 0.34285714, 0.        , 0.41428571]),
 'bill burr': array([0.19847328, 0.35114504, 0.09923664, 0.35114504]),
 'chris rock': array([0.27564103, 0.37820513, 0.04487179, 0.30128205]),
 'dave chappelle': array([0.27477477, 0.18468468, 0.22522523, 0.31531532]),
 'eric andre': array([0.29411765, 0.35294118, 0.11764706, 0.23529412]),
 'george lopez': array([0.17857143, 0.28571429, 0.14285714, 0.39285714]),
 'hannah gadsby': array([0.1147541 , 0.14754098, 0.01639344, 0.72131148]),
 'hasan minhaj': array([0.27419355, 0.20967742, 0.01612903, 0.5       ]),
 'iliza shlesinger': array([0.14893617, 0.23404255, 0.04255319, 0.57446809]),
 'jack whitehall': array([0.125  , 0.

In [48]:
g3 = list(styles.result_3.values)
comedian2vec(g3, comedian_df)

{'ramy youssef': array([0.20512821, 0.33333333, 0.20512821, 0.25641026]),
 'andy woodhull': array([0.22222222, 0.55555556, 0.11111111, 0.11111111]),
 'amy schumer': array([0.31343284, 0.34328358, 0.13432836, 0.20895522]),
 'aziz ansari': array([0.23728814, 0.30508475, 0.3220339 , 0.13559322]),
 'bert kreischer': array([0.21428571, 0.3       , 0.25714286, 0.22857143]),
 'bill burr': array([0.24427481, 0.24427481, 0.29007634, 0.22137405]),
 'chris rock': array([0.25641026, 0.32051282, 0.19871795, 0.22435897]),
 'dave chappelle': array([0.3963964 , 0.18468468, 0.15765766, 0.26126126]),
 'eric andre': array([0.11764706, 0.29411765, 0.33333333, 0.25490196]),
 'george lopez': array([0.28571429, 0.28571429, 0.07142857, 0.35714286]),
 'hannah gadsby': array([0.26229508, 0.40983607, 0.14754098, 0.18032787]),
 'hasan minhaj': array([0.08064516, 0.29032258, 0.25806452, 0.37096774]),
 'iliza shlesinger': array([0.19858156, 0.31914894, 0.25531915, 0.22695035]),
 'jack whitehall': array([0.09375 , 0

# Keyword Extraction

In [86]:
def merge_transcripts(df, filename):
    """
    :param df: transcript/comedians dataframe.
    :param filename: the csv file to save for the comedian/keywords data
    """
    merged_df = df.groupby('Comedians').agg(lambda t: " ".join(t))
    r_df = pd.DataFrame(columns=['Comedian','Keywords'])
    for c in tqdm(merged_df.index, desc="Extracting..."):
        text = merged_df.loc[c][0]
        kw = keywords.keywords(text)
        r_df = r_df.append({'Comedian': c, "Keywords":", ".join(kw.split('\n'))}, ignore_index=True)
    r_df.to_excel(filename)       
    return r_df 

In [87]:
merge_transcripts(corpus_nouns, 'keywords_noun.xlsx')

Extracting...: 100%|███████████████████████████████████████████████████████████████████| 38/38 [03:08<00:00,  4.97s/it]


Unnamed: 0,Comedian,Keywords
0,amy schumer,"thats thing, shes, kind, theyre thank, woman, pregnancy youre bump picture guy, life im year cigarette mouth, dont, god, ill time, people hand, th..."
1,andy woodhull,"im, thats, time wife wa, home, youre, dont, day, shes thing, stuff kid, gon, snake, house, year life experience, people daughter, way, bird, backy..."
2,aziz ansari,"guy, thing, im, ims, youre, time, cause, kid, wa, kind, dude, thats situation people youve stuff video, gon, shit, girl, man egg, shes hey, person..."
3,bert kreischer,"im, wife, shes, youre, fuck, fucking, dad, wa, guy, thats, day woman, shit, yeah, kid counter, line time, mom, gun, night, gon, georgia, hand, han..."
4,bill burr,"di, im, fucking, right, youre, person fuck, la, guy, colore e le, è stato, shit, time, non penso, che hanno dovuto, gon, yeah, man, woman, dont, h..."
5,chris rock,"fuck, fucking, man drug people thats, woman, im, motherfucking, time, gon, kid, thing, witness hand shit, l, house, dont, money, motherfucker jail..."
6,dave chappelle,"la, wa, man, time, timing, motherfucker, motherfucking, motherfuck, motherfuckers, pero, por, fuck, se, bien ser negro y que te, dont, shit chappe..."
7,eric andre,"man, im, time, wa, fucking, game guy, dude, thats drug, okay, parent, parents, dont look, dad, yeah, shit weed, minute, gon, mind mom jew house, c..."
8,george lopez,"fucking, youre, car pero fuck, shit, kid time, day, people country latino, thats anybody, year, doctor, wa, theyre, life, hey, dog, im, somebody, ..."
9,hannah gadsby,"thing, woman, joke, thats, story, man, bit work im, way people, dont, men, lot, right youre, fucking, trauma wa, world, ha fuck, life, tension, co..."


In [88]:
merge_transcripts(corpus_na, 'keywords_na.xlsx')

Extracting...: 100%|███████████████████████████████████████████████████████████████████| 38/38 [05:18<00:00,  8.39s/it]


Unnamed: 0,Comedian,Keywords
0,amy schumer,"guy, theyre, thats thing, like, shes, right people hand, woman, okay, youre little bump picture, life im year old cigarette mouth, time, thank, th..."
1,andy woodhull,"im, timely, youre, thats, dont, time wife wa, home, snake, stuff kid, shes thing, gon, day, people daughter, dog, house, real, bird, girl theyre s..."
2,aziz ansari,"ims, guy, thing, time, im dumb, thats situation, youre god, little kid, wa, shit, people youve stuff, kind, right kitchen, shes hey, girl, cause d..."
3,bert kreischer,"im, wife, shes, youre, fucking, dad, black kid, thats, mom, shit, day woman, right, ilas, yeah, wrong fuck, theyre, guy, gun, look, hey, girl, han..."
4,bill burr,"di, right, fucked, im, youre, goddamn fucking, dumb fuck, la, fuckin, guy, non penso, colore e le, è stato, dick shit, che hanno dovuto, una cosa,..."
5,chris rock,"fucking, fuck, thats, im, woman, man drug people, motherfucking, motherfuck, world mean shit, dont, good, white, time, youre right hour, l, thing,..."
6,dave chappelle,"la, el, una, unas, y que te, wa, shit, man, motherfucker, motherfucking, motherfuck, motherfuckers, pero, fuck, los, timing, hi im, nigga, al, you..."
7,eric andre,"im, man, right, fucking, wa, okay, youre, time, coke fuck, little, entire, dude, game guy, shit weed, parent, parents, dad, dont look, oh marijuan..."
8,george lopez,"fuck, fucking, pathway panocha youre, people country latino, im, day, hey, right, doctor, thats anybody, kid, dont u listen, theyre, shit little, ..."
9,hannah gadsby,"im, ive, thing, woman, youre, wa, fucking, story, joke, men, right, way people, lot, bit work, ha fuck, laughter time lady, yes thats, world, lesb..."
