In [1]:
# Basic functionalities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

In [2]:
# Web crawling
import requests
from bs4 import BeautifulSoup
import pickle

In [1]:
# Text Processing
from nltk import word_tokenize, pos_tag
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# Retrieve Data

## Data from google drive

In [4]:
df = pd.read_excel('DataSource.xlsx')
# remove the "who" and sort the table with "comedian"
df = df.drop('Who?', axis=1)
df = df.sort_values('Comedian')
df.head()

Unnamed: 0,Comedian,Title,Link of the transcript,Transcript,Video link
19,Amy Schumer,Porn Endings,https://scrapsfromtheloft.com/2017/09/08/amy-s...,"Well, I like to watch porn. Like, what girls i...",https://www.youtube.com/watch?v=EmzPp33kl7o
28,Arsenio Hall,Smart and Classy (2019),https://scrapsfromtheloft.com/2019/10/30/arsen...,Everything’s changing. This has changed a lot....,https://www.youtube.com/watch?v=c_KmV-hquvc&ab...
16,Aziz Ansari,So Sorry You Had a Baby,https://scrapsfromtheloft.com/2017/12/07/aziz-...,"I, uh, turned 30 years old this year. Yes, it’...",https://www.youtube.com/watch?v=lWA-xsAVzgE
38,CHRIS ROCK,TAMBORINE (2018),https://scrapsfromtheloft.com/2018/02/15/chris...,"Oh, man, the older you get, the more shit you ...",https://www.youtube.com/watch?v=dUzaSm0aAKw&ab...
37,CHRIS ROCK,TAMBORINE (2018),https://scrapsfromtheloft.com/2018/02/15/chris...,"Nah, man. But I love religion. I love watching...",https://www.youtube.com/watch?v=5bbKnj1-mR0&ab...


In [5]:
df.describe()

Unnamed: 0,Comedian,Title,Link of the transcript,Transcript,Video link
count,40,40,39,40,40
unique,23,35,31,40,40
top,Sebastian Maniscalco,TAMBORINE (2018),https://scrapsfromtheloft.com/2019/01/24/sebas...,I gotta go to a body shop. You ever go there? ...,https://www.youtube.com/watch?v=A1OKxp91B2M
freq,5,2,3,1,1


In [6]:
# See how many comedians we've got along with the frequency.
gdf = df.groupby('Comedian', sort=True).agg(['count']).Title.sort_values('count', ascending=False)
gdf.T

Comedian,Sebastian Maniscalco,Kevin Hart,Trevor Noah,Whitney Cummings,Vir Das,CHRIS ROCK,Hasan Minhaj,JACK WHITEHALL,JO KOY,Jimmy O. Yang,...,Neal Brennan,Amy Schumer,Michael McIntyre,MICHAEL CHE,Arsenio Hall,Jo Koy,Dave Chappelle,Chris Rock,Aziz Ansari,Joe Rogan
count,5,4,3,2,2,2,2,2,2,2,...,1,1,1,1,1,1,1,1,1,1


#### Isolate the transcripts

In [7]:
pd.set_option('max_colwidth',150)
corpus = df[['Comedian','Transcript']].set_index('Comedian')
corpus

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
Amy Schumer,"Well, I like to watch porn. Like, what girls in here like porn? Thank you. I love it. All the… all the whores are in the front. This is the best. ..."
Arsenio Hall,"Everything’s changing. This has changed a lot. This is my favorite thing in the world. My woman hates this. My phone. She’s like, “You’re always o..."
Aziz Ansari,"I, uh, turned 30 years old this year. Yes, it’s been a good year. I have found this year to be the year where a lot of my friends are getting seri..."
CHRIS ROCK,"Oh, man, the older you get, the more shit you learn. One thing… the gangster-est shit in the world. You learn nothing more gangster, nothing smart..."
CHRIS ROCK,"Nah, man. But I love religion. I love watching religion, studying religion. The cool thing about religion is, like, no matter who you pray to, no ..."
Chris Rock,"See, relationships are hard. But in order for any relationship to work, both people have to be on the same page. Both people have to have the same..."
Dave Chappelle,Don’t forget who I am. Don’t forget what I am. I am a black dude. And don’t ever forget how I got here. My ancestors were kidnapped. I don’t even ...
Hasan Minhaj,Have you seen the show called The Slap? This is a real show on NBC. This is a real show about a white kid that gets slapped at a birthday party. A...
Hasan Minhaj,"What's up, Montreal? How are you guys? Yeah, oh, man, thank you. Thank you. It's been a big year for me, you guys. Major, major announcement I wan..."
JACK WHITEHALL,"It wasn’t even the worst sign that I saw. The worst sign that I saw was in the same hotel, but it was by the pool. I went down to the hotel swimmi..."


#### Clean the data

In [8]:
def apply_data_cleansing(text):
    # Lowercase every words
    text = text.lower()
    # Remove every words with [blah blah blah] format
    text = re.sub('\[.*?\]', '', text)
    # Remove every words with (blah blah blah) format
    text = re.sub('\(.*?\)', '', text)
    # Get rid of the punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Get rid of all the numbers or words that contain numbers
    text = re.sub('\w*\d\w*', '', text)
    # Get rid of these specific punctuations
    text = re.sub('[‘’“”…]', '', text)
    # Get rid of '\n'
    text = re.sub('\n', '', text)
    return text

In [9]:
clean_corpus = pd.DataFrame(corpus.Transcript.apply(lambda x: apply_data_cleansing(x)))
clean_corpus

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
Amy Schumer,well i like to watch porn like what girls in here like porn thank you i love it all the all the whores are in the front this is the best girls in ...
Arsenio Hall,everythings changing this has changed a lot this is my favorite thing in the world my woman hates this my phone shes like youre always on that dam...
Aziz Ansari,i uh turned years old this year yes its been a good year i have found this year to be the year where a lot of my friends are getting serious abou...
CHRIS ROCK,oh man the older you get the more shit you learn one thing the gangsterest shit in the world you learn nothing more gangster nothing smarter nothi...
CHRIS ROCK,nah man but i love religion i love watching religion studying religion the cool thing about religion is like no matter who you pray to no matter w...
Chris Rock,see relationships are hard but in order for any relationship to work both people have to be on the same page both people have to have the same foc...
Dave Chappelle,dont forget who i am dont forget what i am i am a black dude and dont ever forget how i got here my ancestors were kidnapped i dont even know wher...
Hasan Minhaj,have you seen the show called the slap this is a real show on nbc this is a real show about a white kid that gets slapped at a birthday party are ...
Hasan Minhaj,whats up montreal how are you guys yeah oh man thank you thank you its been a big year for me you guys major major announcement i want to share wi...
JACK WHITEHALL,it wasnt even the worst sign that i saw the worst sign that i saw was in the same hotel but it was by the pool i went down to the hotel swimming p...


In [10]:
# After cleaing the data, merge all the transcripts of the same comedian
clean_corpus_with_unique_comdian = clean_corpus.groupby('Comedian').agg(lambda x: " ".join(x))
clean_corpus_with_unique_comdian

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
Amy Schumer,well i like to watch porn like what girls in here like porn thank you i love it all the all the whores are in the front this is the best girls in ...
Arsenio Hall,everythings changing this has changed a lot this is my favorite thing in the world my woman hates this my phone shes like youre always on that dam...
Aziz Ansari,i uh turned years old this year yes its been a good year i have found this year to be the year where a lot of my friends are getting serious abou...
CHRIS ROCK,oh man the older you get the more shit you learn one thing the gangsterest shit in the world you learn nothing more gangster nothing smarter nothi...
Chris Rock,see relationships are hard but in order for any relationship to work both people have to be on the same page both people have to have the same foc...
Dave Chappelle,dont forget who i am dont forget what i am i am a black dude and dont ever forget how i got here my ancestors were kidnapped i dont even know wher...
Hasan Minhaj,have you seen the show called the slap this is a real show on nbc this is a real show about a white kid that gets slapped at a birthday party are ...
JACK WHITEHALL,it wasnt even the worst sign that i saw the worst sign that i saw was in the same hotel but it was by the pool i went down to the hotel swimming p...
JO KOY,people in hawaii brag about shit that no one else brags about thats real cause you cher you cherish the things that you have its yours i love thi...
Jimmy O. Yang,representation matters man a lot of asian people come up to me very proud very nice theyre like jimmy thank you for represent the asians man im li...


In [11]:
# Pickles the result for later usage
clean_corpus_with_unique_comdian.to_pickle('./pickles/clean_corpus.pkl')

#### Retrieve only NOUNS of all transcripts

In [12]:
def nouns(text):
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [13]:
corpus = pd.read_pickle('./pickles/clean_corpus.pkl')

In [14]:
# Apply the nouns function to the transcripts to filter only on nouns
corpus_nouns = pd.DataFrame(corpus.Transcript.apply(nouns))
corpus_nouns

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
Amy Schumer,porn girls thank whores front girls back books shades reference time show people hack oh god references everyones time editing room i porn i direc...
Arsenio Hall,everythings lot thing world woman phone shes youre phone everything bills flashlight photo album camera right everything phone somebody call phone...
Aziz Ansari,i years year year i year year lot friends lives things aziz baby i reaction i ha ha ha ha ha werent condoms birth control youre gon care thing tal...
CHRIS ROCK,man thing shit world nothing gangster nothing nothing housewife housewives motherfuckers people woman suckers thats people people thats thats hous...
Chris Rock,relationships order relationship people page people focus page focus order relationship people focus focus shes shes fellas morning mirror fuck ho...
Dave Chappelle,forget forget dude dont ancestors i fuck im bottom boats humanity people beasts burdens work irony hundreds years war freed ourselves reconstructi...
Hasan Minhaj,show slap show nbc show kid birthday party episodes kid kids brown birthday party kid birthday point biju birthday cardiologists bees game kid bee...
JACK WHITEHALL,sign i sign i hotel pool i hotel pool refreshing dip hotel pool i sign ladies gentlemen tracks core hotel pool diarrhea swimming pool sign someone...
JO KOY,people brag shit one thats cause things yours mine brother i guy walk hey brother brand tacoma cherry cherry bro brand toyota tacoma youre paradis...
Jimmy O. Yang,representation matters man lot people theyre jimmy thank asians man youre welcome choice asians couldnt day fuck nigerians asians time theres pres...


In [15]:
corpus_nouns.to_pickle('./pickles/corpus_nouns.pkl')

#### Retrieve NOUNS and ADJECTIVES of all transcripts

In [16]:
def nouns_adj(text):
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
    return ' '.join(nouns_adj)

In [17]:
corpus = pd.read_pickle('./pickles/clean_corpus.pkl')

In [18]:
# Apply the nouns function to the transcripts to filter only on nouns
corpus_na = pd.DataFrame(corpus.Transcript.apply(nouns_adj))
corpus_na

Unnamed: 0_level_0,Transcript
Comedian,Unnamed: 1_level_1
Amy Schumer,porn girls porn thank whores front best girls back books shades reference time show people hack oh god references sandusky everyones time editing ...
Arsenio Hall,everythings lot favorite thing world woman phone shes youre phone everything i bills flashlight photo album camera right everything phone many mad...
Aziz Ansari,i years old year good year i year year lot friends serious lives serious things aziz baby i same reaction i ha ha ha ha ha ha sorry werent condoms...
CHRIS ROCK,man older more shit thing gangsterest shit world nothing gangster nothing nothing powerful housewife housewives smartest motherfuckers earth peopl...
Chris Rock,relationships hard order relationship people same page people same focus page focus order relationship people same focus focus shes fellas shes fe...
Dave Chappelle,dont forget dont forget i black dude dont i ancestors i fuck im bottom boats atlantic many strongest humanity people beasts burdens work irony hun...
Hasan Minhaj,show slap real show nbc real show white kid birthday party thirteen episodes kid brown kids brown birthday party kid birthday point ah biju birthd...
JACK WHITEHALL,worst sign i worst sign i same hotel pool i hotel pool lovely refreshing dip hotel pool i sign ladies gentlemen tracks core hotel pool active diar...
JO KOY,people hawaii brag shit one thats real cause things yours mine brother i guy walk hey brother brand new toyota tacoma cherry cherry bro brand new ...
Jimmy O. Yang,representation matters man lot asian people proud nice theyre jimmy thank asians man eh youre welcome choice asian asians i couldnt day fuck niger...


In [19]:
corpus_na.to_pickle('./pickles/corpus_na.pkl')

## All transcript data from https://scrapsfromtheloft.com/stand-up-comedy-scripts/

In [20]:
# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text