In [2]:
import numpy as np
import pandas as pd
import nltk
from textblob import TextBlob
import re
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Chloe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_csv('transcripts.csv')
df

Unnamed: 0.1,Unnamed: 0,Comedian,Date,Title,Subtitle,Transcript
0,0,Chris Rock,"March 8, 2023",Selective Outrage (2023) | Transcript,,[slow instrumental music playing] [funk drums ...
1,1,Marc Maron,"March 3, 2023",Thinky Pain (2013) | Transcript,Marc Maron returns to his old stomping grounds...,[siren wailing] I don’t know what you were thi...
2,2,Chelsea Handler,"March 3, 2023",Evolution (2020) | Transcript,Chelsea Handler is back and better than ever -...,Join me in welcoming the author of six number ...
3,3,Tom Papa,"March 3, 2023",What A Day! (2022) | Transcript,"Follows Papa as he shares about parenting, his...","Premiered on December 13, 2022 Ladies and gent..."
4,4,Jim Jefferies,"February 22, 2023",High n’ Dry (2023) | Transcript,Jim Jefferies is back and no topic is off limi...,"Please welcome to the stage, Jim Jefferies! He..."
...,...,...,...,...,...,...
410,410,George Carlin,"April 6, 2017",Jamming in New York (1992) – Transcript,,Jammin’ in New York is George Carlin’s 14th al...
411,411,,"April 1, 2017",Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Australian comedian Jim Jefferies breaks down ...
412,412,Reggie Watts,"March 30, 2017",Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,"Hello, I’m Thomas. I’m so glad to meet you Mum..."
413,413,GEORGE CARLIN,"March 29, 2017",COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",Complaints and Grievances is a HBO stand-up sp...


## Extract sentences and actions from each transcript

In [4]:
sent_rows = []

#function to extract each sentence of each transcript
def extract_sent(row):
    text = row['Transcript']
    title = row['Title']
    subtitle = row['Subtitle']
    comedian = row['Comedian']
    
    sentences = re.split('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', text)
    actions = re.findall('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', text)
    
    #only keep transcripts with actions
    if len(sentences) > 1 and len(actions) > 0:
        #account for consecutive actions
        sentences1 = []
        actions1 = []
        for a in range(0, len(actions)):
            pre_text = sentences[a]
            if pre_text == '' or pre_text == ' ':
                #concatenate current action with latest action in actions1 unless index 0
                if a == 0:
                    actions1.append(actions[a])
                    sentences1.append(pre_text)
                else:
                    concat_act = actions1.pop() + ' ' + actions[a]
                    actions1.append(concat_act)
            else:
                sentences1.append(pre_text)
                actions1.append(actions[a])
        
        #account for non-empty last element in sentences that doesn't have an action after
        if len(re.findall('\w+', sentences[-1])) > 0:
            sentences1.append(sentences[-1])
            actions1.append('NA')
                
        #add to list of dataframe rows
        for i in range(0, len(sentences1)):
            sent_row = [comedian, title, subtitle, sentences1[i], actions1[i]]
            sent_rows.append(sent_row)

df.apply(extract_sent, axis = 1)

sent_df = pd.DataFrame(sent_rows, columns=['Comedian', 'Title', 'Subtitle', 'Transcript Part', 'Action'])
sent_df

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
0,Chris Rock,Selective Outrage (2023) | Transcript,,,[slow instrumental music playing] [funk drums ...
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
...,...,...,...,...,...
25676,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",Complaints and Grievances is a HBO stand-up sp...,(Things That Come Off of Your Body)
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,
25678,George Carlin,It’s Bad For Ya (2008) Full transcript,"Full transcript of It's Bad for Ya, final HBO ...","Full transcript of It’s Bad for Ya, final HBO ...",(sound)
25679,George Carlin,It’s Bad For Ya (2008) Full transcript,"Full transcript of It's Bad for Ya, final HBO ...",. What I am is an old fuck. It’s kind of like ...,(Fart sound)


## Filter actions for audience reactions only

In [5]:
unique_actions = list(sent_df['Action'].unique())
unique_actions

['[slow instrumental music playing] [funk drums playing] [indistinct chatter] [man]',
 '[hip-hop music playing] [audience cheering] [Chris Rock]',
 '[audience laughing] [audience cheering] [hip-hop music playing] [female announcer]',
 '[audience cheering] [audience continue cheering] [Chris Rock]',
 '[audience cheers loudly]',
 '[audience cheering]',
 '[man shouting unintelligibly]',
 '[audience laughing]',
 '[audience applauding]',
 '[audience laughing and applauding] [imperceptible]',
 '[chuckling]',
 '[man whooping]',
 '[chuckles]',
 '[laughs]',
 '[sputters]',
 '[audience cheer in agreement]',
 '[high-pitch voice]',
 '[normal voice]',
 '[audience]',
 '[audience laughs]',
 '[audience cheering and whistling]',
 '[audience laughing and applauding]',
 '[groans]',
 '[voice breaking]',
 '[man whoops]',
 '[guffaws]',
 '[scattered laughs]',
 '[exclaims]',
 '[yelps in pain]',
 '[audience laughing and applauding] [man]',
 '[crying]',
 '[imitating a woman]',
 '[chuckling] [man whoops]',
 '[scr

In [6]:
len(unique_actions)

5764

In [7]:
aud_unique_actions = []

#only keep actions that contain 'audience' or 'crowd', and the 'NA' action (which is neutral)
for act in unique_actions:
    lower_act = act.lower()
    if 'audience' in lower_act or 'crowd' in lower_act or act == 'NA':
        aud_unique_actions.append(act)

aud_unique_actions

['[hip-hop music playing] [audience cheering] [Chris Rock]',
 '[audience laughing] [audience cheering] [hip-hop music playing] [female announcer]',
 '[audience cheering] [audience continue cheering] [Chris Rock]',
 '[audience cheers loudly]',
 '[audience cheering]',
 '[audience laughing]',
 '[audience applauding]',
 '[audience laughing and applauding] [imperceptible]',
 '[audience cheer in agreement]',
 '[audience]',
 '[audience laughs]',
 '[audience cheering and whistling]',
 '[audience laughing and applauding]',
 '[audience laughing and applauding] [man]',
 '[slams mic] [audience cheering and applauding] [hip-hop music playing] [audience continue cheering]',
 'NA',
 '[woman groans in audience]',
 '[audience exclaims in agreement] [audience exclaims in disagreement] [laughter]',
 '[audience member]',
 '[crowd cheers]',
 '[audience members]',
 '[audience applauds]',
 '[upbeat music playing] [crowd cheering]',
 '[upbeat music plays] [audience cheering] [announcer]',
 '[indistinct chatte

In [8]:
len(aud_unique_actions)

1051

In [9]:
#filter sent_df for rows whose action is in aud_unique_actions
sent_df1 = sent_df[sent_df['Action'].isin(aud_unique_actions)]
sent_df1

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Filter for non-empty transcript parts only

In [10]:
#get rows with empty transcript parts to check that actions are not relevant
empty_trans = sent_df1[(sent_df1['Transcript Part'] == '')|(sent_df1['Transcript Part'] == ' ')]
list(empty_trans.loc[:, 'Action'])

['[upbeat music playing] [crowd cheering]',
 '[upbeat music plays] [audience cheering] [announcer]',
 '[gentle music playing] [audience applauding] [audience cheering]',
 '[audience cheering and applauding] [announcer]',
 '[audience murmuring] [murmuring continues] [audience clapping in unison] [sudden silence] [vinyl pops]',
 '[hip-hop music playing] [audience cheering and applauding] [music stops]',
 '(Chattering) (music playing) (audience cheering)',
 '[audience chattering indistinctly] [man]',
 '[heartbeat] [indistinct chatter] [atmospheric whooshing] [audience cheering]',
 '[crowd chanting]',
 '(crowd murmurs)',
 '[indistinct chattering] [faint laughter] [audience cheering and applauding]',
 '(AUDIENCE CHEERING)',
 '[audience cheering] [announcer]',
 '[Audience cheering] (DL Hughley enters from stage left with his head cocked and grabs the microphone)',
 '[upbeat music playing] [exhaling] [indistinct conversations] [crowd cheering] [inaudible] [audience cheering uproariously]',
 '

In [11]:
len(empty_trans)

26

In [12]:
#list of indexes where actions with empty transcripts are NOT found at the beginning of the transcripts
non_beginning_actions = []

#function to check that actions with empty transcripts are found at the beginning of the transcripts
def find_non_beginning_actions(row):
    #find full transcript from df
    full_trans = list(df[df['Title'] == row['Title']]['Transcript'])[0]
    
    start_index = full_trans.index(row['Action'])
    if start_index > 0:
        non_beginning_actions.append(row.index)

empty_trans.apply(find_non_beginning_actions, axis = 1)
len(non_beginning_actions)

0

Since length of the 'non_beginning_actions' list is 0, it means that all the actions with empty transcript parts are indeed found at the beginning of their respective transcripts. Hence, there are no words spoken by the comedian before these actions and we may remove these rows from the sentence dataframe as they do not provide any insights into what are the words spoken by a comedian that lead to an audience reaction after.

In [13]:
#remove rows in empty_trans from the sentence dataframe
sent_df2 = sent_df1.drop(list(empty_trans.index))
sent_df2

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Further filtering for audience reactions within each action row

In [14]:
#list of valid audience action parts that are hard to capture using pos tags later
valid_aud_parts = ['[scattered applause in audience]', '[applause from the audience]', '[scattered cheers from crowd]', '(Audience and Hughlely laugh)', '(Hughley pauses and laughs with audience)', '(Audience and Hughley laugh)', '(Audience lightly laughs)']

#list of action parts containing 'audience' or 'crowd' but are not valid audience reactions
nonaud_act_parts = []

#function to remove the non-audience parts of each row's action, eg: [hip-hop music playing], and only keep audience parts
def get_aud_reaction(act):
    if act != 'NA':
        #get all the bracketed parts into a list
        act_parts = re.findall('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', act)
        aud_act_parts = []
        
        #only keep audience parts
        for p in act_parts:
            #check whether p is found in valid_aud_parts
            if p in valid_aud_parts:
                aud_act_parts.append(p)
                continue
            p_lower = p.lower()
            if 'audience' in p_lower or 'crowd' in p_lower:
                p_blob = TextBlob(p_lower[1:-1])
                p_tags = p_blob.tags
                for i in range(0, len(p_tags)):
                    token, tag = p_tags[i]
                    if token == 'audience' or token == 'crowd':
                        if i < len(p_tags) - 1:
                            next_tag = p_tags[i + 1][1]
                            if 'VB' in next_tag or 'NN' in next_tag:
                                aud_act_parts.append(p)
                                break
                            else:
                                nonaud_act_parts.append(p)
                        else:
                            nonaud_act_parts.append(p)
        
        new_act = ' '.join(aud_act_parts)
        if new_act == '':
            new_act = 'to remove row'
        return new_act
    return 'NA'

new_actions = sent_df2['Action'].apply(get_aud_reaction)

for x in new_actions.unique():
    print(x)

[audience cheering]
[audience laughing] [audience cheering]
[audience cheering] [audience continue cheering]
[audience cheers loudly]
[audience laughing]
[audience applauding]
[audience laughing and applauding]
[audience cheer in agreement]
to remove row
[audience laughs]
[audience cheering and whistling]
[audience cheering and applauding] [audience continue cheering]
NA
[audience exclaims in agreement] [audience exclaims in disagreement]
[audience member]
[crowd cheers]
[audience members]
[audience applauds]
[crowd cheering]
[crowd laughing]
[crowd continues laughing]
[voice in crowd whoops] [crowd clapping]
[woman in crowd screams]
[crowd laughing] [crowd clapping]
[crowd laughing] [crowd continues laughing]
[crowd clapping]
[crowd laughing] [a voice in crowd cackling]
[crowd laughing] [female voice in crowd laughing]
[crowd murmur in laughter]
[crowd laughing] [voice in crowd laughing]
[crowd laugh]
[a few voices in crowd laugh]
[crowd laughing] [female voice in crowd cackling] [cro

In [15]:
for x in list(set(nonaud_act_parts)):
    print(x)

[points to the side and then dramatically moves point towards the audience]
[whistling from audience]
[she looks into the audience]
[sympathy from audience]
[man in the crowd]
[women in audience]
[mumbles to the crowd while gesturing to the camera]
[turns on his heels toward audience and holds up a finger]
[John freezes his pose for a moment while waiting for audience to finish laughing]
[voices in crowd]
(man from audience)
[crowd]
[she points into the audience]
[audience]
[whistling in crowd]
(Audience)
[returns to normal demeanor and turns toward audience]
[to the single people in the audience]
(Hughley points at another person in the audience)
[turns towards audience with dramatically serious expression and tone of voice]
[voice whoops in the crowd]
(Widens his eyes and rotates his body away from audience)
[loud whoop from the audience]
[man from audience]
[speaking to someone in the audience]
[woman in audience]
[stands up straight and looks into the audience with a confident expr

In [16]:
#new dataframe with new 'Action' column
sent_df3 = sent_df2.copy()
sent_df3['Action'] = new_actions

#filter sent_df3 for rows where 'Action' != 'to remove row'
sent_df3 = sent_df3[sent_df3['Action'] != 'to remove row']
sent_df3

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering]
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering]
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering]
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Text preprocessing for each transcript part

In [17]:
import string
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

#function from utils
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN

#function from utils
def pos_then_lemmatize(pos_tagged_words):
    lemmatizer = WordNetLemmatizer()

    res = []
    for pos in pos_tagged_words : 
        word = pos[0]
        pos_tag = pos[1]

        lem = lemmatizer.lemmatize(word, get_wordnet_pos(pos_tag))
        res.append(lem)
    return res

#function to preprocess transcript parts [heavily adapted from custom_tokenizer_stop(doc) in utils, and added digit removal]
def preprocess(doc):
    punct = string.punctuation + "’“”–"
    #remove punctuation
    doc = doc.translate(str.maketrans(punct, " " * len(punct)))

    #remove digits
    doc = ''.join([i for i in doc if not i.isdigit()])

    #make doc lowercase and tokenize to words
    words = word_tokenize(doc.lower())
    
    #add our own stop word list to the existing English stop words 
    stop_words = stopwords.words("english") + [
        "get", "go", "know", "dont", "im", "like", "say", "thats", "one", "come", "right", "think", "youre", 
        "people", "see", "look", "want", "time", "make", "na", "gon", "thing", "oh", "take", "good", "guy", 
        "fuck", "would", "yeah", "tell", "well", "he", "shit", "cause", "back", "theyre", "man", "really", "cant", "little",
        "let", "just", "okay", "ive", "♪", "–", "ta", "uh", "wan", "g", "e", "ah", "r", "mi", "le"
    ]
    
    #remove stopwords before lemmatization
    filtered_words = [w.strip() for w in words if not w in stop_words] 

    #pos tag words correctly and lemmatize them according to their corrected pos tag
    pos_tagged_words = pos_tag(filtered_words)
    pos_lemmatized_words = pos_then_lemmatize(pos_tagged_words)

    #remove stopwords after lemmatization
    filtered_words_2 = [w for w in pos_lemmatized_words if not w in stop_words] 

    #join list of words back into string
    final_text = ' '.join(filtered_words_2)

    return final_text




In [18]:
#preprocess transcript part of each row to form new 'Processed Transcript' column
sent_df3['Processed Transcript'] = sent_df3['Transcript Part'].apply(preprocess)
sent_df3

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering],
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],anything bitch paint house need death penalty ...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much netfli...
...,...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun sell gun wrap talk anymore now… agree bear...
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,hell lot sense musket government drone bring g...
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,jump love jump feel touch jump taste kiss nigh...
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material give u chance bond am...


In [19]:
#remove rows with empty 'Processed Transcript' column
sent_df3 = sent_df3[sent_df3['Processed Transcript'] != '']
sent_df3

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],anything bitch paint house need death penalty ...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much netfli...
7,Chris Rock,Selective Outrage (2023) | Transcript,,"I’mma try… N*gga, sit down! I’mma try to do a...",[audience laughing],mma try… n gga sit mma try show tonight withou...
...,...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun sell gun wrap talk anymore now… agree bear...
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,hell lot sense musket government drone bring g...
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,jump love jump feel touch jump taste kiss nigh...
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material give u chance bond am...


## Processing and labelling actions

In [20]:
action_dict = {}

#get dictionary of audience reactions
def fill_dict(a):
    blob = TextBlob(a)
    blob_tags = blob.tags
    for token, tag in blob_tags:
        if tag == 'NNS' or tag == 'NN' or 'VB' in tag:
            if token in action_dict:
                action_dict[token] += 1
            else:
                action_dict[token] = 1
                
sent_df3['Action'].apply(fill_dict)
action_dict

{'audience': 5589,
 'laughing': 2857,
 'cheering': 547,
 ']': 5862,
 'continue': 3,
 'cheers': 220,
 'applauding': 237,
 'cheer': 8,
 'agreement': 2,
 'laughs': 2194,
 'whistling': 13,
 'NA': 189,
 'exclaims': 15,
 'disagreement': 1,
 'member': 34,
 '[': 863,
 'crowd': 848,
 'members': 14,
 'applauds': 132,
 'continues': 36,
 'voice': 9,
 'whoops': 19,
 'clapping': 67,
 'woman': 2,
 'screams': 6,
 'cackling': 2,
 'murmur': 1,
 'laughter': 320,
 'voices': 1,
 'laugh': 17,
 'quietens': 2,
 'whooping': 27,
 'exclaiming': 12,
 'chuckling': 23,
 'chuckles': 63,
 'whopping': 1,
 'muttering': 1,
 'snickering': 1,
 'groaning': 19,
 'titters': 3,
 'coos': 1,
 'boos': 4,
 'shouts': 6,
 'scattered': 2,
 'applause': 8,
 'shouting': 13,
 'gasps': 14,
 'claps': 33,
 'cries': 1,
 'surprise': 1,
 'sympathizes': 1,
 'jeering': 2,
 'groans': 26,
 'catcalls': 2,
 'wolf': 1,
 'whistles': 6,
 'beat': 2,
 'singing': 3,
 'applaud': 4,
 'moaning': 1,
 'screaming': 5,
 'hooting': 4,
 'hoots': 4,
 'imitating': 

In [21]:
#determine list of more significant audience reactions from action_dict
aud_reactions = ['cheer', 'laugh', 'applaud', 'NA']

In [22]:
def label_action(text):
    label = ''
    for a in aud_reactions:
        if a in text:
            if label == '':
                label += a
            else:
                label = label + ' ' + a
    
    if label == '':
        label = 'to remove row'
    return label

#label each row of sent_df3 according to the 'Action' column
sent_df3['Audience Reaction'] = sent_df3['Action'].apply(label_action)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_df3['Audience Reaction'] = sent_df3['Action'].apply(label_action)


In [23]:
#remove rows with the 'to remove row' reaction
sent_df4 = sent_df3[sent_df3['Audience Reaction'] != 'to remove row']
sent_df4

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript,Audience Reaction
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],anything bitch paint house need death penalty ...,cheer laugh
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock,cheer
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore,cheer
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much netfli...,cheer
7,Chris Rock,Selective Outrage (2023) | Transcript,,"I’mma try… N*gga, sit down! I’mma try to do a...",[audience laughing],mma try… n gga sit mma try show tonight withou...,laugh
...,...,...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun sell gun wrap talk anymore now… agree bear...,cheer
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,hell lot sense musket government drone bring g...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,jump love jump feel touch jump taste kiss nigh...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material give u chance bond am...,


## Creating target 'Funniness' variable
Funniness categorical variable has 4 categories:
- 0: neutral
- 1: a little funny
- 2: moderately funny
- 3: very funny

To generate a value of funniness according to the audience reaction, first calculate the 'funny score' which should range from 0 to 1 inclusive.

**Funny score = 0.55 * laugh + 0.25 * applaud + 0.2 * cheer**  
where laugh = 1 when laugh is present in reaction, 0 otherwise  
      applaud = 1 when applaud is present in reaction, 0 otherwise  
      cheer = 1 when cheer is present in reaction, 0 otherwise

Then, according to the funny score and the range that it falls in below, determine the value of funniness:
- Score of 0: 0 (neutral)
- Score in range (0, 0.55): 1 (a little funny)
- Score in range \[0.55, 0.80): 2 (moderately funny)
- Score in range \[0.80, 1\]: 3 (very funny)

In [24]:
#function to calculate funny score and output funniness for each row
def get_funniness(reaction):
    laugh = 0
    applaud = 0
    cheer = 0
    
    if 'laugh' in reaction:
        laugh = 1
    if 'applaud'in reaction:
        applaud = 1
    if 'cheer' in reaction:
        cheer = 1
    
    score = 0.55 * laugh + 0.25 * applaud + 0.2 * cheer
    
    if score == 0:
        funniness = 0
    elif score > 0 and score < 0.55:
        funniness = 1
    elif score >= 0.55 and score < 0.80:
        funniness = 2
    elif score >= 0.80 and score <= 1:
        funniness = 3
    
    return funniness


In [25]:
#add 'Funniness' column to sent_df4
sent_df4['Funniness'] = sent_df4['Audience Reaction'].apply(get_funniness)

#reset index of sent_df4 to range from 0 to 6491
sent_df4 = sent_df4.reset_index()
sent_df4 = sent_df4.drop(columns='index')
sent_df4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_df4['Funniness'] = sent_df4['Audience Reaction'].apply(get_funniness)


Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript,Audience Reaction,Funniness
0,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],anything bitch paint house need death penalty ...,cheer laugh,2
1,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock,cheer,1
2,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore,cheer,1
3,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much netfli...,cheer,1
4,Chris Rock,Selective Outrage (2023) | Transcript,,"I’mma try… N*gga, sit down! I’mma try to do a...",[audience laughing],mma try… n gga sit mma try show tonight withou...,laugh,2
...,...,...,...,...,...,...,...,...
6337,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun sell gun wrap talk anymore now… agree bear...,cheer,1
6338,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,hell lot sense musket government drone bring g...,,0
6339,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,jump love jump feel touch jump taste kiss nigh...,,0
6340,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material give u chance bond am...,,0


In [26]:
import pickle
pickle.dump(sent_df4, open("chloe_valid_transcripts_sent_df", "wb"))

## Split train and test sets

In [27]:
from sklearn.model_selection import train_test_split

X = sent_df4[['Processed Transcript']]
y = sent_df4.loc[:, 'Funniness']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

## TF-IDF vectorization of transcript parts

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X_train['Processed Transcript'])
X_test_matrix = vectorizer.transform(X_test['Processed Transcript'])

#create dataframe from tfidf_matrix, which is a sparse matrix
X_train_trans = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names_out())
X_test_trans = pd.DataFrame(X_test_matrix.toarray(), columns = vectorizer.get_feature_names_out())
X_train_trans

Unnamed: 0,aa,aaa,aaaa,aaaaaa,aaaaaaaaaaall,aaaaahh,aaaaave,aaaah,aaaand,aah,...,íbamos,índole,íntimo,órale,última,últimas,último,única,único,únicos
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
pickle.dump(vectorizer, open('full_vectorizer', 'wb'))

## Feature selection

In [30]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

#Select top 10000 features based on mutual info regression
selector = SelectKBest(mutual_info_regression, k = 10000)
selector.fit(X_train_trans, y_train)
new_features = X_train_trans.columns[selector.get_support()]

#filter X_train_trans for top 10000 features from new_features
X_train_trans1 = X_train_trans[list(new_features)]
X_test_trans1 = X_test_trans[list(new_features)]
X_train_trans1

Unnamed: 0,aa,aaa,aaaaaaaaaaall,aaaah,aardvark,aargh,aaron,ab,abajo,abbattere,...,íbamos,índole,íntimo,órale,última,últimas,último,única,único,únicos
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4751,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
pickle.dump(list(new_features), open('full_features', 'wb'))

## Checking value counts of train set and resampling

In [32]:
y_train.value_counts()

2    3920
1     609
0     141
3      86
Name: Funniness, dtype: int64

Train dataset is highly inbalanced across the 4 categories, hence resampling is required. In our case, we will do oversampling.

In [33]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train_trans1, y_train)
y_train_sm.value_counts()

1    3920
2    3920
3    3920
0    3920
Name: Funniness, dtype: int64

Train dataset is now balanced across the 4 categories.

## Naive Bayes classification

In [34]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

nb_clf = MultinomialNB()
nb_params = {'alpha':[1, 2, 3]}
nb_grid = GridSearchCV(nb_clf, param_grid = nb_params).fit(X_train_sm, y_train_sm)
#update nb_clf with the best estimator from nb_grid
nb_clf = nb_grid.best_estimator_
print('Train score: ', nb_clf.score(X_train_sm, y_train_sm))
print('Test score: ', nb_clf.score(X_test_trans1, y_test))
print('Cross validated score: ', nb_grid.best_score_)

Train score:  0.8565688775510204
Test score:  0.519546027742749
Cross validated score:  0.788201530612245


In [38]:
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import roc_curve, roc_auc_score

class_names = ['Neutral', 'A little funny', 'Moderately funny', 'Very funny']

nb_y_scores = nb_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
nb_y_onehot = pd.get_dummies(y_test, columns = nb_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
nb_roc_fig = go.Figure()
nb_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(nb_y_scores.shape[1]):
    y_true = nb_y_onehot.iloc[:, i]
    y_score = nb_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    nb_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

nb_roc_fig.update_layout(
    title = 'ROC Curve (Multinomial Naive Bayes)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
nb_roc_fig.show()

In [39]:
import pickle
pickle.dump(nb_clf, open("trained_nb_model", "wb"))

In [40]:
pickle.dump(nb_roc_fig, open("nb_roc_fig", "wb"))

## Logistic regression

In [41]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state = 0)
logreg_params = {'C':[0.001, 0.01, 0.1], 'max_iter':[500, 1000]}
logreg_grid = GridSearchCV(logreg, param_grid = logreg_params).fit(X_train_sm, y_train_sm)
#update logreg with the best estimator from logreg_grid
logreg = logreg_grid.best_estimator_
print('Train score: ', logreg.score(X_train_sm, y_train_sm))
print('Test score: ', logreg.score(X_test_trans1, y_test))
print('Cross validated score: ', logreg_grid.best_score_)

Train score:  0.7873724489795918
Test score:  0.5933165195460277
Cross validated score:  0.749170918367347


In [43]:
logreg_y_scores = logreg.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
logreg_y_onehot = pd.get_dummies(y_test, columns = logreg.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
logreg_roc_fig = go.Figure()
logreg_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(logreg_y_scores.shape[1]):
    y_true = logreg_y_onehot.iloc[:, i]
    y_score = logreg_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    logreg_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

logreg_roc_fig.update_layout(
    title = 'ROC Curve (Logistic Regression)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
logreg_roc_fig.show()

In [44]:
pickle.dump(logreg, open("trained_logreg_model", "wb"))

In [45]:
pickle.dump(logreg_roc_fig, open("logreg_roc_fig", "wb"))

## Random forest classification

In [46]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state = 0)
rf_params = {'max_depth':[3, 5], 'n_estimators':[75, 100]}
rf_grid = GridSearchCV(rf_clf, param_grid = rf_params).fit(X_train_sm, y_train_sm)
#update rf_clf with the best estimator from rf_grid
rf_clf = rf_grid.best_estimator_
print('Train score: ', rf_clf.score(X_train_sm, y_train_sm))
print('Test score: ', rf_clf.score(X_test_trans1, y_test))
print('Cross validated score: ', rf_grid.best_score_)

Train score:  0.6368622448979592
Test score:  0.7099621689785625
Cross validated score:  0.6535076530612245


In [53]:
rf_y_scores = rf_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
rf_y_onehot = pd.get_dummies(y_test, columns = rf_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
rf_roc_fig = go.Figure()
rf_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(rf_y_scores.shape[1]):
    y_true = rf_y_onehot.iloc[:, i]
    y_score = rf_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    rf_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

rf_roc_fig.update_layout(
    title = 'ROC Curve (Random Forest)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
rf_roc_fig.show()

In [54]:
pickle.dump(rf_clf, open("trained_rf_model", "wb"))

In [55]:
pickle.dump(rf_roc_fig, open("rf_roc_fig", "wb"))

## Adaboost classification

In [56]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

ada_clf = AdaBoostClassifier(n_estimators = 60, learning_rate = 0.1, random_state = 0)
ada_cross_val = np.mean(cross_val_score(ada_clf, X_train_sm, y_train_sm))
ada_clf.fit(X_train_sm, y_train_sm)
print('Train score: ', ada_clf.score(X_train_sm, y_train_sm))
print('Test score: ', ada_clf.score(X_test_trans1, y_test))
print('Cross validated score: ', ada_cross_val)

Train score:  0.620344387755102
Test score:  0.7408575031525851
Cross validated score:  0.6199617346938775


In [57]:
ada_y_scores = ada_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
ada_y_onehot = pd.get_dummies(y_test, columns = ada_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
ada_roc_fig = go.Figure()
ada_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(ada_y_scores.shape[1]):
    y_true = ada_y_onehot.iloc[:, i]
    y_score = ada_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    ada_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

ada_roc_fig.update_layout(
    title = 'ROC Curve (AdaBoost)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
ada_roc_fig.show()

In [58]:
pickle.dump(ada_clf, open("trained_ada_model", "wb"))

In [59]:
pickle.dump(ada_roc_fig, open("ada_roc_fig", "wb"))

## Gradient boosting classification

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

grad_clf = GradientBoostingClassifier(n_estimators = 40, learning_rate = 0.1, random_state = 0, warm_start = True)
grad_cross_val = np.mean(cross_val_score(grad_clf, X_train_sm, y_train_sm))
grad_clf.fit(X_train_sm, y_train_sm)
print('Train score: ', grad_clf.score(X_train_sm, y_train_sm))
print('Test score: ', grad_clf.score(X_test_trans1, y_test))
print('Cross validated score: ', grad_cross_val)

Train score:  0.7474489795918368
Test score:  0.7736443883984867
Cross validated score:  0.7282525510204081


In [67]:
grad_y_scores = grad_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
grad_y_onehot = pd.get_dummies(y_test, columns = grad_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
grad_roc_fig = go.Figure()
grad_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(grad_y_scores.shape[1]):
    y_true = grad_y_onehot.iloc[:, i]
    y_score = grad_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    grad_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

grad_roc_fig.update_layout(
    title = 'ROC Curve (Gradient Boosting)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
grad_roc_fig.show()

In [68]:
pickle.dump(grad_clf, open("trained_grad_model", "wb"))

In [69]:
pickle.dump(grad_roc_fig, open("grad_roc_fig", "wb"))

## XGBoost classification

In [79]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_clf = xgb.XGBClassifier(objective = "multi:softprob", n_estimators = 50, learning_rate = 0.1, early_stopping_rounds = 10, eval_metric = ['auc', 'merror'], random_state = 0)

xgb_clf.fit(X_train_sm, y_train_sm, eval_set = [(X_test_trans1, y_test)])

xgb_train_acc = accuracy_score(y_train_sm, xgb_clf.predict(X_train_sm))
xgb_test_acc = accuracy_score(y_test, xgb_clf.predict(X_test_trans1))

print('Train score: ', xgb_train_acc)
print('Test score: ', xgb_test_acc)

[0]	validation_0-auc:0.57420	validation_0-merror:0.20996
[1]	validation_0-auc:0.57414	validation_0-merror:0.21122
[2]	validation_0-auc:0.58262	validation_0-merror:0.21059
[3]	validation_0-auc:0.57815	validation_0-merror:0.21438
[4]	validation_0-auc:0.58679	validation_0-merror:0.21185
[5]	validation_0-auc:0.58250	validation_0-merror:0.21185
[6]	validation_0-auc:0.58100	validation_0-merror:0.21438
[7]	validation_0-auc:0.59032	validation_0-merror:0.21185
[8]	validation_0-auc:0.58908	validation_0-merror:0.20933
[9]	validation_0-auc:0.59189	validation_0-merror:0.21501
[10]	validation_0-auc:0.60434	validation_0-merror:0.21690
[11]	validation_0-auc:0.60805	validation_0-merror:0.21564
[12]	validation_0-auc:0.60877	validation_0-merror:0.21879
[13]	validation_0-auc:0.61681	validation_0-merror:0.22005
[14]	validation_0-auc:0.61699	validation_0-merror:0.21753
[15]	validation_0-auc:0.61253	validation_0-merror:0.21753
[16]	validation_0-auc:0.61321	validation_0-merror:0.21753
[17]	validation_0-auc:0.

In [80]:
roc_auc_score(y_test, xgb_clf.predict_proba(X_test_trans1), multi_class='ovr')

0.5897382181450973

In [81]:
xgb_y_scores = xgb_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
xgb_y_onehot = pd.get_dummies(y_test, columns = xgb_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
xgb_roc_fig = go.Figure()
xgb_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(xgb_y_scores.shape[1]):
    y_true = xgb_y_onehot.iloc[:, i]
    y_score = xgb_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    xgb_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

xgb_roc_fig.update_layout(
    title = 'ROC Curve (XGBoost)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
xgb_roc_fig.show()

In [82]:
pickle.dump(xgb_clf, open("trained_xgb_model", "wb"))

In [83]:
pickle.dump(xgb_roc_fig, open("xgb_roc_fig", "wb"))

## Voting classification

In [84]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(estimators=[('rf_clf', rf_clf), ('ada_clf', ada_clf), ('grad_clf', grad_clf)],
                        voting='soft', weights=[1,1,2])

vote_clf.fit(X_train_sm, y_train_sm)

print('Train score: ', vote_clf.score(X_train_sm, y_train_sm))
print('Test score: ', vote_clf.score(X_test_trans1, y_test))

Train score:  0.7507015306122449
Test score:  0.7755359394703657


In [86]:
roc_auc_score(y_test, vote_clf.predict_proba(X_test_trans1), multi_class='ovr')

0.610007535935785

In [85]:
vote_y_scores = vote_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
vote_y_onehot = pd.get_dummies(y_test, columns = vote_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
vote_roc_fig = go.Figure()
vote_roc_fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(vote_y_scores.shape[1]):
    y_true = vote_y_onehot.iloc[:, i]
    y_score = vote_y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{class_names[i]} (AUC={auc_score:.2f})"
    vote_roc_fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

vote_roc_fig.update_layout(
    title = 'ROC Curve (Voting [RF + ADA + GB])',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
vote_roc_fig.show()

In [87]:
pickle.dump(vote_clf, open("trained_voting_model", "wb"))

In [88]:
pickle.dump(vote_roc_fig, open("vote_roc_fig", "wb"))

## Model evaluation

In [89]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

train_acc = [nb_clf.score(X_train_sm, y_train_sm), 
             logreg.score(X_train_sm, y_train_sm), 
             rf_clf.score(X_train_sm, y_train_sm), 
             ada_clf.score(X_train_sm, y_train_sm), 
             grad_clf.score(X_train_sm, y_train_sm), 
             xgb_train_acc, 
            vote_clf.score(X_train_sm, y_train_sm)]

test_acc = [nb_clf.score(X_test_trans1, y_test), 
            logreg.score(X_test_trans1, y_test), 
            rf_clf.score(X_test_trans1, y_test), 
            ada_clf.score(X_test_trans1, y_test), 
            grad_clf.score(X_test_trans1, y_test), 
            xgb_test_acc, 
           vote_clf.score(X_test_trans1, y_test)]

precision = [precision_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, vote_clf.predict(X_test_trans1), average = 'micro')]

recall = [recall_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, vote_clf.predict(X_test_trans1), average = 'micro')]

f1 = [f1_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, vote_clf.predict(X_test_trans1), average = 'micro')]

auc = [roc_auc_score(y_test, nb_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, logreg.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, rf_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, ada_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, grad_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, xgb_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, vote_clf.predict_proba(X_test_trans1), multi_class='ovr')]

#create model performance dataframe
mdl_perf_df = pd.DataFrame({'Train Accuracy': train_acc, 
                           'Test Accuracy': test_acc, 
                           'Precision': precision, 
                           'Recall': recall, 
                           'F1-score': f1, 
                           'AUC': auc}, index = ['Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest', 'AdaBoost', 'Gradient Boost', 'XGBoost', 'Voting (RF+ADA+GB)'])

mdl_perf_df

Unnamed: 0,Train Accuracy,Test Accuracy,Precision,Recall,F1-score,AUC
Multinomial Naive Bayes,0.856569,0.519546,0.519546,0.519546,0.519546,0.59619
Logistic Regression,0.787372,0.593317,0.593317,0.593317,0.593317,0.605967
Random Forest,0.636862,0.709962,0.709962,0.709962,0.709962,0.586283
AdaBoost,0.620344,0.740858,0.740858,0.740858,0.740858,0.589639
Gradient Boost,0.747449,0.773644,0.773644,0.773644,0.773644,0.612695
XGBoost,0.718622,0.790668,0.790668,0.790668,0.790668,0.589738
Voting (RF+ADA+GB),0.750702,0.775536,0.775536,0.775536,0.775536,0.610008


In [467]:
pickle.dump(mdl_perf_df, open("model_performance_df", "wb"))