In [None]:
import numpy as np
import pandas as pd
import nltk
from textblob import TextBlob
import re
nltk.download('averaged_perceptron_tagger')

In [356]:
df = pd.read_csv('transcripts.csv')
df

Unnamed: 0.1,Unnamed: 0,Comedian,Date,Title,Subtitle,Transcript
0,0,Chris Rock,"March 8, 2023",Selective Outrage (2023) | Transcript,,[slow instrumental music playing] [funk drums ...
1,1,Marc Maron,"March 3, 2023",Thinky Pain (2013) | Transcript,Marc Maron returns to his old stomping grounds...,[siren wailing] I don’t know what you were thi...
2,2,Chelsea Handler,"March 3, 2023",Evolution (2020) | Transcript,Chelsea Handler is back and better than ever -...,Join me in welcoming the author of six number ...
3,3,Tom Papa,"March 3, 2023",What A Day! (2022) | Transcript,"Follows Papa as he shares about parenting, his...","Premiered on December 13, 2022 Ladies and gent..."
4,4,Jim Jefferies,"February 22, 2023",High n’ Dry (2023) | Transcript,Jim Jefferies is back and no topic is off limi...,"Please welcome to the stage, Jim Jefferies! He..."
...,...,...,...,...,...,...
410,410,George Carlin,"April 6, 2017",Jamming in New York (1992) – Transcript,,Jammin’ in New York is George Carlin’s 14th al...
411,411,,"April 1, 2017",Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Australian comedian Jim Jefferies breaks down ...
412,412,Reggie Watts,"March 30, 2017",Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,"Hello, I’m Thomas. I’m so glad to meet you Mum..."
413,413,GEORGE CARLIN,"March 29, 2017",COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",Complaints and Grievances is a HBO stand-up sp...


## Extract sentences and actions from each transcript

In [357]:
sent_rows = []

#function to extract each sentence of each transcript
def extract_sent(row):
    text = row['Transcript']
    title = row['Title']
    subtitle = row['Subtitle']
    comedian = row['Comedian']
    
    sentences = re.split('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', text)
    actions = re.findall('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', text)
    
    #only keep transcripts with actions
    if len(sentences) > 1 and len(actions) > 0:
        #account for consecutive actions
        sentences1 = []
        actions1 = []
        for a in range(0, len(actions)):
            pre_text = sentences[a]
            if pre_text == '' or pre_text == ' ':
                #concatenate current action with latest action in actions1 unless index 0
                if a == 0:
                    actions1.append(actions[a])
                    sentences1.append(pre_text)
                else:
                    concat_act = actions1.pop() + ' ' + actions[a]
                    actions1.append(concat_act)
            else:
                sentences1.append(pre_text)
                actions1.append(actions[a])
        
        #account for non-empty last element in sentences that doesn't have an action after
        if len(re.findall('\w+', sentences[-1])) > 0:
            sentences1.append(sentences[-1])
            actions1.append('NA')
                
        #add to list of dataframe rows
        for i in range(0, len(sentences1)):
            sent_row = [comedian, title, subtitle, sentences1[i], actions1[i]]
            sent_rows.append(sent_row)

df.apply(extract_sent, axis = 1)

sent_df = pd.DataFrame(sent_rows, columns=['Comedian', 'Title', 'Subtitle', 'Transcript Part', 'Action'])
sent_df

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
0,Chris Rock,Selective Outrage (2023) | Transcript,,,[slow instrumental music playing] [funk drums ...
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
...,...,...,...,...,...
25676,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",Complaints and Grievances is a HBO stand-up sp...,(Things That Come Off of Your Body)
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,
25678,George Carlin,It’s Bad For Ya (2008) Full transcript,"Full transcript of It's Bad for Ya, final HBO ...","Full transcript of It’s Bad for Ya, final HBO ...",(sound)
25679,George Carlin,It’s Bad For Ya (2008) Full transcript,"Full transcript of It's Bad for Ya, final HBO ...",. What I am is an old fuck. It’s kind of like ...,(Fart sound)


## Filter actions for audience reactions only

In [358]:
unique_actions = list(sent_df['Action'].unique())
unique_actions

['[slow instrumental music playing] [funk drums playing] [indistinct chatter] [man]',
 '[hip-hop music playing] [audience cheering] [Chris Rock]',
 '[audience laughing] [audience cheering] [hip-hop music playing] [female announcer]',
 '[audience cheering] [audience continue cheering] [Chris Rock]',
 '[audience cheers loudly]',
 '[audience cheering]',
 '[man shouting unintelligibly]',
 '[audience laughing]',
 '[audience applauding]',
 '[audience laughing and applauding] [imperceptible]',
 '[chuckling]',
 '[man whooping]',
 '[chuckles]',
 '[laughs]',
 '[sputters]',
 '[audience cheer in agreement]',
 '[high-pitch voice]',
 '[normal voice]',
 '[audience]',
 '[audience laughs]',
 '[audience cheering and whistling]',
 '[audience laughing and applauding]',
 '[groans]',
 '[voice breaking]',
 '[man whoops]',
 '[guffaws]',
 '[scattered laughs]',
 '[exclaims]',
 '[yelps in pain]',
 '[audience laughing and applauding] [man]',
 '[crying]',
 '[imitating a woman]',
 '[chuckling] [man whoops]',
 '[scr

In [359]:
len(unique_actions)

5764

In [360]:
aud_unique_actions = []

#only keep actions that contain 'audience' or 'crowd', and the 'NA' action (which is neutral)
for act in unique_actions:
    lower_act = act.lower()
    if 'audience' in lower_act or 'crowd' in lower_act or act == 'NA':
        aud_unique_actions.append(act)

aud_unique_actions

['[hip-hop music playing] [audience cheering] [Chris Rock]',
 '[audience laughing] [audience cheering] [hip-hop music playing] [female announcer]',
 '[audience cheering] [audience continue cheering] [Chris Rock]',
 '[audience cheers loudly]',
 '[audience cheering]',
 '[audience laughing]',
 '[audience applauding]',
 '[audience laughing and applauding] [imperceptible]',
 '[audience cheer in agreement]',
 '[audience]',
 '[audience laughs]',
 '[audience cheering and whistling]',
 '[audience laughing and applauding]',
 '[audience laughing and applauding] [man]',
 '[slams mic] [audience cheering and applauding] [hip-hop music playing] [audience continue cheering]',
 'NA',
 '[woman groans in audience]',
 '[audience exclaims in agreement] [audience exclaims in disagreement] [laughter]',
 '[audience member]',
 '[crowd cheers]',
 '[audience members]',
 '[audience applauds]',
 '[upbeat music playing] [crowd cheering]',
 '[upbeat music plays] [audience cheering] [announcer]',
 '[indistinct chatte

In [361]:
len(aud_unique_actions)

1051

In [362]:
#filter sent_df for rows whose action is in aud_unique_actions
sent_df1 = sent_df[sent_df['Action'].isin(aud_unique_actions)]
sent_df1

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Filter for non-empty transcript parts only

In [363]:
#get rows with empty transcript parts to check that actions are not relevant
empty_trans = sent_df1[(sent_df1['Transcript Part'] == '')|(sent_df1['Transcript Part'] == ' ')]
list(empty_trans.loc[:, 'Action'])

['[upbeat music playing] [crowd cheering]',
 '[upbeat music plays] [audience cheering] [announcer]',
 '[gentle music playing] [audience applauding] [audience cheering]',
 '[audience cheering and applauding] [announcer]',
 '[audience murmuring] [murmuring continues] [audience clapping in unison] [sudden silence] [vinyl pops]',
 '[hip-hop music playing] [audience cheering and applauding] [music stops]',
 '(Chattering) (music playing) (audience cheering)',
 '[audience chattering indistinctly] [man]',
 '[heartbeat] [indistinct chatter] [atmospheric whooshing] [audience cheering]',
 '[crowd chanting]',
 '(crowd murmurs)',
 '[indistinct chattering] [faint laughter] [audience cheering and applauding]',
 '(AUDIENCE CHEERING)',
 '[audience cheering] [announcer]',
 '[Audience cheering] (DL Hughley enters from stage left with his head cocked and grabs the microphone)',
 '[upbeat music playing] [exhaling] [indistinct conversations] [crowd cheering] [inaudible] [audience cheering uproariously]',
 '

In [364]:
len(empty_trans)

26

In [365]:
#list of indexes where actions with empty transcripts are NOT found at the beginning of the transcripts
non_beginning_actions = []

#function to check that actions with empty transcripts are found at the beginning of the transcripts
def find_non_beginning_actions(row):
    #find full transcript from df
    full_trans = list(df[df['Title'] == row['Title']]['Transcript'])[0]
    
    start_index = full_trans.index(row['Action'])
    if start_index > 0:
        non_beginning_actions.append(row.index)

empty_trans.apply(find_non_beginning_actions, axis = 1)
len(non_beginning_actions)

0

Since length of the 'non_beginning_actions' list is 0, it means that all the actions with empty transcript parts are indeed found at the beginning of their respective transcripts. Hence, there are no words spoken by the comedian before these actions and we may remove these rows from the sentence dataframe as they do not provide any insights into what are the words spoken by a comedian that lead to an audience reaction after.

In [366]:
#remove rows in empty_trans from the sentence dataframe
sent_df2 = sent_df1.drop(list(empty_trans.index))
sent_df2

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[hip-hop music playing] [audience cheering] [C...
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering] [hip-h...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheerin...
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Further filtering for audience reactions within each action row

In [367]:
#list of valid audience action parts that are hard to capture using pos tags later
valid_aud_parts = ['[scattered applause in audience]', '[applause from the audience]', '[scattered cheers from crowd]', '(Audience and Hughlely laugh)', '(Hughley pauses and laughs with audience)', '(Audience and Hughley laugh)', '(Audience lightly laughs)']

#list of action parts containing 'audience' or 'crowd' but are not valid audience reactions
nonaud_act_parts = []

#function to remove the non-audience parts of each row's action, eg: [hip-hop music playing], and only keep audience parts
def get_aud_reaction(act):
    if act != 'NA':
        #get all the bracketed parts into a list
        act_parts = re.findall('\[[a-zA-Z\s\-]+\]|\([a-zA-Z\s\-]+\)', act)
        aud_act_parts = []
        
        #only keep audience parts
        for p in act_parts:
            #check whether p is found in valid_aud_parts
            if p in valid_aud_parts:
                aud_act_parts.append(p)
                continue
            p_lower = p.lower()
            if 'audience' in p_lower or 'crowd' in p_lower:
                p_blob = TextBlob(p_lower[1:-1])
                p_tags = p_blob.tags
                for i in range(0, len(p_tags)):
                    token, tag = p_tags[i]
                    if token == 'audience' or token == 'crowd':
                        if i < len(p_tags) - 1:
                            next_tag = p_tags[i + 1][1]
                            if 'VB' in next_tag or 'NN' in next_tag:
                                aud_act_parts.append(p)
                                break
                            else:
                                nonaud_act_parts.append(p)
                        else:
                            nonaud_act_parts.append(p)
        
        new_act = ' '.join(aud_act_parts)
        if new_act == '':
            new_act = 'to remove row'
        return new_act
    return 'NA'

new_actions = sent_df2['Action'].apply(get_aud_reaction)

for x in new_actions.unique():
    print(x)

[audience cheering]
[audience laughing] [audience cheering]
[audience cheering] [audience continue cheering]
[audience cheers loudly]
[audience laughing]
[audience applauding]
[audience laughing and applauding]
[audience cheer in agreement]
to remove row
[audience laughs]
[audience cheering and whistling]
[audience cheering and applauding] [audience continue cheering]
NA
[audience exclaims in agreement] [audience exclaims in disagreement]
[audience member]
[crowd cheers]
[audience members]
[audience applauds]
[crowd cheering]
[crowd laughing]
[crowd continues laughing]
[voice in crowd whoops] [crowd clapping]
[woman in crowd screams]
[crowd laughing] [crowd clapping]
[crowd laughing] [crowd continues laughing]
[crowd clapping]
[crowd laughing] [a voice in crowd cackling]
[crowd laughing] [female voice in crowd laughing]
[crowd murmur in laughter]
[crowd laughing] [voice in crowd laughing]
[crowd laugh]
[a few voices in crowd laugh]
[crowd laughing] [female voice in crowd cackling] [cro

In [368]:
for x in list(set(nonaud_act_parts)):
    print(x)

[shout from audience]
[man in audience]
[to the single people in the audience]
[turns towards audience with dramatically serious expression and tone of voice]
[squints and looks back and forth from towards the audience and to the side with a puzzled expression]
(Audience)
[looks at front row audience]
(Hughley freezes with his arms extended to the audience with a slight grin on his face)
[turns on his heels toward audience and holds up a finger]
[woman in audience]
[mumbles to the crowd while gesturing to the camera]
[indistinct chattering in crowd]
[whooping in crowd ]
(Mimicking audience)
(audience)
[stands up straight and looks into the audience with a confident expression and speaks with purpose]
[she looks into the audience]
[crowd]
[turns towards audience and lowers arm]
(Mimics his audience)
[man from audience]
[AUDIENCE]
[Audience]
[woman in crowd]
[voices in crowd]
[to the Crowd]
[hoots from audience]
[speaking to someone in the audience]
[John freezes his pose for a moment wh

In [369]:
#new dataframe with new 'Action' column
sent_df3 = sent_df2.copy()
sent_df3['Action'] = new_actions

#filter sent_df3 for rows where 'Action' != 'to remove row'
sent_df3 = sent_df3[sent_df3['Action'] != 'to remove row']
sent_df3

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering]
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering]
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering]
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly]
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering]
...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering]
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,


## Text preprocessing for each transcript part

In [370]:
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
add_punctuation = '“”’'

#function to preprocess transcript parts
def preprocess(text):
    #remove punctuation
    punctuation_free = "".join([i for i in text if i not in string.punctuation])
    
    #remove digits
    digits_removed = ''.join([i for i in punctuation_free if not i.isdigit()])
    
    #make words to lowercase
    text_lower = digits_removed.lower()
    
    #tokenize to words
    tokenized = word_tokenize(text_lower)
    
    #remove stopwords
    stopwords_removed = [i for i in tokenized if i not in stopwords]
    
    #lemmatize
    lemm_text = [lemmatizer.lemmatize(word) for word in stopwords_removed]
    
    #join list of words back into string
    processed_text = ' '.join(lemm_text)
    
    #remove punctuations not found in string.punctuation
    final_text = "".join([i for i in processed_text if i not in add_punctuation])
    
    return final_text


In [371]:
#preprocess transcript part of each row to form new 'Processed Transcript' column
sent_df3['Processed Transcript'] = sent_df3['Transcript Part'].apply(preprocess)
sent_df3

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering],let go
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],said anything want said bitch paint house ...
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much coming...
...,...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun want sell gun gon na wrap talk anymore...
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,yeah made hell lot sense musket know governmen...
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,fuck jump love jump feel touch jump wan na ta...
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material going give u chance...


## Processing and labelling actions

In [372]:
action_dict = {}

#get dictionary of audience reactions
def fill_dict(a):
    blob = TextBlob(a)
    blob_tags = blob.tags
    for token, tag in blob_tags:
        if tag == 'NNS' or tag == 'NN' or 'VB' in tag:
            if token in action_dict:
                action_dict[token] += 1
            else:
                action_dict[token] = 1
                
sent_df3['Action'].apply(fill_dict)
action_dict

{'audience': 5726,
 'cheering': 569,
 ']': 6015,
 'laughing': 2916,
 'continue': 3,
 'cheers': 226,
 'applauding': 248,
 'cheer': 8,
 'agreement': 2,
 'laughs': 2248,
 'whistling': 13,
 'NA': 189,
 'exclaims': 15,
 'disagreement': 1,
 'member': 34,
 '[': 887,
 'crowd': 872,
 'members': 15,
 'applauds': 136,
 'continues': 36,
 'voice': 9,
 'whoops': 20,
 'clapping': 69,
 'woman': 2,
 'screams': 6,
 'cackling': 3,
 'murmur': 1,
 'laughter': 326,
 'voices': 1,
 'laugh': 17,
 'quietens': 2,
 'whooping': 29,
 'exclaiming': 13,
 'chuckling': 24,
 'chuckles': 63,
 'whopping': 1,
 'muttering': 1,
 'snickering': 1,
 'groaning': 19,
 'titters': 3,
 'coos': 1,
 'boos': 4,
 'shouts': 6,
 'scattered': 2,
 'applause': 9,
 'shouting': 17,
 'gasps': 14,
 'claps': 35,
 'cries': 1,
 'surprise': 1,
 'sympathizes': 1,
 'jeering': 2,
 'groans': 26,
 'catcalls': 2,
 'wolf': 1,
 'whistles': 6,
 'beat': 2,
 'singing': 3,
 'applaud': 4,
 'moaning': 1,
 'screaming': 5,
 'hooting': 4,
 'hoots': 4,
 'imitating': 

In [373]:
#determine list of more significant audience reactions from action_dict
aud_reactions = ['cheer', 'laugh', 'applaud', 'NA']

In [374]:
def label_action(text):
    label = ''
    for a in aud_reactions:
        if a in text:
            if label == '':
                label += a
            else:
                label = label + ' ' + a
    
    if label == '':
        label = 'to remove row'
    return label

reactions = sent_df3['Action'].apply(label_action)

In [375]:
#label each row of sent_df3 according to the 'Action' column
sent_df3['Audience Reaction'] = reactions

sent_df4 = sent_df3.copy()
sent_df4 = sent_df4[sent_df4['Audience Reaction'] != 'to remove row']
sent_df4

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript,Audience Reaction
1,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering],let go,cheer
2,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],said anything want said bitch paint house ...,cheer laugh
3,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock,cheer
4,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore,cheer
5,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much coming...,cheer
...,...,...,...,...,...,...,...
25672,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun want sell gun gon na wrap talk anymore...,cheer
25673,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,yeah made hell lot sense musket know governmen...,
25675,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,fuck jump love jump feel touch jump wan na ta...,
25677,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material going give u chance...,


## Creating target 'Funniness' variable
Funniness categorical variable has 4 categories:
- 0: neutral
- 1: a little funny
- 2: moderately funny
- 3: very funny

To generate a value of funniness according to the audience reaction, first calculate the 'funny score' which should range from 0 to 1 inclusive.

**Funny score = 0.5 * laugh + 0.25 * applaud + 0.2 * cheer**  
where laugh = 1 when laugh is present in reaction, 0 otherwise  
      applaud = 1 when applaud is present in reaction, 0 otherwise  
      cheer = 1 when cheer is present in reaction, 0 otherwise

Then, according to the funny score and the range that it falls in below, determine the value of funniness:
- Score of 0: 0 (neutral)
- Score in range (0, 0.5): 1 (a little funny)
- Score in range \[0.5, 0.75): 2 (moderately funny)
- Score in range \[0.75, 1\]: 3 (very funny)

In [376]:
#function to calculate funny score and output funniness for each row
def get_funniness(reaction):
    laugh = 0
    applaud = 0
    cheer = 0
    
    if 'laugh' in reaction:
        laugh = 1
    if 'applaud'in reaction:
        applaud = 1
    if 'cheer' in reaction:
        cheer = 1
    
    score = 0.5 * laugh + 0.25 * applaud + 0.2 * cheer
    
    if score == 0:
        funniness = 0
    elif score > 0 and score < 0.5:
        funniness = 1
    elif score >= 0.5 and score < 0.75:
        funniness = 2
    elif score >= 0.75 and score <= 1:
        funniness = 3
    
    return funniness


In [377]:
#add 'Funniness' column to sent_df4
sent_df4['Funniness'] = sent_df4['Audience Reaction'].apply(get_funniness)

#reset index of sent_df4 to range from 0 to 6491
sent_df4 = sent_df4.reset_index()
sent_df4 = sent_df4.drop(columns='index')
sent_df4

Unnamed: 0,Comedian,Title,Subtitle,Transcript Part,Action,Processed Transcript,Audience Reaction,Funniness
0,Chris Rock,Selective Outrage (2023) | Transcript,,Let’s go!,[audience cheering],let go,cheer,1
1,Chris Rock,Selective Outrage (2023) | Transcript,,"She said, “$300, I’ll do anything you want.” ...",[audience laughing] [audience cheering],said anything want said bitch paint house ...,cheer laugh,2
2,Chris Rock,Selective Outrage (2023) | Transcript,,Ladies and gentlemen. Ladies and gentlemen. C...,[audience cheering] [audience continue cheering],lady gentleman lady gentleman chris rock,cheer,1
3,Chris Rock,Selective Outrage (2023) | Transcript,,"What’s up, Baltimore?",[audience cheers loudly],baltimore,cheer,1
4,Chris Rock,Selective Outrage (2023) | Transcript,,"Yes! Yes, yes. Thank you! Thank you so much! ...",[audience cheering],yes yes yes thank thank much thank much coming...,cheer,1
...,...,...,...,...,...,...,...,...
6487,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,"“Guns! Who wants to sell me a gun?” Now, I’m ...",[Audience cheering],gun want sell gun gon na wrap talk anymore...,cheer,1
6488,,Jim Jefferies on Gun Control [Full Transcript],Australian comedian Jim Jefferies breaks down ...,Yeah! And that made a hell of a lot of sense ...,,yeah made hell lot sense musket know governmen...,,0
6489,Reggie Watts,Spatial (2016) – Full Transcript,Completely improvised show weaving together sk...,” Fuck. # ..Then jump for my love # Jump in # ...,,fuck jump love jump feel touch jump wan na ta...,,0
6490,GEORGE CARLIN,COMPLAINTS AND GRIEVANCES (2001) – FULL TRANSC...,"Full transcript of Complaints and Grievances, ...",” Now. Folks. This next piece of material’s go...,,folk next piece material going give u chance...,,0


## Split train and test sets

In [378]:
from sklearn.model_selection import train_test_split

X = sent_df4[['Processed Transcript']]
y = sent_df4.loc[:, 'Funniness']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

## TF-IDF vectorization of transcript parts

In [379]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(X_train['Processed Transcript'])
X_test_matrix = vectorizer.transform(X_test['Processed Transcript'])

#create dataframe from tfidf_matrix, which is a sparse matrix
X_train_trans = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names_out())
X_test_trans = pd.DataFrame(X_test_matrix.toarray(), columns = vectorizer.get_feature_names_out())
X_train_trans

Unnamed: 0,aa,aaa,aaaa,aaaaaa,aaaaaaaaaaall,aaaaaabout,aaaaave,aaaah,aaaggghhh,aaand,...,índole,íntimo,órale,última,últimas,último,única,único,únicos,ﬂoor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature selection

In [380]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

#Select top 10000 features based on mutual info regression
selector = SelectKBest(mutual_info_regression, k = 10000)
selector.fit(X_train_trans, y_train)
new_features = X_train_trans.columns[selector.get_support()]

#filter X_train_trans for top 10000 features from new_features
X_train_trans1 = X_train_trans[list(new_features)]
X_test_trans1 = X_test_trans[list(new_features)]
X_train_trans1

Unnamed: 0,aaaaaa,aaaaave,aaaah,aaaggghhh,aaand,aardvark,aargh,aaron,ab,abajo,...,índole,íntimo,órale,última,últimas,último,única,único,únicos,ﬂoor
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Checking value counts of train set and resampling

In [381]:
y_train.value_counts()

2    4004
1     627
0     142
3      96
Name: Funniness, dtype: int64

Train dataset is highly inbalanced across the 4 categories, hence resampling is required. In our case, we will do oversampling.

In [382]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_sm, y_train_sm = smote.fit_resample(X_train_trans1, y_train)
y_train_sm.value_counts()

2    4004
1    4004
0    4004
3    4004
Name: Funniness, dtype: int64

Train dataset is now balanced across the 4 categories.

## Naive Bayes classification

In [383]:
from sklearn.naive_bayes import MultinomialNB

nb_clf = MultinomialNB().fit(X_train_sm, y_train_sm)
print('Train score: ', nb_clf.score(X_train_sm, y_train_sm))
print('Test score: ', nb_clf.score(X_test_trans1, y_test))

Train score:  0.8041333666333667
Test score:  0.43253234750462105


In [425]:
import plotly.graph_objects as go
import plotly.express as px
from sklearn.metrics import roc_curve, roc_auc_score

y_scores = nb_clf.predict_proba(X_test_trans1)

# One hot encode the labels in order to plot them
y_onehot = pd.get_dummies(y_test, columns = nb_clf.classes_)

# Create an empty figure, and iteratively add new lines
# every time we compute a new class
fig = go.Figure()
fig.add_shape(
    type = 'line', line = dict(dash='dash'),
    x0 = 0, x1 = 1, y0 = 0, y1 = 1
)

for i in range(y_scores.shape[1]):
    y_true = y_onehot.iloc[:, i]
    y_score = y_scores[:, i]

    fpr, tpr, _ = roc_curve(y_true, y_score)
    auc_score = roc_auc_score(y_true, y_score)

    name = f"{y_onehot.columns[i]} (AUC={auc_score:.2f})"
    fig.add_trace(go.Scatter(x = fpr, y = tpr, name = name, mode = 'lines'))

fig.update_layout(
    title = 'ROC Curve (MultinomialNB)',
    xaxis_title = 'False Positive Rate',
    yaxis_title = 'True Positive Rate',
    yaxis = dict(scaleanchor = "x", scaleratio = 1),
    xaxis = dict(constrain = 'domain'),
    width = 700, height = 500
)
fig.show()

SyntaxError: invalid syntax (731653902.py, line 30)

In [412]:
import pickle
pickle.dump(nb_clf, open("trained_nb_model", "wb"))

## Logistic regression

In [384]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C = 0.01, max_iter = 1000, random_state = 0).fit(X_train_sm, y_train_sm)
print('Train score: ', logreg.score(X_train_sm, y_train_sm))
print('Test score: ', logreg.score(X_test_trans1, y_test))

Train score:  0.7001748251748252
Test score:  0.4208256315465188


In [413]:
pickle.dump(logreg, open("trained_logreg_model", "wb"))

## Random forest classification

In [387]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(max_depth = 3, random_state = 0).fit(X_train_sm, y_train_sm)
print('Train score: ', rf_clf.score(X_train_sm, y_train_sm))
print('Test score: ', rf_clf.score(X_test_trans1, y_test))

Train score:  0.5723026973026973
Test score:  0.7288971041281578


In [414]:
pickle.dump(rf_clf, open("trained_rf_model", "wb"))

## Adaboost classification

In [394]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(n_estimators = 100, learning_rate = 0.1, random_state = 0).fit(X_train_sm, y_train_sm)
print('Train score: ', ada_clf.score(X_train_sm, y_train_sm))
print('Test score: ', ada_clf.score(X_test_trans1, y_test))

Train score:  0.586038961038961
Test score:  0.6839186691312384


In [415]:
pickle.dump(ada_clf, open("trained_ada_model", "wb"))

## Gradient boosting classification

In [399]:
from sklearn.ensemble import GradientBoostingClassifier

grad_clf = GradientBoostingClassifier(n_estimators = 80, learning_rate = 0.01, random_state = 0).fit(X_train_sm, y_train_sm)
print('Train score: ', grad_clf.score(X_train_sm, y_train_sm))
print('Test score: ', grad_clf.score(X_test_trans1, y_test))

Train score:  0.6426073926073926
Test score:  0.6931608133086876


In [416]:
pickle.dump(grad_clf, open("trained_grad_model", "wb"))

## XGBoost classification

In [400]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_clf = xgb.XGBClassifier(objective = "multi:softprob", n_estimators = 100, learning_rate = 0.1, early_stopping_rounds = 10, eval_metric = ['auc', 'merror'], random_state = 0)

xgb_clf.fit(X_train_sm, y_train_sm, eval_set = [(X_test_trans1, y_test)])

xgb_train_acc = accuracy_score(y_train_sm, xgb_clf.predict(X_train_sm))
xgb_test_acc = accuracy_score(y_test, xgb_clf.predict(X_test_trans1))

print('Train score: ', xgb_train_acc)
print('Test score: ', xgb_test_acc)

[0]	validation_0-auc:0.59869	validation_0-merror:0.27788
[1]	validation_0-auc:0.59593	validation_0-merror:0.28404
[2]	validation_0-auc:0.58815	validation_0-merror:0.28774
[3]	validation_0-auc:0.58792	validation_0-merror:0.64941
[4]	validation_0-auc:0.58569	validation_0-merror:0.29452
[5]	validation_0-auc:0.59475	validation_0-merror:0.30561
[6]	validation_0-auc:0.59529	validation_0-merror:0.30129
[7]	validation_0-auc:0.59799	validation_0-merror:0.29205
[8]	validation_0-auc:0.60177	validation_0-merror:0.29760
[9]	validation_0-auc:0.60792	validation_0-merror:0.30253
Train score:  0.6521603396603397
Test score:  0.7221195317313617


In [417]:
pickle.dump(xgb_clf, open("trained_xgb_model", "wb"))

## Voting classification

In [407]:
from sklearn.ensemble import VotingClassifier

vote_clf = VotingClassifier(estimators=[('rf_clf', rf_clf), ('ada_clf', ada_clf), ('grad_clf', grad_clf)],
                        voting='soft', weights=[1,1,2])

vote_clf.fit(X_train_sm, y_train_sm)

print('Train score: ', vote_clf.score(X_train_sm, y_train_sm))
print('Test score: ', vote_clf.score(X_test_trans1, y_test))

KeyboardInterrupt: 

## Model evaluation

In [403]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

train_acc = [nb_clf.score(X_train_sm, y_train_sm), 
             logreg.score(X_train_sm, y_train_sm), 
             rf_clf.score(X_train_sm, y_train_sm), 
             ada_clf.score(X_train_sm, y_train_sm), 
             grad_clf.score(X_train_sm, y_train_sm), 
             xgb_train_acc]

test_acc = [nb_clf.score(X_test_trans1, y_test), 
            logreg.score(X_test_trans1, y_test), 
            rf_clf.score(X_test_trans1, y_test), 
            ada_clf.score(X_test_trans1, y_test), 
            grad_clf.score(X_test_trans1, y_test), 
            xgb_test_acc]

precision = [precision_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
            precision_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro')]

recall = [recall_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
         recall_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro')]

f1 = [f1_score(y_test, nb_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, logreg.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, rf_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, ada_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, grad_clf.predict(X_test_trans1), average = 'micro'), 
     f1_score(y_test, xgb_clf.predict(X_test_trans1), average = 'micro')]

auc = [roc_auc_score(y_test, nb_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, logreg.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, rf_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, ada_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, grad_clf.predict_proba(X_test_trans1), multi_class='ovr'), 
      roc_auc_score(y_test, xgb_clf.predict_proba(X_test_trans1), multi_class='ovr')]

#create model performance dataframe
mdl_perf_df = pd.DataFrame({'Train Accuracy': train_acc, 
                           'Test Accuracy': test_acc, 
                           'Precision': precision, 
                           'Recall': recall, 
                           'F1-score': f1, 
                           'AUC': auc}, index = ['MultinomialNB', 'Logistic Regression', 'Random Forest', 'Adaboost', 'Gradient Boost', 'XGBoost'])

mdl_perf_df

Unnamed: 0,Train Accuracy,Test Accuracy,Precision,Recall,F1-score,AUC
MultinomialNB,0.804133,0.432532,0.432532,0.432532,0.432532,0.616154
Logistic Regression,0.700175,0.420826,0.420826,0.420826,0.420826,0.621372
Random Forest,0.572303,0.728897,0.728897,0.728897,0.728897,0.555339
Adaboost,0.586039,0.683919,0.683919,0.683919,0.683919,0.593406
Gradient Boost,0.642607,0.693161,0.693161,0.693161,0.693161,0.636407
XGBoost,0.65216,0.72212,0.72212,0.72212,0.72212,0.62677


In [418]:
pickle.dump(mdl_perf_df, open("model_performance_df", "wb"))