In [1]:
import nltk
import pandas as pd 
from nltk.corpus import stopwords

In [38]:
# Read the pickled most-common-words object (the file name suggests it was built from TF-IDF scores).
# NOTE(review): the path is absolute and machine-specific, so this cell only runs where that folder exists;
# pickles should also only be loaded from a trusted source.
common_words_pickle = '/Users/lihuicham/Desktop/Y2S2/BT4222/project/standup-comedy-analysis/main/pickle/common_words_tfidf.pkl'
most_common_words = pd.read_pickle(common_words_pickle)

In [3]:
# Read the pickled TextBlob sentiment dataframe, then report how many rows fall into each polarity class.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs where that folder exists.
textblob_pickle = '/Users/lihuicham/Desktop/Y2S2/BT4222/project/standup-comedy-analysis/main/pickle/sentiments_textblob.pkl'
df_textblob = pd.read_pickle(textblob_pickle)
textblob_class_counts = df_textblob['Polarity_Class'].value_counts()
print(textblob_class_counts)

1    350
0     65
Name: Polarity_Class, dtype: int64


In [4]:
# Read the pickled VADER sentiment dataframe, then report how many rows fall into each polarity class.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs where that folder exists.
vader_pickle = '/Users/lihuicham/Desktop/Y2S2/BT4222/project/standup-comedy-analysis/main/pickle/sentiments_vader.pkl'
df_vader = pd.read_pickle(vader_pickle)
vader_class_counts = df_vader['Polarity_Class'].value_counts()
print(vader_class_counts)

1    308
0    107
Name: Polarity_Class, dtype: int64


## TextBlob

In [37]:
# Preview the first five rows of the TextBlob dataframe (5 is the default row count of head()).
df_textblob.head(5)

Unnamed: 0,Comedian,Date,Title,Subtitle,Transcript,Polarity_Score,Subjectivity_Score,Polarity_Class,Subjectivity_Class
0,Chris Rock,"March 8, 2023",Selective Outrage (2023) | Transcript,,lets go she said ill do anything you w...,0.053924,0.537392,1,1
1,Marc Maron,"March 3, 2023",Thinky Pain (2013) | Transcript,Marc Maron returns to his old stomping grounds...,i dont know what you were thinking like im no...,0.039222,0.52734,1,1
2,Chelsea Handler,"March 3, 2023",Evolution (2020) | Transcript,Chelsea Handler is back and better than ever -...,join me in welcoming the author of six number ...,0.028674,0.496281,1,0
3,Tom Papa,"March 3, 2023",What A Day! (2022) | Transcript,"Follows Papa as he shares about parenting, his...",premiered on december ladies and gentlemen g...,0.040564,0.541739,1,1
4,Jim Jefferies,"February 22, 2023",High n’ Dry (2023) | Transcript,Jim Jefferies is back and no topic is off limi...,please welcome to the stage jim jefferies hell...,0.059485,0.540981,1,1


## Feature Extraction 

In [6]:
# Gather the index labels of the rows in each polarity class:
# Polarity_Class == 1 is treated as positive, Polarity_Class == 0 as negative.
is_positive = df_textblob['Polarity_Class'] == 1
is_negative = df_textblob['Polarity_Class'] == 0

pos_indexes = df_textblob.index[is_positive].tolist()
neg_indexes = df_textblob.index[is_negative].tolist()

In [7]:
# Pool the most-common-word entries of every row in each polarity class.
# Each entry is used downstream as a (word, frequency) pair; entries repeated
# across rows are kept, so the totals below count duplicates.
pos_words_tuple = []
for row_label in pos_indexes:
    pos_words_tuple.extend(most_common_words[row_label])

neg_words_tuple = []
for row_label in neg_indexes:
    neg_words_tuple.extend(most_common_words[row_label])

print(f"Number of Positive Words : {len(pos_words_tuple)}" )
print(f"Number of Negative Words : {len(neg_words_tuple)}" )

Number of Positive Words : 10500
Number of Negative Words : 1950


In [8]:
# Order each pooled list from most frequent to least frequent.
# The sort key is the second item of every entry (the frequency); the sort is
# stable, so entries with equal frequency keep their pooled order.
sorted_pos_words = list(pos_words_tuple)
sorted_pos_words.sort(key=lambda entry: entry[1], reverse=True)

sorted_neg_words = list(neg_words_tuple)
sorted_neg_words.sort(key=lambda entry: entry[1], reverse=True)

In [9]:
# Keep only the word (first item) of every entry; the frequency ordering is preserved.
pos_words = [entry[0] for entry in sorted_pos_words]
neg_words = [entry[0] for entry in sorted_neg_words]

In [10]:
# Goal: build positive and negative word lists that do not share any word.

# Step 1: find the words that occur in both lists.
pos_vocabulary = set(pos_words)
neg_vocabulary = set(neg_words)
common_list = list(pos_vocabulary.intersection(neg_vocabulary))
print(f"There are {len(common_list)} words that exists in both positive and negative word list.") 

There are 666 words that exists in both positive and negative word list.


In [11]:
# Remove the words shared by both classes so that each list only holds words
# seen in a single class. Order and within-list repeats are left untouched.
#
# Membership is tested against a set: common_list is a list, so
# `w not in common_list` would rescan it for every word. The set lookup
# yields exactly the same lists without the repeated scans.
shared_words = set(common_list)
pos_unique_words = [w for w in pos_words if w not in shared_words]
neg_unique_words = [w for w in neg_words if w not in shared_words]

print(f"Number of Unique Positive Words : {len(pos_unique_words)}" )
print(f"Number of Unique Negative Words : {len(neg_unique_words)}" )

Number of Unique Positive Words : 6583
Number of Unique Negative Words : 573


In [12]:
# more cleaning : remove stop words
# stop_words = set(stopwords.words('english'))
# filtered_pos_unique_words = [w for w in pos_unique_words if w not in stop_words]
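# filtered_neg_unique_words = [w for w in neg_unique_words if w not in stop_words]
# (NOTE: the run recorded below filtered this list against common_list instead of stop_words, so its 573 count does not measure stop-word removal)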

# print(f"Number of Filtered Unique Positive Words : {len(filtered_pos_unique_words)}" )   # 6579
# print(f"Number of Filtered Unique Negative Words : {len(filtered_neg_unique_words)}" )   # 573

# from this result, we know that removing stop words is not very useful 

In [30]:
# Words can still repeat inside each list (especially the positive one),
# because the same word may be among the most common words of several rows.
# Every word should appear only once within its own list.
#
# set() is not suitable here because it does not preserve the sorted order.
# dict.fromkeys() keeps the first occurrence of every key in insertion order,
# so it de-duplicates while preserving the ordering, and it avoids rescanning
# the growing result list for every word.
pos_no_dup = list(dict.fromkeys(pos_unique_words))
neg_no_dup = list(dict.fromkeys(neg_unique_words))

print(f"Number of Unique Positive Words with No Duplicates : {len(pos_no_dup)}" )
print(f"Number of Unique Negative Words with No Duplicates : {len(neg_no_dup)}" )

Number of Unique Positive Words with No Duplicates : 4543
Number of Unique Negative Words with No Duplicates : 552


In [35]:
# Hand-picked tokens to drop from the word lists: music symbols, elongated
# interjections and tokens with stray punctuation or alternate spellings.
own_stop_words = [
    '♫',
    'fucken',
    'motherfcker',
    'whaaaaaaaa',
    'whaaaaaaaaaaaaaaaa',
    'mmmmm',
    'i–i',
    'up♪',
    'like—',
    '♪girl',
    'yeah♪',
    'you-',
]

In [36]:
# Drop every token listed in own_stop_words from the de-duplicated lists;
# the order of the remaining words is unchanged.
filtered_pos_words = [token for token in pos_no_dup if token not in own_stop_words]
filtered_neg_words = [token for token in neg_no_dup if token not in own_stop_words]

print(f"Number of Filtered Positive Words : {len(filtered_pos_words)}" )  
print(f"Number of Filtered Negative Words : {len(filtered_neg_words)}" )  

Number of Filtered Positive Words : 4536
Number of Filtered Negative Words : 548
