In [1]:
import pandas as pd
from textblob import TextBlob

### Data Collection:

In [2]:
# Read the CSV file into a DataFrame.
df = pd.read_csv('starbucks_post_comment_raw_data.csv')

In [3]:
df

Unnamed: 0,post_id,post_author,comment_auther,comment_id,comment_body,comment_score,comment_created_utc,comment_permalink,is_root_comment,comment_author_flair_text
0,xr8fc3,swiftlocked,Saint-Claire,iqdfbj8,That's really bold and honestly why would they...,2577,1.664463e+09,/r/starbucks/comments/xr8fc3/when_they_buy_a_0...,True,Former Partner
1,xr8fc3,swiftlocked,angiehawkeye,iqdmth8,"I'm confused, they only paid 5 cents and thoug...",1286,1.664466e+09,/r/starbucks/comments/xr8fc3/when_they_buy_a_0...,True,
2,xr8fc3,swiftlocked,CDNnotintheknow,iqdknrl,No is a complete answer.,319,1.664465e+09,/r/starbucks/comments/xr8fc3/when_they_buy_a_0...,True,:Customer: Customer
3,xr8fc3,swiftlocked,bearssaygrrr,iqdkdq1,"Send them an empty bag, the AUDACITY of this fool",842,1.664465e+09,/r/starbucks/comments/xr8fc3/when_they_buy_a_0...,True,
4,xr8fc3,swiftlocked,,iqdlkuq,make the frap but dump it in the bag 😇,1000,1.664465e+09,/r/starbucks/comments/xr8fc3/when_they_buy_a_0...,True,
...,...,...,...,...,...,...,...,...,...,...
525,rco8xd,witchezbrew,HaleyxErin,hnxbokm,Then take it out before i ever see it because ...,1,1.639098e+09,/r/starbucks/comments/rco8xd/they_won_the_unio...,False,Former Partner
526,rco8xd,witchezbrew,HaleyxErin,hnx9ijf,Well can you select not to pay them while work...,2,1.639097e+09,/r/starbucks/comments/rco8xd/they_won_the_unio...,False,Former Partner
527,rco8xd,witchezbrew,HaleyxErin,hnxamvt,.8 cents is literally nothing at all.,-1,1.639097e+09,/r/starbucks/comments/rco8xd/they_won_the_unio...,False,Former Partner
528,rco8xd,witchezbrew,HaleyxErin,hnxagol,The worst working condition was the customers....,2,1.639097e+09,/r/starbucks/comments/rco8xd/they_won_the_unio...,False,Former Partner


### Text Preprocessing

In [4]:
# Specify the columns you want to drop
columns_to_drop = ['post_id', 'post_author', 'comment_auther', 'comment_score', 'comment_created_utc', 'comment_permalink', 'is_root_comment', 'comment_author_flair_text']

# Drop the specified columns
df.drop(columns=columns_to_drop, inplace=True)


In [5]:
df

Unnamed: 0,comment_id,comment_body
0,iqdfbj8,That's really bold and honestly why would they...
1,iqdmth8,"I'm confused, they only paid 5 cents and thoug..."
2,iqdknrl,No is a complete answer.
3,iqdkdq1,"Send them an empty bag, the AUDACITY of this fool"
4,iqdlkuq,make the frap but dump it in the bag 😇
...,...,...
525,hnxbokm,Then take it out before i ever see it because ...
526,hnx9ijf,Well can you select not to pay them while work...
527,hnxamvt,.8 cents is literally nothing at all.
528,hnxagol,The worst working condition was the customers....


##### Lowercasing:

In [6]:
# Apply lowercasing to the 'comment_body' column
df['comment_body'] = df['comment_body'].str.lower()

In [7]:
df

Unnamed: 0,comment_id,comment_body
0,iqdfbj8,that's really bold and honestly why would they...
1,iqdmth8,"i'm confused, they only paid 5 cents and thoug..."
2,iqdknrl,no is a complete answer.
3,iqdkdq1,"send them an empty bag, the audacity of this fool"
4,iqdlkuq,make the frap but dump it in the bag 😇
...,...,...
525,hnxbokm,then take it out before i ever see it because ...
526,hnx9ijf,well can you select not to pay them while work...
527,hnxamvt,.8 cents is literally nothing at all.
528,hnxagol,the worst working condition was the customers....


##### Removing Punctuation:

In [8]:
import string

In [9]:
# Define a function to remove punctuation using regex
def remove_punctuation(text):
    # Replace all punctuation characters with an empty string
    return ''.join([char for char in text if char not in string.punctuation])


In [10]:
# Apply the remove_punctuation function to the 'comment_body' column
df['comment_body'] = df['comment_body'].apply(remove_punctuation)

In [11]:
df

Unnamed: 0,comment_id,comment_body
0,iqdfbj8,thats really bold and honestly why would they ...
1,iqdmth8,im confused they only paid 5 cents and thought...
2,iqdknrl,no is a complete answer
3,iqdkdq1,send them an empty bag the audacity of this fool
4,iqdlkuq,make the frap but dump it in the bag 😇
...,...,...
525,hnxbokm,then take it out before i ever see it because ...
526,hnx9ijf,well can you select not to pay them while work...
527,hnxamvt,8 cents is literally nothing at all
528,hnxagol,the worst working condition was the customers ...


In [12]:
import re

In [15]:
# Define a string-based regex pattern to match emoticons
emoticon_pattern = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001FC00-\U0001FCFF\U0001FD00-\U0001FDFF\U0001FE00-\U0001FEFF\U0001FF00-\U0001FFFF]+'

In [16]:
# Apply the emoticon removal to the 'comment_body' column with regex=True
df['comment_body'] = df['comment_body'].str.replace(emoticon_pattern, '', regex=True)

In [17]:
df

Unnamed: 0,comment_id,comment_body
0,iqdfbj8,thats really bold and honestly why would they ...
1,iqdmth8,im confused they only paid 5 cents and thought...
2,iqdknrl,no is a complete answer
3,iqdkdq1,send them an empty bag the audacity of this fool
4,iqdlkuq,make the frap but dump it in the bag
...,...,...
525,hnxbokm,then take it out before i ever see it because ...
526,hnx9ijf,well can you select not to pay them while work...
527,hnxamvt,8 cents is literally nothing at all
528,hnxagol,the worst working condition was the customers ...


##### Stopword Removal

In [22]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/csadminpro16gb/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
# Download the NLTK stopwords list and create a set
stopwords_set = set(stopwords.words('english'))

In [24]:
# Define a function to remove stopwords from a text
def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word.lower() not in stopwords_set])


In [25]:
# Apply the remove_stopwords function to the 'comment_body' column
df['comment_body'] = df['comment_body'].apply(remove_stopwords)

In [26]:
df

Unnamed: 0,comment_id,comment_body
0,iqdfbj8,thats really bold honestly would ever think wo...
1,iqdmth8,im confused paid 5 cents thought could get frap
2,iqdknrl,complete answer
3,iqdkdq1,send empty bag audacity fool
4,iqdlkuq,make frap dump bag
...,...,...
525,hnxbokm,take ever see dont even want pay car note actu...
526,hnx9ijf,well select pay working yeah
527,hnxamvt,8 cents literally nothing
528,hnxagol,worst working condition customers unless could...


In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [28]:
# Initialize the VADER sentiment analyzer
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/csadminpro16gb/nltk_data...


In [29]:
# You can access the 'comment_body' column and calculate sentiment scores
df['sentiment_scores'] = df['comment_body'].apply(lambda x: sid.polarity_scores(x))

In [30]:
# Extract the compound sentiment score (a normalized score) and add it as a new column
df['compound_sentiment'] = df['sentiment_scores'].apply(lambda x: x['compound'])

In [31]:
df

Unnamed: 0,comment_id,comment_body,sentiment_scores,compound_sentiment
0,iqdfbj8,thats really bold honestly would ever think wo...,"{'neg': 0.0, 'neu': 0.531, 'pos': 0.469, 'comp...",0.7328
1,iqdmth8,im confused paid 5 cents thought could get frap,"{'neg': 0.247, 'neu': 0.753, 'pos': 0.0, 'comp...",-0.3182
2,iqdknrl,complete answer,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000
3,iqdkdq1,send empty bag audacity fool,"{'neg': 0.61, 'neu': 0.39, 'pos': 0.0, 'compou...",-0.5719
4,iqdlkuq,make frap dump bag,"{'neg': 0.464, 'neu': 0.536, 'pos': 0.0, 'comp...",-0.3818
...,...,...,...,...
525,hnxbokm,take ever see dont even want pay car note actu...,"{'neg': 0.09, 'neu': 0.814, 'pos': 0.096, 'com...",0.0191
526,hnx9ijf,well select pay working yeah,"{'neg': 0.182, 'neu': 0.26, 'pos': 0.558, 'com...",0.4404
527,hnxamvt,8 cents literally nothing,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000
528,hnxagol,worst working condition customers unless could...,"{'neg': 0.291, 'neu': 0.709, 'pos': 0.0, 'comp...",-0.6249


In [32]:
# Classify sentiment as 'positive', 'neutral', or 'negative' based on compound score thresholds
def classify_sentiment(compound_score):
    if compound_score >= 0.05:
        return 'positive'
    elif compound_score <= -0.05:
        return 'negative'
    else:
        return 'neutral'


In [33]:
df['sentiment'] = df['compound_sentiment'].apply(classify_sentiment)

In [34]:
df

Unnamed: 0,comment_id,comment_body,sentiment_scores,compound_sentiment,sentiment
0,iqdfbj8,thats really bold honestly would ever think wo...,"{'neg': 0.0, 'neu': 0.531, 'pos': 0.469, 'comp...",0.7328,positive
1,iqdmth8,im confused paid 5 cents thought could get frap,"{'neg': 0.247, 'neu': 0.753, 'pos': 0.0, 'comp...",-0.3182,negative
2,iqdknrl,complete answer,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral
3,iqdkdq1,send empty bag audacity fool,"{'neg': 0.61, 'neu': 0.39, 'pos': 0.0, 'compou...",-0.5719,negative
4,iqdlkuq,make frap dump bag,"{'neg': 0.464, 'neu': 0.536, 'pos': 0.0, 'comp...",-0.3818,negative
...,...,...,...,...,...
525,hnxbokm,take ever see dont even want pay car note actu...,"{'neg': 0.09, 'neu': 0.814, 'pos': 0.096, 'com...",0.0191,neutral
526,hnx9ijf,well select pay working yeah,"{'neg': 0.182, 'neu': 0.26, 'pos': 0.558, 'com...",0.4404,positive
527,hnxamvt,8 cents literally nothing,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neutral
528,hnxagol,worst working condition customers unless could...,"{'neg': 0.291, 'neu': 0.709, 'pos': 0.0, 'comp...",-0.6249,negative
