Let's start by removing all the unnecessary columns and keeping the only two that interest us: Text and Sentiment.

In [93]:
import pandas as pd

file_path = 'sentimentdataset.csv'
df = pd.read_csv(file_path)

# Select only the Text and Sentiment columns
df = df[['Text', 'Sentiment']]

df.head()

Unnamed: 0,Text,Sentiment
0,Enjoying a beautiful day at the park! ...,Positive
1,Traffic was terrible this morning. ...,Negative
2,Just finished an amazing workout! 💪 ...,Positive
3,Excited about the upcoming weekend getaway! ...,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral


Let's start by seeing what are the individual sentiments present in the dataset.

In [94]:
df['Sentiment'].unique()

array([' Positive  ', ' Negative  ', ' Neutral   ', ' Anger        ',
       ' Fear         ', ' Sadness      ', ' Disgust      ',
       ' Happiness    ', ' Joy          ', ' Love         ',
       ' Amusement    ', ' Enjoyment    ', ' Admiration   ',
       ' Affection    ', ' Awe          ', ' Disappointed ',
       ' Surprise     ', ' Acceptance   ', ' Adoration    ',
       ' Anticipation ', ' Bitter       ', ' Calmness     ',
       ' Confusion    ', ' Excitement   ', ' Kind         ',
       ' Pride        ', ' Shame        ', ' Confusion ', ' Excitement ',
       ' Shame ', ' Elation       ', ' Euphoria      ', ' Contentment   ',
       ' Serenity      ', ' Gratitude     ', ' Hope          ',
       ' Empowerment   ', ' Compassion    ', ' Tenderness    ',
       ' Arousal       ', ' Enthusiasm    ', ' Fulfillment  ',
       ' Reverence     ', ' Compassion', ' Fulfillment   ', ' Reverence ',
       ' Elation   ', ' Despair         ', ' Grief           ',
       ' Loneliness     

Let's clean these up of any whitespaces.

In [95]:
df['Sentiment'] = df['Sentiment'].str.strip()

Let's now classify the sentiments as only 'Positive', 'Negative' and 'Neutral' for the sake of simplicity.

In [96]:
sentiment_classification = {
    'Positive': [
        'Positive', 'Happy', 'Joy', 'Love', 'Amusement', 'Enjoyment', 'Happiness', 'Surprise',
       'Adoration', 'Admiration', 'Anticipation', 'Affection', 'Awe', 'Excitement', 'Kind', 
       'Pride', 'Elation', 'Euphoria', 'Contentment', 'Serenity', 'Gratitude', 'Hope', 
        'Empowerment', 'Compassion', 'Tenderness', 'Arousal', 'Enthusiasm', 
        'Fulfillment', 'Reverence', 'Inspiration', 'Motivation', 'Thrill',
       'Bittersweet', 'Overjoyed', 'JoyfulReunion', 'Blessed',
       'Appreciation', 'Enchantment', 'Amusement', 'Satisfaction', 'Optimism', 
       'Accomplishment', 'Wonderment', 'Confidence', 'Playful', 'Free-spirited',
        'Radiance', 'Rejuvenation', 'Adventure','Nostalgia', 'Determination', 'Zest',
        'Hopeful', 'Proud', 'Grateful', 'Empathetic', 'Compassionate', 'Inspired', 'Confident', 
        'Creativity', 'PlayfulJoy', 'Resilience', 'Spark', 'Marvel',
       'Mindfulness', 'DreamChaser', 'Elegance', 'Whimsy', 'Wonder',
       'Coziness', 'Melodic', 'FestiveJoy', 'InnerJourney', 'Freedom',
       'Dazzle', 'Adrenaline', 'ArtisticBurst', 'CulinaryOdyssey', 
       'Positivity', 'Kindness', 'Friendship', 'Success', 'Exploration',
       'Amazement', 'Romance', 'Captivation', 'Tranquility', 'Grandeur',
       'Emotion', 'Energy', 'Celebration', 'Charm', 'Ecstasy', 'Colorful',
       'Hypnotic', 'Connection', 'Iconic', 'Journey', 'Engagement',
       'Touched', 'Triumph', 'Heartwarming', 'Sympathy', 'Runway Creativity', "Ocean's Freedom",
       'Renewed Effort', 'Challenge', 'Breakthrough', 'Joy in Baking',
       'Envisioning History', 'Imagination', 'Vibrancy', 'Mesmerizing',
       'Culinary Adventure', 'Winter Magic', 'Thrilling Journey',
       "Nature's Beauty", 'Celestial Wonder', 'Creative Inspiration'
    ],
    'Negative': [
        'Negative', 'Anger', 'Fear', 'Sadness', 'Disgust', 'Disappointed', 
        'Bitter', 'Calmness', 'Confusion', 'Frustration', 'Shame', 'Grief', 
        'Loneliness', 'Jealousy', 'Resentment', 'Anxiety', 'Intimidation', 
        'Helplessness', 'Envy', 'Regret', 'Despair', 'Isolation', 'Loss', 
        'Heartache', 'Solitude', 'Betrayal', 'Sorrow', 'Darkness', 
        'Desperation', 'Ruins', 'Desolation', 'Exhaustion', 'Emotionalstorm', 
        'Miscalculation', 'Obstacle', 'Pressure', 'Boredom', 'Melancholy',
        'Bitterness', 'Fearful', 'Apprehensive', 'Overwhelmed', 'Jealous',
       'Devastated', 'Frustrated', 'Envious', 'Dismissive', 'Heartbreak',
       'Suffering', 'EmotionalStorm', 'Disappointment', 'LostLove', 'Whispers of the Past',
        'Embarrassed', 'Sad', 'Hate', 'Bad'
    ],
    'Neutral': [
        'Neutral', 'Acceptance', 'Indifference', 'Numbness', 'Ambivalence', 
        'Reflection', 'Pensive', 'Solace', 'Harmony', 'Contemplation', 'Curiosity',
        'Yearning', 'Intrigue', 'Immersion', 'Suspense', 'Relief', 'Mischievous'
    ]
}

# Create a reverse lookup dictionary
reverse_lookup = {sentiment: key for key, values in sentiment_classification.items() for sentiment in values}

df['Sentiment'] = df['Sentiment'].replace(reverse_lookup)

df['Sentiment'].unique()


array(['Positive', 'Negative', 'Neutral'], dtype=object)

### **VADER** - Bag of Words Approach

In [97]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Ensure NLTK datasets are downloaded
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/janvigoje/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/janvigoje/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/janvigoje/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/janvigoje/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/janvigoje/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [98]:
def classify_sentiment(text):
    scores = sia.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        sentiment = 'Positive'
    elif compound <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    return sentiment

df['Sentiment_VADER']= df['Text'].apply(classify_sentiment)
df




Unnamed: 0,Text,Sentiment,Sentiment_VADER
0,Enjoying a beautiful day at the park! ...,Positive,Positive
1,Traffic was terrible this morning. ...,Negative,Negative
2,Just finished an amazing workout! 💪 ...,Positive,Positive
3,Excited about the upcoming weekend getaway! ...,Positive,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral,Neutral
...,...,...,...
727,Collaborating on a science project that receiv...,Positive,Positive
728,Attending a surprise birthday party organized ...,Positive,Positive
729,Successfully fundraising for a school charity ...,Positive,Positive
730,"Participating in a multicultural festival, cel...",Positive,Positive


### **Hugging Face** - Roberta Pretained Model

In [99]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax


In [100]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)



In [101]:
#Example
encoded_text = tokenizer("I am so happy!", return_tensors='pt')
results = model(**encoded_text)
scores = results[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': 0.0016519686, 'roberta_neu': 0.006504846, 'roberta_pos': 0.99184316}


In [102]:
def classify_sentiment_2(text):
    encoded_text = tokenizer(text, return_tensors='pt')
    results = model(**encoded_text)
    scores = results[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    if scores_dict['roberta_neg'] > scores_dict['roberta_neu'] and scores_dict['roberta_neg'] > scores_dict['roberta_pos']:
        return 'Negative'
    elif  scores_dict['roberta_pos'] > scores_dict['roberta_neu'] and scores_dict['roberta_pos'] > scores_dict['roberta_neg']:
        return 'Positive'
    else:
        return 'Neutral'
    

df['Sentiment_Roberta']= df['Text'].apply(classify_sentiment_2)
df

Unnamed: 0,Text,Sentiment,Sentiment_VADER,Sentiment_Roberta
0,Enjoying a beautiful day at the park! ...,Positive,Positive,Positive
1,Traffic was terrible this morning. ...,Negative,Negative,Negative
2,Just finished an amazing workout! 💪 ...,Positive,Positive,Positive
3,Excited about the upcoming weekend getaway! ...,Positive,Positive,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral,Neutral,Neutral
...,...,...,...,...
727,Collaborating on a science project that receiv...,Positive,Positive,Positive
728,Attending a surprise birthday party organized ...,Positive,Positive,Positive
729,Successfully fundraising for a school charity ...,Positive,Positive,Positive
730,"Participating in a multicultural festival, cel...",Positive,Positive,Positive


### **The Transformers Pipeline** : Quick and Easy

In [103]:
from transformers import pipeline

sent_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [104]:
sent_pipeline('I love sentiment analysis!')

[{'label': 'POSITIVE', 'score': 0.9997853636741638}]

In [105]:
def classify_sentiment_3(text):
    result = sent_pipeline(text)
    result[0]['label'] = result[0]['label'].capitalize()
    return result[0]['label'] 
    

df['Sentiment_Pipeline']= df['Text'].apply(classify_sentiment_3)
df

Unnamed: 0,Text,Sentiment,Sentiment_VADER,Sentiment_Roberta,Sentiment_Pipeline
0,Enjoying a beautiful day at the park! ...,Positive,Positive,Positive,Positive
1,Traffic was terrible this morning. ...,Negative,Negative,Negative,Negative
2,Just finished an amazing workout! 💪 ...,Positive,Positive,Positive,Positive
3,Excited about the upcoming weekend getaway! ...,Positive,Positive,Positive,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral,Neutral,Neutral,Negative
...,...,...,...,...,...
727,Collaborating on a science project that receiv...,Positive,Positive,Positive,Positive
728,Attending a surprise birthday party organized ...,Positive,Positive,Positive,Positive
729,Successfully fundraising for a school charity ...,Positive,Positive,Positive,Positive
730,"Participating in a multicultural festival, cel...",Positive,Positive,Positive,Positive


### **TextBlob**

In [106]:
from textblob import TextBlob

def classify_sentiment_4(text):
    blob = TextBlob(text)
    score = blob.sentiment.polarity
    if score >= 0.05:
        sentiment = 'Positive'
    elif score <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'
    return sentiment

df['Sentiment_Blob']= df['Text'].apply(classify_sentiment_4)
df

Unnamed: 0,Text,Sentiment,Sentiment_VADER,Sentiment_Roberta,Sentiment_Pipeline,Sentiment_Blob
0,Enjoying a beautiful day at the park! ...,Positive,Positive,Positive,Positive,Positive
1,Traffic was terrible this morning. ...,Negative,Negative,Negative,Negative,Negative
2,Just finished an amazing workout! 💪 ...,Positive,Positive,Positive,Positive,Positive
3,Excited about the upcoming weekend getaway! ...,Positive,Positive,Positive,Positive,Positive
4,Trying out a new recipe for dinner tonight. ...,Neutral,Neutral,Neutral,Negative,Positive
...,...,...,...,...,...,...
727,Collaborating on a science project that receiv...,Positive,Positive,Positive,Positive,Positive
728,Attending a surprise birthday party organized ...,Positive,Positive,Positive,Positive,Positive
729,Successfully fundraising for a school charity ...,Positive,Positive,Positive,Positive,Positive
730,"Participating in a multicultural festival, cel...",Positive,Positive,Positive,Positive,Positive


### Measuring Accuracy

In [4]:
methods = {
    'VADER': 'Sentiment_VADER',
    'RoBERTa': 'Sentiment_Roberta',
    'Pipeline': 'Sentiment_Pipeline',
    'TextBlob': 'Sentiment_Blob'
}

methods_list = []
true_percentages = []
false_percentages = []

for method, column in methods.items():
    match_column = f"Match_{method}"
    df[match_column] = df['Sentiment'] == df[column]
    
    # Calculating match frequency
    match_frequency = df[match_column].value_counts(normalize=True) * 100
    
    true_percentage = match_frequency.get(True, 0) # If True is not present, it will return 0
    false_percentage = match_frequency.get(False, 0) # Same here
    
    methods_list.append(method)
    true_percentages.append(true_percentage)
    false_percentages.append(false_percentage)

data = {
    "Method": methods_list,
    "True (%)": true_percentages,
    "False (%)": false_percentages
}

accuracy_df = pd.DataFrame(data)
accuracy_df


Unnamed: 0,Method,True (%),False (%)
0,VADER,77.459016,22.540984
1,RoBERTa,80.601093,19.398907
2,Pipeline,79.644809,20.355191
3,TextBlob,48.224044,51.775956


Therefore, we select Roberta's Pretained Model, from Hugging face, as it has 80% accuracy. 