In [None]:
# Muhamed Hisham bin Mohamed Bahurudeen (IS01081947)
# Muhammad Afiq Fikri Bin Ahmad Sabri (IS01082516)

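# In this project, we compared VADER (lexicon-based) and Naïve Bayes (machine learning) for sentiment analysis.
# VADER reached 79% accuracy but struggled with the neutral (f1-score: 0.05) and negative (f1-score: 0.37) classes.
# Naïve Bayes performed better, with 85% accuracy and higher f1-scores for negative (0.65) and positive (0.92);
# its neutral f1-score (0.27) was also higher than VADER's but remained weak.
# Note: VADER was scored on all 568,447 reviews, whereas Naïve Bayes was scored on a 30% hold-out (170,535 reviews),
# so the two accuracy figures are indicative rather than strictly like-for-like.
# Overall, Naïve Bayes is more accurate and reliable for classifying sentiment.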



In [2]:
!pip install vaderSentiment scikit-learn pandas --quiet

import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


In [4]:
# Read the preprocessed reviews and clean them in a single chain:
#   1. discard rows missing either the lemmatized text or the sentiment label
#   2. retain only rows labelled positive / neutral / negative
df = (
    pd.read_csv("preprocessed_reviews_final.csv")
    .dropna(subset=["lemmatized", "Sentiment"])
    .loc[lambda frame: frame['Sentiment'].isin(['positive', 'neutral', 'negative'])]
)

# Preview the first rows of the cleaned frame
df.head()


Unnamed: 0,ProductId,Score,lemmatized,tokenized,Sentiment
0,B001E4KFG0,5,buy several vitality can dog food product find...,"['buy', 'several', 'vitality', 'can', 'dog', '...",positive
1,B00813GRG4,1,product arrive labeled jumbo salt peanutsthe p...,"['product', 'arrive', 'labeled', 'jumbo', 'sal...",negative
2,B000LQOCH0,4,confection around century light pillowy citrus...,"['confection', 'around', 'century', 'light', '...",positive
3,B000UA0QIQ,2,look secret ingredient robitussin believe find...,"['look', 'secret', 'ingredient', 'robitussin',...",negative
4,B006K2ZZ7K,5,great taffy great price wide assortment yummy ...,"['great', 'taffy', 'great', 'price', 'wide', '...",positive


In [6]:
# Create one VADER analyzer up front; vader_sentiment below reads it as a module-level name.
analyzer = SentimentIntensityAnalyzer()


def vader_sentiment(text, threshold=0.05, scorer=None):
    """Label ``text`` as 'positive', 'negative' or 'neutral' from its compound score.

    Parameters
    ----------
    text : str
        Text handed to ``polarity_scores``.
    threshold : float, default 0.05
        Half-width of the neutral band. A compound score >= ``threshold`` is
        'positive', a score <= ``-threshold`` is 'negative', and anything
        strictly in between is 'neutral'. The default reproduces the original
        hard-coded +/-0.05 cut-offs.
    scorer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` entry. When omitted, the module-level
        ``analyzer`` is used, exactly as before.

    Returns
    -------
    str
        One of 'positive', 'negative', 'neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative (the bands would overlap).
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    # Fall back to the shared analyzer so existing one-argument calls keep working.
    if scorer is None:
        scorer = analyzer

    compound = scorer.polarity_scores(text)['compound']
    if compound >= threshold:
        return 'positive'
    if compound <= -threshold:
        return 'negative'
    return 'neutral'

# Apply to lemmatized: one VADER label per review, stored alongside the true label
df['vader_predicted'] = df['lemmatized'].apply(vader_sentiment)

# Pin the class order explicitly. With target_names alone, the names are matched
# positionally against the sorted labels found in the data, so a missing or extra
# class would silently mislabel rows (or raise); passing labels= removes that risk.
report_labels = ['negative', 'neutral', 'positive']

# Report
# NOTE: this scores VADER on every row of df, whereas the Naive Bayes report further
# down is computed on the 30% hold-out only, so the two accuracies are not strictly
# like-for-like.
print("\nClassification Report for VADER:")
print(classification_report(df['Sentiment'], df['vader_predicted'], labels=report_labels, target_names=report_labels))



Classification Report for VADER:
              precision    recall  f1-score   support

    negative       0.58      0.27      0.37     82036
     neutral       0.12      0.03      0.05     42640
    positive       0.82      0.96      0.88    443771

    accuracy                           0.79    568447
   macro avg       0.51      0.42      0.43    568447
weighted avg       0.73      0.79      0.75    568447



In [8]:
# Features are the lemmatized review text; the target is the sentiment label
X = df['lemmatized']
y = df['Sentiment']

# Split into train-test (70/30, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize using Bag of Words.
# The vocabulary is learned from the training split only and then reused on the
# test split, so no information from the test reviews leaks into the features.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)

# Predict
y_pred_nb = nb_model.predict(X_test_vec)

# Pin the class order explicitly. With target_names alone, the names are matched
# positionally against the sorted labels found in the data, so a missing or extra
# class would silently mislabel rows (or raise); passing labels= removes that risk.
report_labels = ['negative', 'neutral', 'positive']

# Report
print("\nClassification Report for Naive Bayes:")
print(classification_report(y_test, y_pred_nb, labels=report_labels, target_names=report_labels))



Classification Report for Naive Bayes:
              precision    recall  f1-score   support

    negative       0.70      0.60      0.65     24657
     neutral       0.50      0.18      0.27     12816
    positive       0.88      0.96      0.92    133062

    accuracy                           0.85    170535
   macro avg       0.70      0.58      0.61    170535
weighted avg       0.83      0.85      0.83    170535

