In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nrclex import NRCLex
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Download necessary NLTK data
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import zipfile
import os

zip_path = "dataset.zip"
extract_path = ""

# Ensure the extract path exists

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)



In [3]:
# Load dataset
df = pd.read_csv("dataset.csv")  # Replace with your actual dataset file


In [4]:
lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words("english"))


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetical characters
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [11]:
def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)
    return "positive" if sentiment_score['compound'] >= 0 else "negative"


In [12]:
def analyze_emotion(text):
    emotion = NRCLex(text)
    emotion_scores = emotion.raw_emotion_scores
    positive_emotions = {"joy", "trust", "anticipation", "surprise"}
    negative_emotions = {"anger", "disgust", "fear", "sadness"}
    pos_score = sum(emotion_scores.get(e, 0) for e in positive_emotions)
    neg_score = sum(emotion_scores.get(e, 0) for e in negative_emotions)
    return emotion_scores, pos_score, neg_score


In [7]:
# Apply preprocessing and analysis
print("Preprocessing text")
df["Cleaned_Reviews"] = df["review"].astype(str).apply(preprocess_text)



Preprocessing text


In [13]:
print("Analyzing sentiment")
df["Predicted_Sentiment"] = df["Cleaned_Reviews"].apply(analyze_sentiment)


Analyzing sentiment


In [16]:
df[["Emotion_Scores", "Positive_Score", "Negative_Score"]] = df["Cleaned_Reviews"].apply(lambda x: pd.Series(analyze_emotion(x)))


In [17]:
df["Emotion_Based_Sentiment"] = df.apply(lambda row: "positive" if row["Positive_Score"] >= row["Negative_Score"] else "negative", axis=1)


In [18]:
print("5")
df["Sentiment_Match"] = df["sentiment"] == df["Emotion_Based_Sentiment"]

5


In [19]:
# Prepare data for training
X = df[["Positive_Score", "Negative_Score"]]
y = df["sentiment"].map({"positive": 1, "negative": 0})

In [26]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
# Train a RandomForestClassifier model
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [28]:
# Evaluate the model
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Model Accuracy: 0.63
              precision    recall  f1-score   support

           0       0.63      0.64      0.63      4961
           1       0.64      0.62      0.63      5039

    accuracy                           0.63     10000
   macro avg       0.63      0.63      0.63     10000
weighted avg       0.63      0.63      0.63     10000



In [29]:
# Test on a new review
new_review = "Great story, made me happy, but the acting was terrible. Overall very bad movie."
new_review_cleaned = preprocess_text(new_review)
new_review_emotions, new_review_pos, new_review_neg = analyze_emotion(new_review_cleaned)
new_review_features = pd.DataFrame([[new_review_pos, new_review_neg]], columns=["Positive_Score", "Negative_Score"])
new_review_sentiment = model.predict(new_review_features)
print("New Review Emotion Scores:")
for emotion, score in new_review_emotions.items():
    print(f"{emotion}: {score}")

print(f"Comparison - Positive Score: {new_review_pos}, Negative Score: {new_review_neg}")
print(f"Predicted Sentiment: {'Positive' if new_review_sentiment[0] == 1 else 'Negative'}")


New Review Emotion Scores:
anticipation: 1
joy: 1
positive: 1
trust: 1
anger: 2
disgust: 2
fear: 2
negative: 2
sadness: 2
Comparison - Positive Score: 3, Negative Score: 8
Predicted Sentiment: Negative
