## Extract data, run only first time

In [60]:
import zipfile as zipfile
import os as os

zip_file_path = "archive1.zip"
extract_folder_path = "extracted_archive"

with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
    zip_ref.extractall(extract_folder_path)

extracted_files = os.listdir(extract_folder_path)
print(extracted_files)


['IMDB Dataset.csv']


## Load dataset from extracted data

In [61]:
import pandas as pd

extract_folder_path = "extracted_archive"
dataset_path = os.path.join(extract_folder_path, "IMDB Dataset.csv")
df = pd.read_csv(dataset_path, header=None, names=["review", "sentiment"])

## Preprocessing

#### First download necessary NLTK data

In [62]:
# Download necessary NLTK data
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jakov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### After nltk is downloaded, preprocess the data

In [63]:
import re

lemmatizer = nltk.WordNetLemmatizer()
stop_words = set(nltk.corpus.stopwords.words("english"))


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetical characters
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)


# Apply preprocessing to the dataset
df["processed_review"] = df["review"].apply(preprocess_text)

## Emotion analysis

In [64]:
from nrclex import NRCLex


# Emotion analysis using NRC
def analyze_emotions(text):
    emotion = NRCLex(text)
    return emotion.affect_frequencies


# Get emotion scores
df["emotion_analysis"] = df["processed_review"].apply(analyze_emotions)


# Extract emotion scores into separate columns
def extract_emotion_scores(emotion_analysis):
    scores = {
        "anger": emotion_analysis.get("anger", 0),
        "anticipation": emotion_analysis.get("anticipation", 0),
        "disgust": emotion_analysis.get("disgust", 0),
        "fear": emotion_analysis.get("fear", 0),
        "joy": emotion_analysis.get("joy", 0),
        "sadness": emotion_analysis.get("sadness", 0),
        "surprise": emotion_analysis.get("surprise", 0),
        "trust": emotion_analysis.get("trust", 0),
    }
    return scores


df["emotion_scores"] = df["emotion_analysis"].apply(extract_emotion_scores)

## Prepare data for training

In [65]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Prepare features (X) and target (y)
X = pd.json_normalize(df["emotion_scores"])  # Features (emotion scores)
y = df["sentiment"]  # Target labels (sentiment)

# Label encoding for sentiment
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80%/20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Train model

In [66]:
# Train the model (Random Forest)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [67]:
# Make predictions
y_pred = model.predict(X_test)

In [68]:
# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))  # Precision, Recall, F1-Score
print("Accuracy Score:", accuracy_score(y_test, y_pred))  # Overall accuracy

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.69      0.68      4994
           1       0.68      0.66      0.67      5007

    accuracy                           0.68     10001
   macro avg       0.68      0.68      0.68     10001
weighted avg       0.68      0.68      0.68     10001

Accuracy Score: 0.6757324267573243


In [69]:
import joblib  # For saving the model

# Save the trained model and label encoder
joblib.dump(model, "sentiment_model.pkl")  # Save the model
joblib.dump(label_encoder, "label_encoder.pkl")  # Save the label encoder

['label_encoder.pkl']

In [70]:
# Optional: Save the processed data if needed for further analysis
df.to_csv("processed_data.csv", index=False)

In [71]:
# Example of loading the saved model and predicting on new data
# Load the saved model
loaded_model = joblib.load("sentiment_model.pkl")
loaded_label_encoder = joblib.load("label_encoder.pkl")

In [94]:
# Predict sentiment for a new review
new_review = "The editing is sloppy and the cinematography, lighting, and grading are utterly incomprehensible."
processed_review = preprocess_text(new_review)  # Preprocess the new review
emotion_analysis = analyze_emotions(processed_review)  # Get emotion scores
print(emotion_analysis)
emotion_scores = extract_emotion_scores(emotion_analysis)  # Extract emotion scores
print(emotion_scores)


{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 'trust': 0.0, 'surprise': 0.0, 'positive': 0.0, 'negative': 0.6666666666666666, 'sadness': 0.0, 'disgust': 0.3333333333333333, 'joy': 0.0}
{'anger': 0.0, 'anticipation': 0, 'disgust': 0.3333333333333333, 'fear': 0.0, 'joy': 0.0, 'sadness': 0.0, 'surprise': 0.0, 'trust': 0.0}


In [95]:
# Convert emotion scores to a DataFrame for prediction
X_new = pd.DataFrame([emotion_scores])  

In [96]:
# Print emotion scores
print("Emotion Scores:")
for emotion, score in emotion_scores.items():
    print(f"{emotion.capitalize()}: {score:.2f}")

Emotion Scores:
Anger: 0.00
Anticipation: 0.00
Disgust: 0.33
Fear: 0.00
Joy: 0.00
Sadness: 0.00
Surprise: 0.00
Trust: 0.00


In [97]:
# Predict sentiment
predicted = loaded_model.predict(X_new)
predicted_sentiment = loaded_label_encoder.inverse_transform(predicted)
print(f"\nPredicted Sentiment: {predicted_sentiment[0]}")


Predicted Sentiment: negative
