In [24]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk

from scipy.sparse import hstack

In [25]:
# Load the clean and balanced dataset
train_df = pd.read_csv('/content/balanced_train_data_updated.csv')
test_df = pd.read_csv('/content/balanced_test_data_updated.csv')

In [26]:
# Combine Text and Summary
train_df['combined'] = train_df['Summary'].fillna('') + " " + train_df['Text'].fillna('')
test_df['combined'] = test_df['Summary'].fillna('') + " " + test_df['Text'].fillna('')

In [27]:
# Apply cleaning and create 'tokens' column
def clean_and_tokenize(combined):
    combined = combined.lower()
    combined = re.sub(r'[' + string.punctuation + ']', ' ', combined)  # Remove punctuation
    tokens = combined.split()
    return tokens # Return list of tokens

train_df['clean_combined'] = train_df['combined'].astype(str).apply(clean_and_tokenize)

# Create 'tokens' column by applying the modified function
train_df['tokens'] = train_df['combined'].astype(str).apply(clean_and_tokenize)

test_df['clean_combined'] = test_df['combined'].astype(str).apply(clean_and_tokenize)

# Create 'tokens' column by applying the modified function
test_df['tokens'] = test_df['combined'].astype(str).apply(clean_and_tokenize)

In [None]:
# Uninstall existing versions forcefully
!pip uninstall -y numpy scipy gensim

# Reinstall the specified compatible versions together
# Using slightly older but known compatible versions to avoid potential issues with the very latest
!pip install numpy==1.24.0 scipy==1.10.0 gensim==4.3.0 --no-cache-dir --force-reinstall

Found existing installation: numpy 1.24.0
Uninstalling numpy-1.24.0:
  Successfully uninstalled numpy-1.24.0
Found existing installation: scipy 1.10.0
Uninstalling scipy-1.10.0:
  Successfully uninstalled scipy-1.10.0
Found existing installation: gensim 4.3.0
Uninstalling gensim-4.3.0:
  Successfully uninstalled gensim-4.3.0
Collecting numpy==1.24.0
  Downloading numpy-1.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting scipy==1.10.0
  Downloading scipy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.9/58.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gensim==4.3.0
  Downloading gensim-4.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting smart-open>=1.8.1 (from gensim==4.3.0)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting FuzzyTM>=0.4.0 (from gensim==4.3.0)

In [28]:
# Preparing tokenized corpus
from gensim.models import Word2Vec

train_sentences = train_df['tokens'].tolist()
test_sentences = test_df['tokens'].tolist()

In [29]:
# Train Word2Vec on training data

w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [30]:
# Function to get document vectors

def document_vector(doc, model):
    # Filter out-of-vocabulary words
    doc = [word for word in doc if word in model.wv.key_to_index]
    if not doc:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

In [31]:
# Create document vectors for train and test sets

X_train = np.vstack(train_df['tokens'].apply(lambda x: document_vector(x, w2v_model)).values)
y_train = train_df['Score'].values

X_test = np.vstack(test_df['tokens'].apply(lambda x: document_vector(x, w2v_model)).values)
y_test = test_df['Score'].values

In [32]:
# Scale features to be non-negative (required for MultinomialNB)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_train)

In [33]:
# Define model and parameter grid
nb = MultinomialNB()
param_grid = {
    'alpha': [0.1, 0.5, 1.0]
}
grid = GridSearchCV(nb, param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
grid.fit(X_scaled, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [34]:
# Define the outer cross-validation strategy
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) # Define outer_cv

# Get predictions using outer CV folds
y_pred = cross_val_predict(grid, X_scaled, y_train, cv=outer_cv, n_jobs=-1)

In [35]:
# Best model
best_model = grid.best_estimator_

In [36]:
# Predictions
y_pred = best_model.predict(X_test)

In [38]:
# Evaluation
print("\nBest Parameters:", grid.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))


Best Parameters: {'alpha': 0.1}

Accuracy: 0.5441504123832233
Precision (macro): 0.6495840827484132
Recall (macro): 0.5441432295442655
F1 Score (macro): 0.5385343029570413

Confusion Matrix:
 [[3073 5232  222]
 [1040 7062  426]
 [ 486 4256 3786]]

Classification Report:
               precision    recall  f1-score   support

    Negative       0.67      0.36      0.47      8527
     Neutral       0.43      0.83      0.56      8528
    Positive       0.85      0.44      0.58      8528

    accuracy                           0.54     25583
   macro avg       0.65      0.54      0.54     25583
weighted avg       0.65      0.54      0.54     25583

