In [25]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk

from scipy.sparse import hstack

In [9]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [26]:
# Load the clean and balanced dataset
train_df = pd.read_csv('/content/balanced_train_data_updated.csv')
test_df = pd.read_csv('/content/balanced_test_data_updated.csv')

In [5]:
print("No. of rows in train set: ", train_df.shape[0])
print("No. of rows in test set: ", test_df.shape[0])

No. of rows in train set:  102331
No. of rows in test set:  25583


In [6]:
print("Any NaNs in train set:", train_df.isna().values.any())
print("Any NaNs in test set:", test_df.isna().values.any())

Any NaNs in train set: False
Any NaNs in test set: False


In [1]:
# Explicitly uninstall numpy and gensim
!pip uninstall numpy -y
!pip uninstall gensim -y

# Reinstall the specified versions with --no-cache-dir
!pip install numpy==1.26.4 scipy==1.13.0 gensim==4.3.3 --no-cache-dir

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting gensim==4.3.3
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m243.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m284.6 M

In [27]:
# Combine Text and Summary
train_df['combined'] = train_df['Summary'].fillna('') + " " + train_df['Text'].fillna('')
test_df['combined'] = test_df['Summary'].fillna('') + " " + test_df['Text'].fillna('')

In [29]:
# Apply cleaning and create 'tokens' column
def clean_and_tokenize(combined):
    combined = combined.lower()
    combined = re.sub(r'[' + string.punctuation + ']', ' ', combined)  # Remove punctuation
    tokens = combined.split()
    return tokens # Return list of tokens

train_df['clean_combined'] = train_df['combined'].astype(str).apply(clean_and_tokenize)

# Create 'tokens' column by applying the modified function
train_df['tokens'] = train_df['combined'].astype(str).apply(clean_and_tokenize)

test_df['clean_combined'] = test_df['combined'].astype(str).apply(clean_and_tokenize)

# Create 'tokens' column by applying the modified function
test_df['tokens'] = test_df['combined'].astype(str).apply(clean_and_tokenize)

In [30]:
from gensim.models import Word2Vec

# Preparing tokenized corpus

train_sentences = train_df['tokens'].tolist()
test_sentences = test_df['tokens'].tolist()

In [31]:
# Train Word2Vec on training data

w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [32]:
# Function to get document vectors

def document_vector(doc, model):
    # Filter out-of-vocabulary words
    doc = [word for word in doc if word in model.wv.key_to_index]
    if not doc:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

In [33]:
# Create document vectors for train and test sets

X_train = np.vstack(train_df['tokens'].apply(lambda x: document_vector(x, w2v_model)).values)
y_train = train_df['Score'].values

X_test = np.vstack(test_df['tokens'].apply(lambda x: document_vector(x, w2v_model)).values)
y_test = test_df['Score'].values

In [34]:
# Logistic Regression with GridSearchCV for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],
    'max_iter': [200]
}

grid = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits




In [35]:
# Best model
best_model = grid.best_estimator_

In [36]:
# Predictions
y_pred = best_model.predict(X_test)

In [38]:
# Evaluation
print("\nBest Parameters:", grid.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))


Best Parameters: {'C': 0.1, 'max_iter': 200, 'multi_class': 'multinomial', 'solver': 'lbfgs'}

Accuracy: 0.6994097642966032
Precision (macro): 0.6991901433402119
Recall (macro): 0.699410067443495
F1 Score (macro): 0.6992974631588987

Confusion Matrix:
 [[6030 1932  565]
 [1980 5215 1333]
 [ 547 1333 6648]]

Classification Report:
               precision    recall  f1-score   support

    Negative       0.70      0.71      0.71      8527
     Neutral       0.61      0.61      0.61      8528
    Positive       0.78      0.78      0.78      8528

    accuracy                           0.70     25583
   macro avg       0.70      0.70      0.70     25583
weighted avg       0.70      0.70      0.70     25583

