In [1]:
import pandas as pd
import numpy as np
import re
import string

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import nltk

from scipy.sparse import hstack

In [2]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
# Load the reduced majority class dataset
train_df = pd.read_csv('/content/train_data_reduced.csv')
test_df = pd.read_csv('/content/test_data_reduced.csv')

In [4]:
# drop rows with NaN

train_df = train_df.dropna()
test_df = test_df.dropna()

In [5]:
# Text cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[' + string.punctuation + ']', ' ', text)  # Remove punctuation
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [6]:
# Apply cleaning
train_df['clean_text'] = train_df['Text'].astype(str).apply(clean_text)
train_df['clean_summary'] = train_df['Summary'].astype(str).apply(clean_text)

test_df['clean_text'] = test_df['Text'].astype(str).apply(clean_text)
test_df['clean_summary'] = test_df['Summary'].astype(str).apply(clean_text)

In [7]:
# TF-IDF vectorization
tfidf_text = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
tfidf_summary = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

X_train_text = tfidf_text.fit_transform(train_df['clean_text'])
X_train_summary = tfidf_summary.fit_transform(train_df['clean_summary'])

X_test_text = tfidf_text.transform(test_df['clean_text'])
X_test_summary = tfidf_summary.transform(test_df['clean_summary'])

In [8]:
# Combine features
X_train = hstack([X_train_text, X_train_summary])
X_test = hstack([X_test_text, X_test_summary])

# Seperate target variable
y_train = train_df['Score']
y_test = test_df['Score']

In [9]:
# Logistic Regression with GridSearchCV for hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs'],
    'multi_class': ['multinomial'],
    'max_iter': [200]
}

grid = GridSearchCV(LogisticRegression(), param_grid, cv=3, scoring='f1_macro', verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Best model
best_model = grid.best_estimator_

In [11]:
# Predictions
y_pred = best_model.predict(X_test)

In [12]:
# Evaluation
print("\nBest Parameters:", grid.best_params_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("Precision (macro):", precision_score(y_test, y_pred, average='macro'))
print("Recall (macro):", recall_score(y_test, y_pred, average='macro'))
print("F1 Score (macro):", f1_score(y_test, y_pred, average='macro'))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive']))


Best Parameters: {'C': 10, 'max_iter': 200, 'multi_class': 'multinomial', 'solver': 'lbfgs'}

Accuracy: 0.8212958222974026
Precision (macro): 0.7852186261372228
Recall (macro): 0.7773648510446002
F1 Score (macro): 0.7806199990960804

Confusion Matrix:
 [[14056  1488   857]
 [ 2032  4949  1547]
 [  874  1231 17895]]

Classification Report:
               precision    recall  f1-score   support

    Negative       0.83      0.86      0.84     16401
     Neutral       0.65      0.58      0.61      8528
    Positive       0.88      0.89      0.89     20000

    accuracy                           0.82     44929
   macro avg       0.79      0.78      0.78     44929
weighted avg       0.82      0.82      0.82     44929

