## Logistic Regression Experiments

## Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import random
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
import optuna
from optuna.samplers import TPESampler
import joblib

In [2]:
random.seed(42)
np.random.seed(42)

- Load the train and test files 

In [3]:
train_df = pd.read_csv("train_data.csv")
val_df  = pd.read_csv("val_data.csv")
test_df  = pd.read_csv("test_data.csv")

In [4]:
print(f"Train: {len(train_df):>6} rows")
print(f" Val : {len(val_df):>6} rows")
print(f" Test: {len(test_df):>6} rows")

Train:   8000 rows
 Val :   1000 rows
 Test:   1000 rows


## Word2Vec - Experiment 3d (Word2Vec trained on SIA reviews)

In [5]:
import nltk
import re, ftfy, string
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words('english'))
punct_table = str.maketrans('', '', string.punctuation)

In [6]:
def clean_text_pipeline(text):
    # Fix Unicode glitches
    text = ftfy.fix_text(text)

    # Strip HTML tags and URLs
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Lowercase 
    text = text.lower()

    # Remove Punctuation 
    #text = text.translate(punct_table)

    # Remove whitespace
    #text = re.sub(r'\s+', ' ', text).strip()

    # # Split into words for stopword removal (next step)
    # words = text.split() 

    # # Drop stopwords
    # words = [w for w in words if w not in stopwords.words('english')]

    return (text)

In [7]:
def w2v_features(texts, vector_size, window, min_count, sg, epochs, workers):
    tok = texts.astype(str).apply(word_tokenize)
    model = Word2Vec(
      sentences   = tok.tolist(),
      vector_size = vector_size,
      window      = window,
      min_count   = min_count,
      sg          = sg,
      epochs      = epochs,
      workers     = workers,
      seed        = 42
    )
    # avg pooling
    def dv(toks):
        vecs = [model.wv[w] for w in toks if w in model.wv]
        return np.mean(vecs, axis=0) if vecs else np.zeros(vector_size)
    return np.vstack(tok.apply(dv).values), model

In [8]:
train_df["clean_text"] = train_df["text"].astype(str).apply(clean_text_pipeline)
val_df  ["clean_text"] = val_df  ["text"].astype(str).apply(clean_text_pipeline)
train_texts = train_df["clean_text"] 
val_texts   = val_df["clean_text"]
y_train = train_df["sentiment_id"]
y_val   = val_df["sentiment_id"]


- Applying Optimal Parameters for Retraining and Evaluation on the Test Set

In [9]:
"""Best macro-F1: 0.7024024557030937
Best hyperparameters: {'vector_size': 200, 'window': 4, 'min_count': 7, 'sg': 1, 'epochs': 14, 'C': 4.674855236819771, 'penalty_solver': 'l1_saga'}"""

"Best macro-F1: 0.7024024557030937\nBest hyperparameters: {'vector_size': 200, 'window': 4, 'min_count': 7, 'sg': 1, 'epochs': 14, 'C': 4.674855236819771, 'penalty_solver': 'l1_saga'}"

In [10]:
best = {'vector_size': 300, 'window': 6, 'min_count': 8, 'sg': 1, 'epochs': 12, 'C': 1.01625, 'penalty_solver': 'l1_saga'}
sentences = [word_tokenize(txt) for txt in train_df["text"].astype(str)]
w2v_final = Word2Vec(
    sentences   = sentences,
    vector_size = best["vector_size"],
    window      = best["window"],
    min_count   = best["min_count"],
    sg          = best["sg"],
    epochs      = best["epochs"],
    workers     = 4,
    seed        = 42
)

In [11]:
# average word2vec vectors for a document
def doc_vector(doc):
    toks = word_tokenize(str(doc))
    vecs = [w2v_final.wv[t] for t in toks if t in w2v_final.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(w2v_final.vector_size)

In [None]:
# Build feature matrices
train_df["clean_text"] = train_df["text"].astype(str).apply(clean_text_pipeline)
test_df ["clean_text"] = test_df ["text"].astype(str).apply(clean_text_pipeline)
X_train = np.vstack(train_df["clean_text"].apply(doc_vector).tolist())
X_test  = np.vstack(test_df ["clean_text"].apply(doc_vector).tolist())
y_train = train_df["sentiment_id"]
y_test  = test_df ["sentiment_id"]

In [None]:
# Train classifier with best params
penalty, solver = best["penalty_solver"].split("_")
clf_final = LogisticRegression(
    C            = best["C"],
    penalty      = penalty,
    solver       = solver,
    class_weight = "balanced",
    random_state = 42,
    max_iter     = 10000,
)
clf_final.fit(X_train, y_train)

In [None]:
y_test_pred = clf_final.predict(X_test)
print(classification_report(y_test,y_test_pred,target_names=["Negative", "Neutral", "Positive"]))

              precision    recall  f1-score   support

    Negative       0.70      0.71      0.71       160
     Neutral       0.33      0.69      0.44       101
    Positive       0.97      0.82      0.89       739

    accuracy                           0.79      1000
   macro avg       0.67      0.74      0.68      1000
weighted avg       0.87      0.79      0.82      1000



In [None]:
joblib.dump(clf_final, "lr_word2vec_experiment3d.pkl")

['lr_word2vec_experiment3c.pkl']