## Logistic Regression Experiments

## Load Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import random
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score
import optuna
from optuna.samplers import TPESampler
import joblib

In [2]:
# Seed NumPy's global RNG and the stdlib `random` module with the same fixed
# value so anything that draws from them is repeatable from run to run.
np.random.seed(42)
random.seed(42)

- Load the train, validation, and test files

In [3]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
train_df, val_df, test_df = map(
    pd.read_csv,
    ("train_data.csv", "val_data.csv", "test_data.csv"),
)

In [4]:
# Show the row count of every split, one line per split.
print(
    f"Train: {len(train_df):>6} rows",
    f" Val : {len(val_df):>6} rows",
    f" Test: {len(test_df):>6} rows",
    sep="\n",
)

Train:   8000 rows
 Val :   1000 rows
 Test:   1000 rows


## Word2Vec - Experiment 3 (Word2Vec trained on SIA reviews)

In [5]:
def w2v_features(texts, vector_size, window, min_count, sg, epochs, workers):
    """Train Word2Vec on ``texts`` and embed every document by mean pooling.

    Parameters
    ----------
    texts : Series-like
        Documents; each entry is cast to ``str`` and split with
        ``word_tokenize`` before training.
    vector_size, window, min_count, sg, epochs, workers
        Forwarded unchanged to ``Word2Vec``. The training seed is fixed at 42.

    Returns
    -------
    tuple
        ``(features, model)`` where ``features`` stacks one pooled vector per
        document (rows follow the order of ``texts``) and ``model`` is the
        fitted ``Word2Vec`` instance.
    """
    tokenized = texts.astype(str).apply(word_tokenize)
    w2v = Word2Vec(
        sentences=tokenized.tolist(),
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        epochs=epochs,
        workers=workers,
        seed=42,
    )
    keyed = w2v.wv

    def pool(tokens):
        # Average the vectors of the tokens the model knows; a document with
        # no known token falls back to an all-zero vector of embedding width.
        known = [keyed[t] for t in tokens if t in keyed]
        if not known:
            return np.zeros(vector_size)
        return np.mean(known, axis=0)

    return np.vstack([pool(tokens) for tokens in tokenized]), w2v

In [6]:
# Review text (coerced to str) and sentiment labels for the train and
# validation splits used during hyperparameter tuning.
train_texts, val_texts = (
    frame["text"].astype(str) for frame in (train_df, val_df)
)
y_train, y_val = train_df["sentiment_id"], val_df["sentiment_id"]

- Hyperparameter tuning

In [7]:
def _pooled_doc_vectors(texts, w2v, dim):
    """Mean-pool the Word2Vec vectors of every document in ``texts``.

    Each document is cast to ``str`` and tokenised with ``word_tokenize``.
    A document with no token in the ``w2v`` vocabulary is mapped to a zero
    vector of length ``dim``. Returns a 2-D array with one row per document.
    """
    keyed = w2v.wv

    def pool(doc):
        vecs = [keyed[w] for w in word_tokenize(doc) if w in keyed]
        # Guard on the *filtered* list, not on the raw string: a non-empty
        # document whose tokens are all out of vocabulary would otherwise
        # make np.mean([]) return a scalar NaN and break the vstack below.
        return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

    return np.vstack([pool(doc) for doc in texts.astype(str)])


def objective(trial):
    """Optuna objective: validation macro-F1 of Word2Vec + logistic regression.

    Samples Word2Vec and LogisticRegression hyperparameters from ``trial``,
    trains the embedding on ``train_texts`` via ``w2v_features``, embeds
    ``val_texts`` with the same model, fits the classifier on the training
    features and returns the macro-averaged F1 score on the validation split.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_int`` / ``suggest_float`` /
        ``suggest_categorical``.

    Returns
    -------
    float
        Macro-F1 of the validation predictions (the study maximises it).
    """
    # Word2Vec params
    vs = trial.suggest_int("vector_size", 100, 300, step=100)
    win = trial.suggest_int("window", 3, 7)
    mc = trial.suggest_int("min_count", 1, 10)
    sg = trial.suggest_int("sg", 0, 1)       # 0=CBOW, 1=skip-gram
    epochs = trial.suggest_int("epochs", 5, 20)

    # NOTE(review): gensim documents that training with more than one worker
    # is not fully deterministic even with a fixed seed, so trial scores may
    # vary slightly between runs.
    X_tr, w2v = w2v_features(train_texts, vs, win, mc, sg, epochs, workers=4)
    # Validation documents use the same pooling rule as the training ones,
    # including the zero-vector fallback for fully out-of-vocabulary reviews.
    X_va = _pooled_doc_vectors(val_texts, w2v, vs)

    # LR params
    C = trial.suggest_float("C", 1e-3, 1e3, log=True)
    combo = trial.suggest_categorical("penalty_solver",
                                      ["l1_saga", "l2_lbfgs", "l2_sag", "l2_saga"])
    # Penalty and solver are sampled as one categorical value so that only
    # the listed pairings can be drawn.
    p, s = combo.split("_")
    clf = LogisticRegression(C=C, penalty=p, solver=s,
                             class_weight="balanced",
                             max_iter=10000, random_state=42)
    clf.fit(X_tr, y_train)
    preds = clf.predict(X_va)
    return f1_score(y_val, preds, average="macro")

In [8]:
# Seeded TPE search over `objective`; the study maximises validation macro-F1.
sampler = TPESampler(n_startup_trials=10, seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=20)

# Report the best score found and the configuration that produced it.
print("Best macro-F1:", study.best_value)
print("Best hyperparameters:", study.best_params)

[I 2025-06-22 00:10:24,375] A new study created in memory with name: no-name-b9d2640e-6ab5-4fb8-bde5-7b403fd1c8ec
[I 2025-06-22 00:10:32,536] Trial 0 finished with value: 0.5977086615978003 and parameters: {'vector_size': 200, 'window': 7, 'min_count': 8, 'sg': 1, 'epochs': 7, 'C': 0.00862913219007186, 'penalty_solver': 'l2_lbfgs'}. Best is trial 0 with value: 0.5977086615978003.
[I 2025-06-22 00:10:35,714] Trial 1 finished with value: 0.5937998518003501 and parameters: {'vector_size': 100, 'window': 7, 'min_count': 9, 'sg': 0, 'epochs': 7, 'C': 0.012601639723276799, 'penalty_solver': 'l2_lbfgs'}. Best is trial 0 with value: 0.5977086615978003.
[I 2025-06-22 00:10:50,521] Trial 2 finished with value: 0.652840960580587 and parameters: {'vector_size': 200, 'window': 3, 'min_count': 3, 'sg': 0, 'epochs': 12, 'C': 51.41096648805749, 'penalty_solver': 'l2_sag'}. Best is trial 2 with value: 0.652840960580587.
[I 2025-06-22 00:12:26,560] Trial 3 finished with value: 0.6317086830151316 and par

Best macro-F1: 0.7024024557030937
Best hyperparameters: {'vector_size': 200, 'window': 4, 'min_count': 7, 'sg': 1, 'epochs': 14, 'C': 4.674855236819771, 'penalty_solver': 'l1_saga'}


- Applying Optimal Parameters for Retraining and Evaluation on the Test Set

In [9]:
# Refit Word2Vec on the training texts with the best trial's embedding
# settings; worker count and seed match the values used during tuning.
best = study.best_params
sentences = [word_tokenize(doc) for doc in train_texts]

w2v_final = Word2Vec(
    sentences=sentences,
    workers=4,
    seed=42,
    **{
        name: best[name]
        for name in ("vector_size", "window", "min_count", "sg", "epochs")
    },
)

In [10]:
def doc_vector(doc):
    """Embed one document as the mean of its in-vocabulary ``w2v_final`` vectors.

    ``doc`` is cast to ``str`` and tokenised with ``word_tokenize``. When no
    token is in the ``w2v_final`` vocabulary, a zero vector of length
    ``w2v_final.vector_size`` is returned instead.
    """
    keyed = w2v_final.wv
    known = [keyed[tok] for tok in word_tokenize(str(doc)) if tok in keyed]
    if known:
        return np.mean(known, axis=0)
    return np.zeros(w2v_final.vector_size)

In [11]:
# Embed every train / test document with the final Word2Vec model and stack
# the pooled vectors into 2-D feature matrices; labels come from `sentiment_id`.
X_train = np.vstack([doc_vector(doc) for doc in train_texts])
X_test = np.vstack([doc_vector(doc) for doc in test_df["text"].astype(str)])
y_train, y_test = train_df["sentiment_id"], test_df["sentiment_id"]

In [17]:
# Rebuild the logistic-regression classifier from the best trial's settings
# and fit it on the pooled training embeddings.
penalty, solver = best["penalty_solver"].split("_")
clf_final = LogisticRegression(
    penalty=penalty,
    solver=solver,
    C=best["C"],
    class_weight="balanced",
    max_iter=10000,
    random_state=42,
)
clf_final.fit(X_train, y_train)

In [18]:
# Predict the test split and print per-class precision / recall / F1.
y_test_pred = clf_final.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        target_names=["Negative", "Neutral", "Positive"],
    )
)

              precision    recall  f1-score   support

    Negative       0.68      0.72      0.70       160
     Neutral       0.33      0.63      0.43       101
    Positive       0.98      0.84      0.90       739

    accuracy                           0.80      1000
   macro avg       0.66      0.73      0.68      1000
weighted avg       0.86      0.80      0.82      1000



In [20]:
# Persist the Word2Vec model next to the classifier: the classifier only
# accepts vectors produced by this exact embedding model, so the pickle on
# its own cannot be used to score new text.
w2v_final.save("lr_word2vec_experiment3_w2v.model")
joblib.dump(clf_final, "lr_word2vec_experiment3.pkl")

['lr_word2vec_experiment3.pkl']