In [None]:
# ===============================
# 05_model_training.ipynb
# ===============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import os

# ===============================
# Step 1: Load preprocessed data
# ===============================
df = pd.read_csv("../data/processed/preprocessed_reviews.csv")
print("Dataset loaded. Total samples:", len(df))

# Empty CSV fields are parsed by pandas as NaN. A NaN review text is not a
# valid document for the TF-IDF step and a NaN label cannot be stratified on,
# so rows missing either column are removed before anything else happens.
rows_before = len(df)
df = df.dropna(subset=['clean_review', 'sentiment'])
if len(df) < rows_before:
    print("Dropped rows with missing review/sentiment:", rows_before - len(df))

# Optional: Use a sample for faster training
# The sample is capped at 20000 rows (or the full dataset when it is smaller)
# and seeded so the same rows are drawn on every run.
df_sample = df.sample(n=min(20000, len(df)), random_state=42)
X = df_sample['clean_review']   # model input: cleaned review text
y = df_sample['sentiment']      # target label

# ===============================
# Step 2: Train-test split
# ===============================
# Hold out 20% of the sample for evaluation; stratifying on the labels keeps
# the class proportions the same in both partitions.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# ===============================
# Step 3: TF-IDF Vectorization
# ===============================
# Unigrams and bigrams, limited to the 2000 highest-ranked features.
tfidf_options = {"max_features": 2000, "ngram_range": (1, 2)}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

first_features = vectorizer.get_feature_names_out()[:20]
print("\nSample feature names:", first_features)

# ===============================
# Step 4: Train Logistic Regression with class balance
# ===============================
# Using class_weight='balanced' to give more importance to minority class.
# The 'saga' solver is stochastic, so it is seeded with the same value (42)
# used for sampling and splitting above; without a seed the fitted
# coefficients can differ from run to run.
model = LogisticRegression(
    max_iter=300,
    solver='saga',
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train_tfidf, y_train)

# ===============================
# Step 5: Evaluate model
# ===============================
# Score the held-out split and report overall accuracy, the confusion matrix,
# and the per-class precision/recall/F1 table.
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

# ===============================
# Step 6: Save model and vectorizer
# ===============================
# Create the output directory if needed, then persist the classifier and the
# fitted vectorizer side by side so both can be reloaded together.
os.makedirs("../models/", exist_ok=True)
for artifact, target_path in (
    (model, "../models/logreg_sentiment_model_balanced.pkl"),
    (vectorizer, "../models/tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, target_path)
print("\nModel and vectorizer saved to ../models/")


Dataset loaded. Total samples: 19818

Train size: 15854, Test size: 3964

Sample feature names: ['able' 'able wear' 'absolutely' 'absolutely beautiful'
 'absolutely gorgeous' 'absolutely love' 'accent' 'accessory' 'accurate'
 'across' 'across chest' 'actual' 'actually' 'add' 'added' 'adding'
 'addition' 'adjust' 'adjustable' 'adorable']

Accuracy: 0.9127144298688193

Confusion Matrix:
 [[ 426   48]
 [ 298 3192]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.59      0.90      0.71       474
    positive       0.99      0.91      0.95      3490

    accuracy                           0.91      3964
   macro avg       0.79      0.91      0.83      3964
weighted avg       0.94      0.91      0.92      3964


Model and vectorizer saved to ../models/
