In [1]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Load preprocessed data
df = pd.read_csv("../data/processed/preprocessed_reviews.csv")
print("Dataset loaded. Total samples:", len(df))

# Guard against missing values: an empty cell in the CSV is read back by
# pandas as NaN, and TfidfVectorizer rejects NaN documents. Rows lacking
# either the review text or the label cannot be used for training.
n_loaded = len(df)
df = df.dropna(subset=["clean_review", "sentiment"])
if len(df) != n_loaded:
    print(f"Dropped {n_loaded - len(df)} rows with missing text or label.")

# Hold out a stratified 20% test split; the fixed seed keeps the split repeatable.
X = df["clean_review"]
y = df["sentiment"]

split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"\nTrain size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

# TF-IDF features over unigrams and bigrams, vocabulary capped at 5000 terms.
# The vectorizer is fitted on the training split only; the test split is
# merely transformed with the already-learned vocabulary and weights.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression on the training split only (the held-out test
# split is reserved for the evaluation that follows).
# random_state is fixed because the 'saga' solver shuffles the data while
# optimising; without a seed the fitted coefficients, and therefore the
# reported metrics, can differ from one run to the next.
# NOTE(review): the recorded run shows a roughly 1:7 negative/positive
# imbalance and low recall on the negative class; consider
# class_weight='balanced'. Left unchanged here because it alters the
# model's decision behaviour.
model = LogisticRegression(max_iter=500, solver='saga', random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test split: compute each metric once, then report.
y_pred = model.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("\nAccuracy:", test_accuracy)
print("\nConfusion Matrix:\n", test_confusion)
print("\nClassification Report:\n", test_report)

# Save model & vectorizer
# Create the output directory first: joblib.dump does not create missing
# parent directories, so saving would fail on a fresh checkout.
# NOTE: despite the "_full" suffix in the file names, `model` was fitted on
# the training split, not on the entire dataset.
os.makedirs("../models", exist_ok=True)
joblib.dump(model, "../models/logreg_sentiment_model_full.pkl")
joblib.dump(vectorizer, "../models/tfidf_vectorizer_full.pkl")
print("\nFull model and vectorizer saved to ../models/")


Dataset loaded. Total samples: 19818

Train size: 15854, Test size: 3964

Accuracy: 0.9273461150353178

Confusion Matrix:
 [[ 226  248]
 [  40 3450]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.48      0.61       474
    positive       0.93      0.99      0.96      3490

    accuracy                           0.93      3964
   macro avg       0.89      0.73      0.79      3964
weighted avg       0.92      0.93      0.92      3964


Full model and vectorizer saved to ../models/
