In [1]:
# Load saved preprocessed CSVs
import pandas as pd

df_train = pd.read_csv('data/processed/preprocessed_train.csv')
df_test  = pd.read_csv('data/processed/preprocessed_test.csv')

# Recreate X / y variables used by models
X_train = df_train['clean_reviewText']   # or 'reviewText' if you prefer raw
y_train = df_train['sentiment']

X_test  = df_test['clean_reviewText']
y_test  = df_test['sentiment']

In [10]:
import pandas as pd
import time
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# Try to import a local preprocessing module (if present) using a dynamic import.
# Using importlib.util.find_spec avoids a static "Import could not be resolved" lint error.
import importlib
import importlib.util

preprocessing_pipeline = None
spec = importlib.util.find_spec('pre_processing')
if spec is not None:
    try:
        module = importlib.import_module('pre_processing')
        preprocessing_pipeline = getattr(module, 'preprocessing_pipeline', None)
    except Exception:
        preprocessing_pipeline = None

# Fall back to inline minimal TF-IDF if pre_processing not available
if preprocessing_pipeline is None:
    from sklearn.feature_extraction.text import TfidfVectorizer
    preprocessing_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2)))
    ])

# X_train, X_test, y_train, y_test
# preprocessing_pipeline (The TextCleaner + TfidfVectorizer)

# 1. Define the Candidate Models
# We extend the preprocessing pipeline by adding a classifier step
models = {
    "Naive Bayes": Pipeline([
        ('preprocessor', preprocessing_pipeline),
        ('classifier', MultinomialNB())
    ]),

    "Logistic Regression": Pipeline([
        ('preprocessor', preprocessing_pipeline),
        ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
    ])
}

# 2. The Training Loop
results = {}

print("--- Starting Model Competition ---")

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()

    # Train (Fit)
    # The pipeline runs Cleaner -> TF-IDF -> Model automatically
    model.fit(X_train, y_train)

    train_time = time.time() - start_time
    print(f"Training completed in {train_time:.2f} seconds.")

    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"{name} Accuracy: {accuracy:.4f}")
    # We print the report to see Precision/Recall for the Minority Class (0)
    print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

print("\n--- Competition Finished ---")
print("Winner:", max(results, key=results.get))

--- Starting Model Competition ---

Training Naive Bayes...
Training completed in 0.26 seconds.
Naive Bayes Accuracy: 0.9631
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        43
    Positive       0.96      1.00      0.98      1121

    accuracy                           0.96      1164
   macro avg       0.48      0.50      0.49      1164
weighted avg       0.93      0.96      0.94      1164


Training Logistic Regression...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training completed in 0.37 seconds.
Logistic Regression Accuracy: 0.9570
              precision    recall  f1-score   support

    Negative       0.44      0.60      0.51        43
    Positive       0.98      0.97      0.98      1121

    accuracy                           0.96      1164
   macro avg       0.71      0.79      0.74      1164
weighted avg       0.96      0.96      0.96      1164


--- Competition Finished ---
Winner: Naive Bayes


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


In [11]:
import joblib

# 1. Select best model
final_model = models["Logistic Regression"]

# 2. Save to disk
# This creates a binary file containing the logic, vocabulary, and weights
joblib.dump(final_model, 'sentiment_model.pkl')

print("Model saved as 'sentiment_model.pkl'")

# 3. Verification
# Let's try to load it back and predict on a fake review to be sure
loaded_model = joblib.load('sentiment_model.pkl')
test_review = ["The product arrived broken and the support was rude."]
prediction = loaded_model.predict(test_review)
probability = loaded_model.predict_proba(test_review)

print(f"Review: {test_review[0]}")
print(f"Sentiment: {'Positive' if prediction[0] == 1 else 'Negative'}")
print(f"Confidence: {max(probability[0]):.2f}")

Model saved as 'sentiment_model.pkl'
Review: The product arrived broken and the support was rude.
Sentiment: Negative
Confidence: 0.79
