In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load processed data
df = pd.read_csv('E:/sentiment-analysis-project/data/processed/processed_reviews.csv')
print(f"üìä Loaded {len(df)} processed reviews")


üìä Loaded 50000 processed reviews


In [3]:
# Features and labels
X = df['processed_review']
y = df['sentiment']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"üìà Training set size: {len(X_train)}")
print(f"üìâ Test set size: {len(X_test)}")

# TF-IDF Vectorization
print("\nüîÑ Creating TF-IDF features...")
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), min_df=2, max_df=0.95)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"‚úÖ Feature matrix shape: {X_train_tfidf.shape}")
print(f"üìù Sample features: {tfidf.get_feature_names_out()[:10]}")


üìà Training set size: 40000
üìâ Test set size: 10000

üîÑ Creating TF-IDF features...
‚úÖ Feature matrix shape: (40000, 10000)
üìù Sample features: ['aaron' 'abandon' 'abandoned' 'abbott' 'abc' 'ability' 'able' 'able get'
 'able make' 'able see']


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Use only light models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB()
}

results = {}

for name, model in models.items():
    print(f"\nüîÑ Training {name}...")
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    test_acc = accuracy_score(y_test, y_pred)

    # Optional: light CV
    cv_scores = cross_val_score(model, X_train_tfidf, y_train, cv=3, scoring='accuracy')

    results[name] = {
        'CV Mean': cv_scores.mean(),
        'Test Accuracy': test_acc
    }

    print(f"‚úÖ {name} - CV Mean: {cv_scores.mean():.4f}, Test Accuracy: {test_acc:.4f}")

# Show results
results_df = pd.DataFrame(results).T
display(results_df)



üîÑ Training Logistic Regression...
‚úÖ Logistic Regression - CV Mean: 0.8871, Test Accuracy: 0.8924

üîÑ Training Naive Bayes...
‚úÖ Naive Bayes - CV Mean: 0.8638, Test Accuracy: 0.8665


Unnamed: 0,CV Mean,Test Accuracy
Logistic Regression,0.8871,0.8924
Naive Bayes,0.863775,0.8665


In [5]:
import joblib

# Save best model
joblib.dump(model, 'E:\sentiment-analysis-project/models/logistic_model.pkl')

# Save the TF-IDF vectorizer
joblib.dump(tfidf, 'E:\sentiment-analysis-project/models/tfidf_vectorizer.pkl')

print("‚úÖ Model and vectorizer saved to /models/")


  joblib.dump(model, 'E:\sentiment-analysis-project/models/logistic_model.pkl')
  joblib.dump(tfidf, 'E:\sentiment-analysis-project/models/tfidf_vectorizer.pkl')


‚úÖ Model and vectorizer saved to /models/
