In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import os

# Source CSV location and the directory that receives the processed artifacts
data_url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
data_dir = "/workspaces/sentiment-analysis-project/data"
os.makedirs(data_dir, exist_ok=True)

# Read the reviews, drop the package_name column, and normalise the review
# text (trim surrounding whitespace, lowercase) in one chained expression
df = (
    pd.read_csv(data_url)
    .drop(columns=["package_name"])
    .assign(review=lambda frame: frame["review"].str.strip().str.lower())
)

# Features are the review text; the target is the polarity column.
# 80/20 split with a fixed seed so the partition is reproducible.
X, y = df["review"], df["polarity"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Word-count features: the vocabulary is fitted on the training split only
# and then applied unchanged to the test split. Both matrices are densified.
vec_model = CountVectorizer(stop_words="english")
X_train_vec = vec_model.fit_transform(X_train).toarray()
X_test_vec = vec_model.transform(X_test).toarray()

# Persist every artifact under data_dir, one pickle file per entry
artifacts = {
    "X_train_vec.pkl": X_train_vec,
    "X_test_vec.pkl": X_test_vec,
    "y_train.pkl": y_train,
    "y_test.pkl": y_test,
    "vectorizer.pkl": vec_model,
}
for filename, artifact in artifacts.items():
    with open(os.path.join(data_dir, filename), "wb") as fh:
        pickle.dump(artifact, fh)

print("Dataset loaded, preprocessed, and saved.")

Dataset loaded, preprocessed, and saved.


In [2]:
import pickle
import os
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

# Define paths
data_dir = "/workspaces/sentiment-analysis-project/data"
model_dir = "/workspaces/sentiment-analysis-project/models"
os.makedirs(model_dir, exist_ok=True)


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use this on files this
    project wrote itself, never on pickles from an untrusted source.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


def _evaluate_model(name, model, X, y_true):
    """Predict with a fitted ``model``, print its accuracy and classification
    report labelled with ``name``, and return ``(accuracy, predictions)``."""
    y_pred = model.predict(X)
    accuracy = accuracy_score(y_true, y_pred)
    print(f"{name} Accuracy:", accuracy)
    print(f"{name} Classification Report:\n", classification_report(y_true, y_pred))
    return accuracy, y_pred


# Load processed data
X_train = _load_pickle(os.path.join(data_dir, "X_train_vec.pkl"))
X_test = _load_pickle(os.path.join(data_dir, "X_test_vec.pkl"))
y_train = _load_pickle(os.path.join(data_dir, "y_train.pkl"))
y_test = _load_pickle(os.path.join(data_dir, "y_test.pkl"))

# Train and evaluate GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_accuracy, y_pred_gnb = _evaluate_model("GaussianNB", gnb, X_test, y_test)

# Train and evaluate MultinomialNB with hyperparameter tuning
# (5-fold cross-validated search over the smoothing parameter alpha)
mnb = MultinomialNB()
param_grid = {"alpha": [0.1, 0.5, 1.0, 2.0, 5.0]}
grid_search = GridSearchCV(mnb, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_mnb = grid_search.best_estimator_
print("MultinomialNB Best Parameters:", grid_search.best_params_)
mnb_accuracy, y_pred_mnb = _evaluate_model("MultinomialNB", best_mnb, X_test, y_test)

# Train and evaluate BernoulliNB
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb_accuracy, y_pred_bnb = _evaluate_model("BernoulliNB", bnb, X_test, y_test)

# Save performance metrics
metrics = {
    "GaussianNB": gnb_accuracy,
    "MultinomialNB": mnb_accuracy,
    "BernoulliNB": bnb_accuracy
}
with open(os.path.join(data_dir, "nb_metrics.pkl"), "wb") as f:
    pickle.dump(metrics, f)

# Save the best Naive Bayes model, chosen from the measured accuracies rather
# than assumed up front. MultinomialNB is listed first so it wins ties, which
# keeps the previous default whenever it is (joint) best.
# Caveat: the choice is made on test-set accuracy, so the winner's reported
# test score is an optimistic estimate of its generalisation performance.
fitted_models = {
    "MultinomialNB": best_mnb,
    "GaussianNB": gnb,
    "BernoulliNB": bnb,
}
best_model_name = max(fitted_models, key=lambda model_name: metrics[model_name])
best_model = fitted_models[best_model_name]
with open(os.path.join(model_dir, "best_nb_model.pkl"), "wb") as f:
    pickle.dump(best_model, f)

print("Best Naive Bayes model saved.")

GaussianNB Accuracy: 0.8044692737430168
GaussianNB Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.86       126
           1       0.69      0.62      0.65        53

    accuracy                           0.80       179
   macro avg       0.77      0.75      0.76       179
weighted avg       0.80      0.80      0.80       179

MultinomialNB Best Parameters: {'alpha': 0.5}
MultinomialNB Accuracy: 0.8268156424581006
MultinomialNB Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88       126
           1       0.73      0.66      0.69        53

    accuracy                           0.83       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.82      0.83      0.82       179

BernoulliNB Accuracy: 0.770949720670391
BernoulliNB Classification Report:
               precision    recall  f1-score   support

           0 