In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import tree
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB




In [22]:
# Load the Play Store reviews from the comma-separated file and preview
# the first rows to check which columns are available.
data = pd.read_csv("data/playstore_reviews.csv", sep=",")
data.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [23]:
# Clean the frame in a single chain:
#   1. drop the package name column, which is not used as a feature;
#   2. normalise the review text (trim surrounding whitespace, lower-case);
#   3. discard rows where either the review text or the polarity is missing.
data = (
    data
    .drop(columns=["package_name"])
    .assign(review=lambda frame: frame["review"].str.strip().str.lower())
    .dropna(subset=["review", "polarity"])
)
data.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offlin...,0
1,"messenger issues ever since the last update, i...",0
2,profile any time my wife or anybody has more t...,0
3,the new features suck for those of us who don'...,0
4,forced reload on uploading pic on replying com...,0


In [24]:
# Features are the review text; the target is the polarity column.
X, y = data["review"], data["polarity"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the review text into TF-IDF features with English stop words removed.
# This is the only vectorizer configuration the models below are trained on.
vec_model = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and IDF weights from the training reviews only, then
# apply the same fitted transform to the test reviews.
# .toarray() converts the sparse matrices to dense arrays, because the same
# X_train / X_test are also fed to GaussianNB further down.
# NOTE: this cell overwrites X_train / X_test (text -> numeric arrays), so it
# cannot be re-run on its own; re-run the train/test split cell first.
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()


Since the dataset consists of words, let's try the MultinomialNB probability model first.

In [26]:
# Baseline: Multinomial Naive Bayes with default hyperparameters,
# trained on the vectorized training reviews.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [27]:
# Predict polarity for the held-out reviews and display the raw labels.
y_pred = model.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the Multinomial NB predictions against the held-out labels.
mnb_accuracy = accuracy_score(y_test, y_pred)
mnb_confusion = confusion_matrix(y_test, y_pred)
mnb_report = classification_report(y_test, y_pred)

print("Multinomial Accuracy:", mnb_accuracy)
print("Multinomial Confusion Matrix:\n", mnb_confusion)
print("Multinomial Classification Report:\n", mnb_report)

Multinomial Accuracy: 0.7988826815642458
Multinomial Confusion Matrix:
 [[124   2]
 [ 34  19]]
Multinomial Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.98      0.87       126
           1       0.90      0.36      0.51        53

    accuracy                           0.80       179
   macro avg       0.84      0.67      0.69       179
weighted avg       0.82      0.80      0.77       179



Let's try GaussianNB

In [29]:
# Second candidate: Gaussian Naive Bayes with default hyperparameters,
# trained on the same dense feature arrays.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [30]:
# Predict polarity for the held-out reviews with the Gaussian model.
y_pred = model.predict(X=X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0])

In [31]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the Gaussian NB predictions against the held-out labels.
gnb_accuracy = accuracy_score(y_test, y_pred)
gnb_confusion = confusion_matrix(y_test, y_pred)
gnb_report = classification_report(y_test, y_pred)

print("Gaussian Accuracy:", gnb_accuracy)
print("Gaussian Confusion Matrix:\n", gnb_confusion)
print("Gaussian Classification Report:\n", gnb_report)

Gaussian Accuracy: 0.8100558659217877
Gaussian Confusion Matrix:
 [[112  14]
 [ 20  33]]
Gaussian Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       126
           1       0.70      0.62      0.66        53

    accuracy                           0.81       179
   macro avg       0.78      0.76      0.76       179
weighted avg       0.81      0.81      0.81       179



Let's try Bernoulli NB

In [32]:
# Third candidate: Bernoulli Naive Bayes with default hyperparameters,
# trained on the same feature arrays.
model = BernoulliNB()
model.fit(X=X_train, y=y_train)

In [33]:
# Predict polarity for the held-out reviews with the Bernoulli model.
y_pred = model.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0])

In [34]:
# Evaluate the Bernoulli NB predictions against the held-out labels.
bnb_accuracy = accuracy_score(y_test, y_pred)
bnb_confusion = confusion_matrix(y_test, y_pred)
bnb_report = classification_report(y_test, y_pred)

print("Bernoulli Accuracy:", bnb_accuracy)
print("Bernoulli Confusion Matrix:\n", bnb_confusion)
print("Bernoulli Classification Report:\n", bnb_report)

Bernoulli Accuracy: 0.770949720670391
Bernoulli Confusion Matrix:
 [[117   9]
 [ 32  21]]
Bernoulli Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.93      0.85       126
           1       0.70      0.40      0.51        53

    accuracy                           0.77       179
   macro avg       0.74      0.66      0.68       179
weighted avg       0.76      0.77      0.75       179



Let's compare test accuracies: GaussianNB is the best so far (0.81), followed by MultinomialNB (0.80), with BernoulliNB last (0.77).

Let's optimize the Gaussian model, first with Grid Search and then with Randomized Search.

In [35]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Candidate values for GaussianNB's only hyperparameter tuned here.
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training data.
gnb_model = GaussianNB()
grid_search = GridSearchCV(gnb_model, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Report the winning setting and its mean cross-validation score.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

# Score the best estimator found by the search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'var_smoothing': 1e-09}
Best Cross-Validation Score: 0.7514133753570373
Test Accuracy: 0.8100558659217877


Grid Search did not improve the accuracy: it selected the value `var_smoothing=1e-09` and the test accuracy stayed at 0.81. Let's try Randomized Search.

In [36]:
from scipy.stats import uniform  # NOTE(review): not used in this cell

gnb_model = GaussianNB()

# Candidate pool for the randomized search: 100 log-spaced values of
# var_smoothing from 1e-12 to 1e-1.
param_grid = {
    'var_smoothing': np.logspace(-12, -1, num=100)
}

random_search = RandomizedSearchCV(
    estimator=gnb_model,
    param_distributions=param_grid,
    n_iter=30,  # Number of parameter settings sampled from the pool
    scoring='accuracy',  # Same metric as the grid search above
    cv=5,  # Number of cross-validation folds
    random_state=42,  # Fixed seed so the sampled settings are reproducible
    n_jobs=-1  # Use all processors
)

random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# Keep the estimator the search refit with the best parameters.
# NOTE(review): this model is not scored on the test set in this cell.
best_model = random_search.best_estimator_

Best parameters found:  {'var_smoothing': np.float64(0.005994842503189397)}
Best cross-validation accuracy:  0.7893036540923865


Grid Search determined hyperparameters that did not improve the model but also did not degrade the score. Let's proceed by re-training a model with explicitly chosen parameters instead: a MultinomialNB with `alpha=1.0` and `fit_prior=False`.

In [37]:
# Final candidate: Multinomial NB with additive smoothing alpha=1.0 and
# fit_prior=False, so class priors are not learned from the training labels.
model = MultinomialNB(alpha=1.0, fit_prior=False)

# A single fit is enough; calling fit again would only retrain from scratch
# on the same data and give the same model.
model.fit(X_train, y_train)

# Score on the held-out reviews; the accuracy is the cell's displayed output.
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.8268156424581006

The test accuracy rose to about 0.83, roughly 2 percentage points above the previous best result (GaussianNB, 0.81).

In [38]:
from pickle import dump

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling raises.
# NOTE(review): the file name mentions alpha "1-9176382", but the model above
# is built with alpha=1.0 — confirm which value is intended.
# NOTE(review): the fitted vec_model is not saved here; presumably it is needed
# to vectorize new text before this model can be reused — verify.
with open("models/naive_bayes_alpha_1-9176382_fit_prior_False_42.sav", "wb") as model_file:
    dump(model, model_file)