# Naive Bayes Project

In [216]:
!pip install -r "/workspaces/Naive-Bayes-Project/requirements.txt"

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [217]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV

from pickle import dump




In [218]:
url = "https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv"
df = pd.read_csv(url)

In [219]:
df.head()

Unnamed: 0,package_name,review,polarity
0,com.facebook.katana,privacy at least put some option appear offli...,0
1,com.facebook.katana,"messenger issues ever since the last update, ...",0
2,com.facebook.katana,profile any time my wife or anybody has more ...,0
3,com.facebook.katana,the new features suck for those of us who don...,0
4,com.facebook.katana,forced reload on uploading pic on replying co...,0


In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   package_name  891 non-null    object
 1   review        891 non-null    object
 2   polarity      891 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [221]:
df.drop("package_name",axis=1, inplace=True)

In [222]:
df.head()

Unnamed: 0,review,polarity
0,privacy at least put some option appear offli...,0
1,"messenger issues ever since the last update, ...",0
2,profile any time my wife or anybody has more ...,0
3,the new features suck for those of us who don...,0
4,forced reload on uploading pic on replying co...,0


In [223]:
df["review"]=df["review"].str.strip().str.lower()


In [224]:
# We divide the dataset into training and test samples
X = df["review"] #input is review and output polarity as we are outputting a sentiment +/- based on the review.
y = df["polarity"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

X_train.head()

331    just did the latest update on viber and yet ag...
733    keeps crashing it only works well in extreme d...
382    the fail boat has arrived the 6.0 version is t...
704    superfast, just as i remember it ! opera mini ...
813    installed and immediately deleted this crap i ...
Name: review, dtype: object

In [225]:
X_train.shape

(712,)

In [226]:
vec_model = CountVectorizer(stop_words = "english")
X_train = vec_model.fit_transform(X_train).toarray()
X_test = vec_model.transform(X_test).toarray()

In [227]:
X_train.shape

(712, 3310)

In [228]:
model = BernoulliNB()
model.fit(X_train, y_train)

In [229]:
y_train_pred = model.predict(X_train)
print(list(y_train_pred))

[np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(1), np.int64(1), np.int64(0), np.int64(1), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0)

In [230]:
y_pred = model.predict(X_test)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0]


In [231]:
print(f'The accuracy score of the train set is: {accuracy_score(y_train, y_train_pred)}')
print(f'The accuracy score of the test set is: {accuracy_score(y_test, y_pred)}')


The accuracy score of the train set is: 0.9199438202247191
The accuracy score of the test set is: 0.770949720670391


In [232]:
print('The Training F1 Score is', f1_score(y_train, y_train_pred))
print('The Testing F1 Score is', f1_score(y_test, y_pred))

The Training F1 Score is 0.8758169934640523
The Testing F1 Score is 0.5060240963855421


Model hyperparameterization

In [233]:
parameters = {"alpha":[0.01, 0.1, 1, 10]
              }

In [234]:
# Create a GridSearchCV object
grid_search = GridSearchCV(model, parameters, cv=5)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
best_BNB = grid_search.best_estimator_

In [235]:
best_BNB

In [236]:
y_pred = best_BNB.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [237]:

print(f'The accuracy score of the train set is: {accuracy_score(y_train, y_train_pred)}')
print(f'The accuracy score of the train set is: {accuracy_score(y_test, y_pred)}')

The accuracy score of the train set is: 0.9199438202247191
The accuracy score of the train set is: 0.8379888268156425


In [238]:
print('The Training F1 Score is', f1_score(y_train, y_train_pred))
print('The Testing F1 Score is', f1_score(y_test, y_pred))

The Training F1 Score is 0.8758169934640523
The Testing F1 Score is 0.7128712871287128


Gaussian Test

In [239]:
GNB = GaussianNB()
GNB.fit(X_train, y_train)

In [240]:
y_train_pred_GNB = GNB.predict(X_train)

In [241]:
y_pred_GNB = GNB.predict(X_test)
print(y_pred_GNB)

[0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0
 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 0]


In [242]:
print(f'The accuracy score of the train set is: {accuracy_score(y_train, y_train_pred_GNB)}')
print(f'The accuracy score of the test set is: {accuracy_score(y_test, y_pred_GNB)}')

The accuracy score of the train set is: 0.9859550561797753
The accuracy score of the test set is: 0.8044692737430168


In [243]:
print('The Training F1 Score is', f1_score(y_train, y_train_pred_GNB))
print('The Testing F1 Score is', f1_score(y_test, y_pred_GNB))

The Training F1 Score is 0.9806949806949807
The Testing F1 Score is 0.6534653465346535


MultinomialNB Test

In [244]:
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

In [245]:
y_pred = MNB.predict(X_test)
y_pred

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [246]:
print(f'The accuracy score of the train set is: {accuracy_score(y_train, y_train_pred)}')
print(f'The accuracy score of the train set is: {accuracy_score(y_test, y_pred)}')

The accuracy score of the train set is: 0.9199438202247191
The accuracy score of the train set is: 0.8156424581005587


In [247]:
print('The Training F1 Score is', f1_score(y_train, y_train_pred))
print('The Testing F1 Score is', f1_score(y_test, y_pred))

The Training F1 Score is 0.8758169934640523
The Testing F1 Score is 0.6597938144329897


In [248]:
params={"alpha":[0.01, 0.1, 1, 10, 100, 200, 500, 1000],
        "fit_prior":[True,False]
              }

In [249]:
# Create a GridSearchCV object
grid_search = GridSearchCV(MNB, params, cv=5)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
best_MNB = grid_search.best_estimator_

In [250]:
y_pred = best_MNB.predict(X_test)
y_pred

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])

In [251]:
print(f'The accuracy score of the train set is: {accuracy_score(y_train, y_train_pred)}')
print(f'The accuracy score of the train set is: {accuracy_score(y_test, y_pred)}')

The accuracy score of the train set is: 0.9199438202247191
The accuracy score of the train set is: 0.8212290502793296


In [252]:
print('The Training F1 Score is', f1_score(y_train, y_train_pred))
print('The Testing F1 Score is', f1_score(y_test, y_pred))

The Training F1 Score is 0.8758169934640523
The Testing F1 Score is 0.673469387755102


In [255]:
dump(best_MNB, open("naive_bayes_alpha_1-9176382_fit_prior_False_42.sav", "wb"))
