## Naive bayes with GridSearchCV

In [75]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [76]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [77]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [78]:
# check dataset balance or imbalance
dataset["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [79]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [80]:
y_train = y_train.values.ravel()

## MultinomialNB

In [81]:
# model creation + grid cv  MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB 

param_grid ={
"alpha" :[0.1, 0.5, 1.0, 2.0],
"fit_prior": [True, False]
}
model = MultinomialNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
grid_prediction:  [0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1
 0 0 0 1 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1 0
 1 1 0 0 0 1]
cm:  [[37 21]
 [13  9]]
clf_report:                precision    recall  f1-score   support

           0       0.74      0.64      0.69        58
           1       0.30      0.41      0.35        22

    accuracy                           0.57        80
   macro avg       0.52      0.52      0.52        80
weighted avg       0.62      0.57      0.59        80

roc_auc_score:  0.48824451410658304
f1_score:  0.5919515669515669
grid_results:  {'mean_fit_time': array([0.03456964, 0.01253185, 0.01661758, 0.01292815, 0.01345854,
       0.01409421, 0.01167436, 0.0177598 ]), 'std_fit_time': array([0.02931945, 0.00266501, 0.0070475 , 0.00610893, 0.00338532,
       0.00840889, 0.00314421, 0.00736478]), 'mean_score_time': array([0.02241397, 0.01589155, 0.013

## CategoricalNB

In [82]:
# model creation + grid cv  CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB 

param_grid ={
"alpha" :[0.1, 0.5, 1.0],
"min_categories" :[None, 2, 5],

}
model = CategoricalNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




grid_prediction:  [0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 1 1 0 0 1 1 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 1
 0 0 0 0 1 1]
cm:  [[52  6]
 [ 4 18]]
clf_report:                precision    recall  f1-score   support

           0       0.93      0.90      0.91        58
           1       0.75      0.82      0.78        22

    accuracy                           0.88        80
   macro avg       0.84      0.86      0.85        80
weighted avg       0.88      0.88      0.88        80

roc_auc_score:  0.9149686520376177
f1_score:  0.8766209000762777
grid_results:  {'mean_fit_time': array([0.05582461, 0.05830116, 0.07547822, 0.06779914, 0.05148211,
       0.06952777, 0.07338476, 0.0676115 , 0.07259007]), 'std_fit_time': array([0.00955131, 0.0143145 , 0.01084725, 0.00964969, 0.00439893,
       0.00581673, 0.01088232, 0.01526651, 0.03315617]), 'mean_score_time': array([0.03284707, 0.02245874, 0.01791182, 0.01933918, 0.01774507,
      

## ComplementNB

In [83]:
# model creation + grid cv  ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB 

param_grid ={
"alpha" : [0.1, 0.5, 1.0],
"norm" : [True, False]
}
model = ComplementNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
grid_prediction:  [1 0 0 1 0 0 0 1 1 0 0 1 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 1 1
 0 0 0 1 0 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 0 1 0 0 1 1 1 1 0
 1 1 1 0 0 1]
cm:  [[27 31]
 [10 12]]
clf_report:                precision    recall  f1-score   support

           0       0.73      0.47      0.57        58
           1       0.28      0.55      0.37        22

    accuracy                           0.49        80
   macro avg       0.50      0.51      0.47        80
weighted avg       0.61      0.49      0.51        80

roc_auc_score:  0.48824451410658304
f1_score:  0.5136437246963562
grid_results:  {'mean_fit_time': array([0.02371101, 0.01645703, 0.01730247, 0.01908183, 0.01740894,
       0.02121897]), 'std_fit_time': array([0.01701898, 0.01197934, 0.0130757 , 0.00637019, 0.00539416,
       0.00647434]), 'mean_score_time': array([0.01411495, 0.0151792 , 0.02204318, 0.01913166, 0.01757336,
       0.02159057

In [86]:
# standardization 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## BernoulliNB

In [87]:
# model creation + grid cv  BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB 

param_grid ={
"alpha" : [0.1, 0.5, 1.0],
"binarize" : [0.0, 0.5, 1.0]
}
model = BernoulliNB()

grid_model_BernoulliNB = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model_BernoulliNB.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model_BernoulliNB.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model_BernoulliNB.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model_BernoulliNB.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model_BernoulliNB.best_params_)
print("grid_model.best_estimator_:  ",grid_model_BernoulliNB.best_estimator_)
print("grid_model.best_score_  :  ", grid_model_BernoulliNB.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
grid_prediction:  [0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0
 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 1
 1 0 0 0 1 1]
cm:  [[51  7]
 [ 1 21]]
clf_report:                precision    recall  f1-score   support

           0       0.98      0.88      0.93        58
           1       0.75      0.95      0.84        22

    accuracy                           0.90        80
   macro avg       0.87      0.92      0.88        80
weighted avg       0.92      0.90      0.90        80

roc_auc_score:  0.9502351097178684
f1_score:  0.9032727272727273
grid_results:  {'mean_fit_time': array([0.17053394, 0.00854344, 0.00589814, 0.00583034, 0.00820489,
       0.005756  , 0.00635471, 0.00627542, 0.00763063]), 'std_fit_time': array([0.19417104, 0.00337104, 0.00198769, 0.00095278, 0.00247264,
       0.00132182, 0.00249179, 0.00174821, 0.00261252]), 'mean_score_time': array([0.0125

## GaussianNB

In [40]:
# model creation + grid cv  GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB 

param_grid ={
"var_smoothing" : [1e-9, 1e-8, 1e-7, 1e-6]
}
model = GaussianNB()

grid_model_GaussianNB = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model_GaussianNB.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model_GaussianNB.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model_GaussianNB.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model_GaussianNB.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model_GaussianNB.best_params_)
print("grid_model.best_estimator_:  ",grid_model_GaussianNB.best_estimator_)
print("grid_model.best_score_  :  ", grid_model_GaussianNB.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
grid_prediction:  [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 1 1]
cm:  [[56  2]
 [ 4 18]]
clf_report:                precision    recall  f1-score   support

           0       0.93      0.97      0.95        58
           1       0.90      0.82      0.86        22

    accuracy                           0.93        80
   macro avg       0.92      0.89      0.90        80
weighted avg       0.92      0.93      0.92        80

roc_auc_score:  0.9827586206896552
f1_score:  0.9238498789346247
grid_results:  {'mean_fit_time': array([0.00416136, 0.00517302, 0.0050528 , 0.00471735]), 'std_fit_time': array([0.00091522, 0.00134381, 0.00177974, 0.00129401]), 'mean_score_time': array([0.00921645, 0.01020894, 0.00851207, 0.00962462]), 'std_score_time': array([0.00121252, 0.00199239, 0.00079427, 0.00119536]), 'param_var_smoo

## model prediction with real time data

In [98]:
## user input 
Gender=int(input("Enter your gender (0-female/1-male): "))
Age	= int(input("Enter your age: "))
EstimatedSalary = int(input("Enter your estimated salary: "))


""" test input data's
Male	19	19000	- 0
Female	46	41000	- 1
Female	50	20000	- 1
Male	36	33000	- 0
"""

Enter your gender (0-female/1-male):  0
Enter your age:  50
Enter your estimated salary:  20000


" test input data's\nMale\t19\t19000\t- 0\nFemale\t46\t41000\t- 1\nFemale\t50\t20000\t- 1\nMale\t36\t33000\t- 0\n"

In [99]:
## user input predction 
user_input_prediction = grid_model_GaussianNB.predict([[Gender,Age,EstimatedSalary ]])
if user_input_prediction==1:
    print("user will purchase")
else:
    print("user will not purchase")

print(user_input_prediction)

user will not purchase
[0]
