## Naive bayes with GridSearchCV

In [1]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [2]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [3]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [4]:
# check dataset balance or imbalance
dataset["Purchased"].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

In [5]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [6]:
# standardization 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [11]:
x_train

array([[ 1.92295008e+00,  2.14601566e+00,  1.02532046e+00],
       [ 2.02016082e+00,  3.78719297e-01, -9.75304830e-01],
       [-1.38221530e+00, -4.32498705e-01, -9.75304830e-01],
       [-1.18779381e+00, -1.01194013e+00, -9.75304830e-01],
       [ 1.92295008e+00, -9.25023920e-01, -9.75304830e-01],
       [ 3.67578135e-01,  2.91803083e-01, -9.75304830e-01],
       [ 1.73156642e-01,  1.46942725e-01, -9.75304830e-01],
       [ 2.02016082e+00,  1.74040666e+00,  1.02532046e+00],
       [ 7.56421121e-01, -8.38107706e-01, -9.75304830e-01],
       [ 2.70367388e-01, -2.87638347e-01, -9.75304830e-01],
       [ 3.67578135e-01, -1.71750061e-01,  1.02532046e+00],
       [-1.18475597e-01,  2.20395980e+00, -9.75304830e-01],
       [-1.47942605e+00, -6.35303205e-01, -9.75304830e-01],
       [-1.28500455e+00, -1.06988428e+00,  1.02532046e+00],
       [-1.38221530e+00,  4.07691369e-01,  1.02532046e+00],
       [-1.09058306e+00,  7.55356227e-01, -9.75304830e-01],
       [-1.47942605e+00, -2.00722133e-01

In [7]:
y_train = y_train.values.ravel()

## BernoulliNB

In [8]:
# model creation + grid cv  BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB 

param_grid ={
"alpha" : [0.1, 0.5, 1.0],
"binarize" : [0.0, 0.5, 1.0]
}
model = BernoulliNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
grid_prediction:  [0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0
 0 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 0 1
 1 0 0 0 1 1]
cm:  [[51  7]
 [ 1 21]]
clf_report:                precision    recall  f1-score   support

           0       0.98      0.88      0.93        58
           1       0.75      0.95      0.84        22

    accuracy                           0.90        80
   macro avg       0.87      0.92      0.88        80
weighted avg       0.92      0.90      0.90        80

roc_auc_score:  0.9502351097178684
f1_score:  0.9032727272727273
grid_results:  {'mean_fit_time': array([0.35561304, 0.00619502, 0.00562644, 0.00688944, 0.00661182,
       0.00552511, 0.00674129, 0.00503969, 0.00540166]), 'std_fit_time': array([0.42831609, 0.00265882, 0.00077393, 0.0020471 , 0.00206112,
       0.00118378, 0.00179541, 0.00079856, 0.00120046]), 'mean_score_time': array([0.0124

## GaussianNB

In [9]:
# model creation + grid cv  GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB 

param_grid ={
"var_smoothing" : [1e-9, 1e-8, 1e-7, 1e-6]
}
model = GaussianNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
grid_prediction:  [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 1 1]
cm:  [[56  2]
 [ 4 18]]
clf_report:                precision    recall  f1-score   support

           0       0.93      0.97      0.95        58
           1       0.90      0.82      0.86        22

    accuracy                           0.93        80
   macro avg       0.92      0.89      0.90        80
weighted avg       0.92      0.93      0.92        80

roc_auc_score:  0.9827586206896552
f1_score:  0.9238498789346247
grid_results:  {'mean_fit_time': array([0.0035183 , 0.00913148, 0.0044127 , 0.00540533]), 'std_fit_time': array([0.00026552, 0.00865911, 0.00120049, 0.00223086]), 'mean_score_time': array([0.01142879, 0.01226616, 0.01213589, 0.01030736]), 'std_score_time': array([0.0028372 , 0.00268854, 0.00078005, 0.00113716]), 'param_var_smoo

## MultinomialNB

In [10]:
# model creation + grid cv  MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB 

param_grid ={
"alpha" :[0.1, 0.5, 1.0, 2.0],
"fit_prior": [True, False]
}
model = MultinomialNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "E:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 762, in fit
    self._count(X, Y)
    ~~~~~~~~~~~^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 889, in _count
    check_non_negative(X, "MultinomialNB (input X)")
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1827, in check_non_negative
    raise ValueError(f"Negative values in data passed to {whom}.")
ValueError: Negative values in data passed to MultinomialNB (input X).


## CategoricalNB

In [12]:
# model creation + grid cv  CategoricalNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import CategoricalNB 

param_grid ={
"alpha" :[0.1, 0.5, 1.0],
"min_categories" :[None, 2, 5],

}
model = CategoricalNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "E:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 1388, in fit
    return super().fit(X, y, sample_weight=sample_weight)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 735, in fit
    X, y = self._check_X_y(X, y)
           ~~~~~~~~~~~~~~~^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 1463, in _check_X_y
    check_non_negative(X, "CategoricalNB (input X)")
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1827, in check_non_negative
    raise ValueError(f"Negative values in data passed to {whom}.")
ValueError: Negative values in data passed to CategoricalNB (input X).


## ComplementNB

In [13]:
# model creation + grid cv  ComplementNB
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB 

param_grid ={
"alpha" : [0.1, 0.5, 1.0],
"norm" : [True, False]
}
model = ComplementNB()

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True, verbose=3  , n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)

# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
print("grid_prediction: ",grid_prediction)

# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print("cm: ", cm)
print("clf_report: ", clf_report)

## ROC AUC 
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
print("roc_auc_score: ", roc_auc_score)

from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
print("f1_score: ", f1_score)


# grid results 
grid_results = grid_model.cv_results_
print("grid_results: ", grid_results)

print("grid_model.best_params_ : ",grid_model.best_params_)
print("grid_model.best_estimator_:  ",grid_model.best_estimator_)
print("grid_model.best_score_  :  ", grid_model.best_score_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "E:\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 762, in fit
    self._count(X, Y)
    ~~~~~~~~~~~^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\naive_bayes.py", line 1037, in _count
    check_non_negative(X, "ComplementNB (input X)")
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1827, in check_non_negative
    raise ValueError(f"Negative values in data passed to {whom}.")
ValueError: Negative values in data passed to ComplementNB (input X).


## model prediction with real time data

In [32]:
## user input 
Gender=int(input("Enter your gender (0-female/1-male): "))
Age	= int(input("Enter your age: "))
EstimatedSalary = int(input("Enter your estimated salary: "))


""" test input data's
Male	19	19000	- 0
Female	46	41000	- 1
Female	50	20000	- 1
Male	36	33000	- 0
"""

Enter your gender (0-female/1-male):  1
Enter your age:  36
Enter your estimated salary:  33000


" test input data's\nMale\t19\t19000\t- 0\nFemale\t46\t41000\t- 1\nFemale\t50\t20000\t- 1\nMale\t36\t33000\t- 0\n"

In [34]:
## user input predction 
user_input_prediction = grid_model.predict([[Gender,Age,EstimatedSalary ]])
if user_input_prediction==1:
    print("user will purchase")
else:
    print("user will not purchase")

print(user_input_prediction)

user will purchase
[1]
