<a href="https://www.kaggle.com/code/larsmagnusson/itf31519-naive-bayes-hyperparameter-tuning?scriptVersionId=106340166" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playtennis3/dataset.csv
/kaggle/input/uci-wine/wine.data


In [2]:
# Load the play tennis dataset
play_tennis = pd.read_csv('/kaggle/input/playtennis3/dataset.csv')

# Convert the entire dataframe to categorical type
play_tennis = play_tennis.astype('category')

# Show the first instances
play_tennis.tail()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play?
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes


In [3]:
from sklearn.preprocessing import OrdinalEncoder

# Encode dataset using OrdinalEncoder (converts to integers). This is done 
# on the entire dataset to ensure that all classes are seen
play_tennis_encoded = pd.DataFrame(OrdinalEncoder().fit_transform(play_tennis))

# Split into training and test 
pt_train = play_tennis_encoded.iloc[0:-1]
pt_test = play_tennis_encoded.iloc[-1:]

# Split features from targets for easy access
pt_train_features = pt_train.iloc[:,0:4]
pt_train_targets = pt_train.iloc[:,4]
pt_test_features = pt_test.iloc[:,0:4]
pt_test_targets = pt_test.iloc[:,4]

pt_train_features.head()

Unnamed: 0,0,1,2,3
0,2.0,1.0,0.0,1.0
1,2.0,1.0,0.0,0.0
2,0.0,1.0,0.0,1.0
3,1.0,0.0,1.0,1.0
4,1.0,2.0,0.0,1.0


In [4]:
from sklearn.naive_bayes import CategoricalNB

# Create a naive classifier for categorical data and fit it to our 
# training data
categorical_nb = CategoricalNB()
categorical_nb.fit(pt_train_features, pt_train_targets)

# Score the classifier on our test data
(categorical_nb.score(pt_train_features, pt_train_targets), categorical_nb.score(pt_test_features, pt_test_targets))

(0.9, 0.0)

In [5]:
# Load the UCI wine dataset. 
uci_wine = pd.read_csv('/kaggle/input/uci-wine/wine.data',header=None)
uci_wine[0] = uci_wine[0].astype('category')

# Perform stratified sampling 80/20 split
uci_wine_train = uci_wine.groupby(0, group_keys=False).apply(lambda group: group.sample(frac=0.8))
uci_wine_test = uci_wine.drop(uci_wine_train.index)

# Split training data into training and validation for non cross-validated evaluation
uci_wine_val = uci_wine_train.groupby(0, group_keys=False).apply(lambda group: group.sample(frac=0.25))
uci_wine_train_ = uci_wine_train.drop(uci_wine_val.index)

# Split into features and ground thruths/target values
uw_train_targets = uci_wine_train[0]
uw_train_features = uci_wine_train.drop(0, axis=1)
uw_test_targets = uci_wine_test[0]
uw_test_features = uci_wine_test.drop(0, axis=1)

In [6]:
from sklearn.naive_bayes import GaussianNB

# Create Gaussian naive Bayes classifier and train it using the training data
gaussian_nb = GaussianNB()
gaussian_nb.fit(uw_train_features, uw_train_targets)

# Get the scores for both training and test data
(gaussian_nb.score(uw_train_features, uw_train_targets), gaussian_nb.score(uw_test_features, uw_test_targets))

(0.9859154929577465, 1.0)

In [7]:
correlation = uw_train_features.corr()
correlation.style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
1,1.0,0.100448,0.169458,-0.356089,0.308653,0.338907,0.301617,-0.226535,0.274731,0.508133,-0.049843,0.128912,0.642919
2,0.100448,1.0,0.138324,0.257923,-0.00317,-0.337405,-0.389181,0.276577,-0.184976,0.265143,-0.581048,-0.32382,-0.174606
3,0.169458,0.138324,1.0,0.422062,0.370725,0.138065,0.137774,0.149775,0.101977,0.241186,-0.050709,0.042503,0.220309
4,-0.356089,0.257923,0.422062,1.0,-0.118682,-0.390032,-0.377948,0.352725,-0.239679,-0.011557,-0.296752,-0.275086,-0.456064
5,0.308653,-0.00317,0.370725,-0.118682,1.0,0.21073,0.201074,-0.236016,0.212391,0.259371,-0.019452,0.039482,0.37837
6,0.338907,-0.337405,0.138065,-0.390032,0.21073,1.0,0.865457,-0.473185,0.638132,-0.048549,0.444486,0.710593,0.526616
7,0.301617,-0.389181,0.137774,-0.377948,0.201074,0.865457,1.0,-0.554712,0.713628,-0.136565,0.545809,0.797497,0.515645
8,-0.226535,0.276577,0.149775,0.352725,-0.236016,-0.473185,-0.554712,1.0,-0.392152,0.082086,-0.253199,-0.497073,-0.334439
9,0.274731,-0.184976,0.101977,-0.239679,0.212391,0.638132,0.713628,-0.392152,1.0,0.013461,0.341743,0.565588,0.431889
10,0.508133,0.265143,0.241186,-0.011557,0.259371,-0.048549,-0.136565,0.082086,0.013461,1.0,-0.52569,-0.401663,0.325333


In [8]:
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

# Naive Bayes expects independent features, so we could try Principle Component Analysis
# to try to improve our results

# Create the PCA object 
pca = PCA()
# Fit PCA object to training data and return the components (new features)
transformed_train_features = pca.fit_transform(uw_train_features, uw_train_targets)

# Create and fit a second Gaussian classifier to the transformed data
gaussian_nb2 = GaussianNB()
gaussian_nb2.fit(transformed_train_features, uw_train_targets)

# Tranform the test data and get the scores for both training and test data
transformed_test_features = pca.transform(uw_test_features)
(gaussian_nb2.score(transformed_train_features,uw_train_targets), gaussian_nb2.score(transformed_test_features,uw_test_targets))


(0.9859154929577465, 0.9444444444444444)

In [9]:
from sklearn.pipeline import make_pipeline

# The second classifier can be created much simpler using a Pipeline
pca_gaussian_nb = make_pipeline(PCA(),GaussianNB())
pca_gaussian_nb.fit(uw_train_features, uw_train_targets)

# Get the scores for both training and test data
(pca_gaussian_nb.score(uw_train_features, uw_train_targets), pca_gaussian_nb.score(uw_test_features, uw_test_targets))

(0.9859154929577465, 0.9444444444444444)

In [10]:
uw_train_targets_ = uci_wine_train_[0]
uw_train_features_ = uci_wine_train_.drop(0, axis=1)

uw_val_targets = uci_wine_val[0]
uw_val_features = uci_wine_val.drop(0, axis=1)

nb1 = GaussianNB(var_smoothing=1e-15)
nb2 = GaussianNB(var_smoothing=1e-9)
nb3 = GaussianNB(var_smoothing=1e-4)
nb4 = GaussianNB(var_smoothing=1e-2)

nb1.fit(uw_train_features_, uw_train_targets_)
nb2.fit(uw_train_features_, uw_train_targets_)
nb3.fit(uw_train_features_, uw_train_targets_)
nb4.fit(uw_train_features_, uw_train_targets_)

(nb1.score(uw_val_features, uw_val_targets), nb2.score(uw_val_features, uw_val_targets), nb3.score(uw_val_features, uw_val_targets), nb4.score(uw_val_features, uw_val_targets))

final_nb = GaussianNB(var_smoothing=1e-9)
final_nb.fit(uw_train_features, uw_train_targets)

final_nb.score(uw_test_features, uw_test_targets)

# Then tested on final test data

1.0

In [11]:
from sklearn.model_selection import cross_val_score

# Test three different var_smoothing parameter values with 5-fold cross validation
scores1 = cross_val_score(GaussianNB(var_smoothing=1e-9), uw_train_features, uw_train_targets, cv=5)
scores2 = cross_val_score(GaussianNB(var_smoothing=1e-6), uw_train_features, uw_train_targets, cv=5)
scores3 = cross_val_score(GaussianNB(var_smoothing=1e-4), uw_train_features, uw_train_targets, cv=5)

# Show the average scores and the standard deviation of the three models
(scores1.mean(), scores1.std(), scores2.mean(), scores2.std(), scores3.mean(), scores3.std())

# var_smoothing=1e-9 offers the best average performance

# we finish with a new training on the entire training set with the best parameter value

(0.9862068965517242,
 0.027586206896551738,
 0.9650246305418719,
 0.04401637025357374,
 0.9091133004926109,
 0.08925776698673663)