<a href="https://www.kaggle.com/code/larsmagnusson/itf31519-naive-bayes-hyperparameter-tuning?scriptVersionId=106149989" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playtennis3/dataset.csv
/kaggle/input/uci-wine/wine.data


In [2]:
# Load the play tennis dataset
play_tennis = pd.read_csv('/kaggle/input/playtennis3/dataset.csv')

# Convert the entire dataframe to categorical type
play_tennis = play_tennis.astype('category')

# Show the first instances
play_tennis.tail()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play?
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes


In [3]:
from sklearn.preprocessing import OrdinalEncoder

# Encode dataset using OrdinalEncoder (converts to integers). This is done 
# on the entire dataset to ensure that all classes are seen
play_tennis_encoded = pd.DataFrame(OrdinalEncoder().fit_transform(play_tennis))

# Split into training and test 
pt_train = play_tennis_encoded.iloc[0:-1]
pt_test = play_tennis_encoded.iloc[-1:]

# Split features from targets for easy access
pt_train_features = pt_train.iloc[:,0:4]
pt_train_targets = pt_train.iloc[:,4]
pt_test_features = pt_test.iloc[:,0:4]
pt_test_targets = pt_test.iloc[:,4]

pt_test_features

Unnamed: 0,0,1,2,3
10,2.0,2.0,1.0,0.0


In [4]:
from sklearn.naive_bayes import CategoricalNB

# Create a naive classifier for categorical data and fit it to our 
# training data
categorical_nb = CategoricalNB()
categorical_nb.fit(pt_train_features, pt_train_targets)

# Score the classifier on our test data
(categorical_nb.score(pt_train_features, pt_train_targets), categorical_nb.score(pt_test_features, pt_test_targets))

(0.9, 0.0)

In [5]:
# Load the UCI wine dataset. 
uci_wine = pd.read_csv('/kaggle/input/uci-wine/wine.data',header=None)
uci_wine[0] = uci_wine[0].astype('category')

# Perform stratified sampling 80/20 split
uci_wine_train = uci_wine.groupby(0, group_keys=False).apply(lambda group: group.sample(frac=0.8))
uci_wine_test = uci_wine.drop(uci_wine_train.index)

# Split into features and ground thruths/target values
uw_train_targets = uci_wine_train[0]
uw_train_features = uci_wine_train.drop(0, axis=1)
uw_test_targets = uci_wine_test[0]
uw_test_features = uci_wine_test.drop(0, axis=1)

In [6]:
from sklearn.naive_bayes import GaussianNB

# Create Gaussian naive Bayes classifier and train it using the training data
gaussian_nb = GaussianNB()
gaussian_nb.fit(uw_train_features, uw_train_targets)

# Get the scores for both training and test data
(gaussian_nb.score(uw_train_features, uw_train_targets), gaussian_nb.score(uw_test_features, uw_test_targets))

(0.9859154929577465, 1.0)

In [7]:
correlation = uw_train_features.corr()
correlation.style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
1,1.0,0.13631,0.149731,-0.317138,0.245946,0.245012,0.211613,-0.132392,0.118965,0.523134,-0.068995,0.072658,0.640309
2,0.13631,1.0,0.182814,0.256963,-0.054539,-0.31523,-0.388262,0.264812,-0.209218,0.299212,-0.545039,-0.348151,-0.181562
3,0.149731,0.182814,1.0,0.490927,0.277862,0.106831,0.110521,0.223184,0.007547,0.227656,-0.082203,0.025495,0.171749
4,-0.317138,0.256963,0.490927,1.0,-0.058171,-0.27417,-0.322663,0.364135,-0.205917,0.074074,-0.250571,-0.267063,-0.394292
5,0.245946,-0.054539,0.277862,-0.058171,1.0,0.220864,0.189272,-0.247031,0.252368,0.16164,0.058113,0.092573,0.363919
6,0.245012,-0.31523,0.106831,-0.27417,0.220864,1.0,0.853917,-0.418965,0.61532,-0.113744,0.419722,0.704257,0.496724
7,0.211613,-0.388262,0.110521,-0.322663,0.189272,0.853917,1.0,-0.507935,0.663534,-0.188508,0.521171,0.787622,0.513729
8,-0.132392,0.264812,0.223184,0.364135,-0.247031,-0.418965,-0.507935,1.0,-0.363269,0.144517,-0.202223,-0.494678,-0.33598
9,0.118965,-0.209218,0.007547,-0.205917,0.252368,0.61532,0.663534,-0.363269,1.0,-0.026697,0.297833,0.529136,0.382099
10,0.523134,0.299212,0.227656,0.074074,0.16164,-0.113744,-0.188508,0.144517,-0.026697,1.0,-0.549865,-0.423514,0.250161


In [8]:
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

# Naive Bayes expects independent features, so we could try Principle Component Analysis
# to try to improve our results

# Create the PCA object 
pca = PCA()
# Fit PCA object to training data and return the components (new features)
transformed_train_features = pca.fit_transform(uw_train_features, uw_train_targets)

# Create and fit a second Gaussian classifier to the transformed data
gaussian_nb2 = GaussianNB()
gaussian_nb2.fit(transformed_train_features, uw_train_targets)

# Tranform the test data and get the scores for both training and test data
transformed_test_features = pca.transform(uw_test_features)
(gaussian_nb2.score(transformed_train_features,uw_train_targets), gaussian_nb2.score(transformed_test_features,uw_test_targets))


(0.9788732394366197, 1.0)

In [9]:
from sklearn.pipeline import make_pipeline

# The second classifier can be created much simpler using a Pipeline
pca_gaussian_nb = make_pipeline(PCA(),GaussianNB())
pca_gaussian_nb.fit(uw_train_features, uw_train_targets)

# Get the scores for both training and test data
(pca_gaussian_nb.score(uw_train_features, uw_train_targets), pca_gaussian_nb.score(uw_test_features, uw_test_targets))

(0.9788732394366197, 1.0)

In [10]:
from sklearn.model_selection import cross_val_score

# Test three different var_smoothing parameter values with 5-fold cross validation
scores1 = cross_val_score(GaussianNB(var_smoothing=1e-9), uw_train_features, uw_train_targets, cv=5)
scores2 = cross_val_score(GaussianNB(var_smoothing=1e-6), uw_train_features, uw_train_targets, cv=5)
scores3 = cross_val_score(GaussianNB(var_smoothing=1e-4), uw_train_features, uw_train_targets, cv=5)

# Show the average scores and the standard deviation of the three models
(scores1.mean(), scores1.std(), scores2.mean(), scores2.std(), scores3.mean(), scores3.std())

# var_smoothing=1e-9 offers the best average performance

# we finish with a new training on the entire training set with the best parameter value

(0.9790640394088671,
 0.01710005414552686,
 0.972167487684729,
 0.02587496746141655,
 0.916256157635468,
 0.0800511557183895)