In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only input directory tree and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/uci-wine/wine.data
/kaggle/input/playtennis3/dataset.csv


In [2]:
# Read the play tennis dataset and cast every column to the pandas
# 'category' dtype in one step
play_tennis = pd.read_csv('/kaggle/input/playtennis3/dataset.csv').astype('category')

# Display the first few rows
play_tennis.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play?
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Cool,Normal,Weak,Yes
4,Rain,Mild,High,Weak,Yes


In [3]:
from sklearn.preprocessing import OrdinalEncoder

# Encode every column with OrdinalEncoder (one numeric code per category).
# The encoder is fitted on the entire dataset, before splitting, so that every
# category value receives a code even if it only occurs in the held-out row.
play_tennis_encoded = pd.DataFrame(OrdinalEncoder().fit_transform(play_tennis))

# Split into training and test: the last row is held out as the only test instance
pt_train = play_tennis_encoded.iloc[0:-1]
pt_test = play_tennis_encoded.iloc[-1:]

# Split features from targets for easy access.
# The target ("Play?") is the last column and every column before it is a
# feature. Slicing relative to the end keeps all four features; a 0:3 slice
# stops one column short and silently drops "Wind".
# NOTE(review): assumes the frame holds exactly the columns shown by
# play_tennis.head() (Outlook, Temperature, Humidity, Wind, Play?) — confirm.
pt_train_features = pt_train.iloc[:, :-1]
pt_train_targets = pt_train.iloc[:, -1]
pt_test_features = pt_test.iloc[:, :-1]
pt_test_targets = pt_test.iloc[:, -1]

In [4]:
from sklearn.naive_bayes import CategoricalNB

# Build a naive Bayes classifier for categorical features and train it on the
# training split (fit() returns the estimator, so both steps chain)
categorical_nb = CategoricalNB().fit(pt_train_features, pt_train_targets)

# Mean accuracy of the classifier on the held-out test data
categorical_nb.score(pt_test_features, pt_test_targets)

1.0

In [5]:
# Load the UCI wine dataset. The file has no header row, so columns are
# numbered; column 0 is used as the class label and is stored as categorical.
uci_wine = pd.read_csv('/kaggle/input/uci-wine/wine.data', header=None)
uci_wine[0] = uci_wine[0].astype('category')

# Fixed seed so the split, and every score computed from it, is identical on
# each "Restart & Run All"
SPLIT_SEED = 42

# Perform a stratified 80/20 split: draw 80% of the rows of each class for
# training; the rows that were not drawn form the test set
uci_wine_train = uci_wine.groupby(0, group_keys=False).apply(
    lambda group: group.sample(frac=0.8, random_state=SPLIT_SEED)
)
uci_wine_test = uci_wine.drop(uci_wine_train.index)

# Split into features and ground truths/target values
uw_train_targets = uci_wine_train[0]
uw_train_features = uci_wine_train.drop(0, axis=1)
uw_test_targets = uci_wine_test[0]
uw_test_features = uci_wine_test.drop(0, axis=1)

In [6]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the wine training split
# (fit() returns the estimator, so construction and training chain)
gaussian_nb = GaussianNB().fit(uw_train_features, uw_train_targets)

# Mean accuracy on the training split, then on the held-out test split
gnb_train_accuracy = gaussian_nb.score(uw_train_features, uw_train_targets)
gnb_test_accuracy = gaussian_nb.score(uw_test_features, uw_test_targets)
(gnb_train_accuracy, gnb_test_accuracy)

(0.9859154929577465, 1.0)

In [7]:
# Pairwise correlation between the training features
correlation = uw_train_features.corr()
# Show the matrix as a heatmap; axis=None applies one colour scale to the whole table
correlation.style.background_gradient(axis=None, cmap='coolwarm')

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
1,1.0,0.153864,0.186216,-0.290791,0.272764,0.21608,0.174437,-0.152751,0.026448,0.552462,-0.132056,0.037527,0.629068
2,0.153864,1.0,0.169273,0.295003,-0.023612,-0.376003,-0.443269,0.335714,-0.247472,0.301363,-0.583187,-0.420309,-0.192938
3,0.186216,0.169273,1.0,0.472486,0.284305,0.123426,0.145014,0.206637,0.003113,0.238015,-0.062966,0.02545,0.251791
4,-0.290791,0.295003,0.472486,1.0,-0.085249,-0.318081,-0.328908,0.43829,-0.19725,0.019386,-0.25487,-0.306228,-0.381058
5,0.272764,-0.023612,0.284305,-0.085249,1.0,0.247058,0.230547,-0.293194,0.309299,0.171029,0.075405,0.106829,0.390005
6,0.21608,-0.376003,0.123426,-0.318081,0.247058,1.0,0.860795,-0.442066,0.5878,-0.111362,0.426312,0.715179,0.501294
7,0.174437,-0.443269,0.145014,-0.328908,0.230547,0.860795,1.0,-0.510546,0.633676,-0.170186,0.512893,0.78518,0.489536
8,-0.152751,0.335714,0.206637,0.43829,-0.293194,-0.442066,-0.510546,1.0,-0.362668,0.101131,-0.228893,-0.45947,-0.349993
9,0.026448,-0.247472,0.003113,-0.19725,0.309299,0.5878,0.633676,-0.362668,1.0,-0.083595,0.300323,0.506176,0.31767
10,0.552462,0.301363,0.238015,0.019386,0.171029,-0.111362,-0.170186,0.101131,-0.083595,1.0,-0.526061,-0.430499,0.308461


In [8]:
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

# Naive Bayes assumes independent features, so we can try Principal Component
# Analysis on the features to see whether it improves our results

# Fit PCA on the training features only and project them onto the components
# (the new features). PCA is unsupervised, so no targets are passed.
pca = PCA()
transformed_train_features = pca.fit_transform(uw_train_features)

# Train a second Gaussian classifier on the projected training features
gaussian_nb2 = GaussianNB().fit(transformed_train_features, uw_train_targets)

# Transform the test features with the already-fitted PCA, then report the
# mean accuracy on the training data and on the test data
transformed_test_features = pca.transform(uw_test_features)
pca_train_accuracy = gaussian_nb2.score(transformed_train_features, uw_train_targets)
pca_test_accuracy = gaussian_nb2.score(transformed_test_features, uw_test_targets)
(pca_train_accuracy, pca_test_accuracy)


(0.9859154929577465, 0.9444444444444444)

In [9]:
from sklearn.pipeline import make_pipeline

# The same PCA -> Gaussian naive Bayes classifier, built more simply as a
# Pipeline that is fitted directly on the untransformed training features
pca_gaussian_nb = make_pipeline(PCA(), GaussianNB()).fit(uw_train_features, uw_train_targets)

# Mean accuracy on the training data, then on the test data; the pipeline
# applies the PCA step itself before scoring
pipeline_train_accuracy = pca_gaussian_nb.score(uw_train_features, uw_train_targets)
pipeline_test_accuracy = pca_gaussian_nb.score(uw_test_features, uw_test_targets)
(pipeline_train_accuracy, pipeline_test_accuracy)

(0.9859154929577465, 0.9444444444444444)