In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import sys
root = '/content/drive/My Drive/Colab Notebooks/Project 1 - Naive Bayes'
sys.path.append(root)

In [5]:
import sklearn
import numpy as np
from sklearn import datasets
import scipy.stats
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.model_selection import train_test_split
import pandas as pd

In [6]:
from model import naivebayes 

# **1. Iris Dataset**

Let's look at iris dataset first. The dataset has four numeric variables, one categorical response variable. There are three types of iris flower, setosa, versicolor and virginica. Each type has 50 records, and the whole dataset contains 150 records in total.

In [7]:
iris_data = sklearn.datasets.load_iris()

iris_df = {}

for i in range(len(iris_data.feature_names)):
  iris_df[iris_data.feature_names[i]] = iris_data.data[:, i]
iris_df['specie'] = [iris_data.target_names[iris_data.target[i]] for i in range(len(iris_data.target))]

iris_df = pd.DataFrame(iris_df)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specie
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [8]:
iris_df.groupby('specie').count().iloc[:, 1]

specie
setosa        50
versicolor    50
virginica     50
Name: sepal width (cm), dtype: int64

We split dataset to train set and test set. Test size is 0.2 of total dataset. We also use stratified split to make the distribution of target variable be equal between train and test.

In [9]:
X = iris_data.data
y = iris_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, stratify= y)

## **1.1 Our Model**

In this part, we will fit train data to our Gaussian Naive Bayes model.

In [10]:
our_gnb = naivebayes.GaussianNB()
our_gnb.fit(X_train, y_train)

In [11]:
import importlib
importlib.reload(naivebayes)

<module 'model.naivebayes' from '/content/drive/My Drive/Colab Notebooks/Project 1 - Naive Bayes/model/naivebayes.py'>

In [12]:
our_pred = our_gnb.predict(X_test)
acc = (y_test == our_pred).sum() / len(y_test)
print("Accuracy on test set:", acc)

Accuracy on test set: 0.9666666666666667


## **1.2 Scikit learn Model**

In [13]:
sk_gnb = GaussianNB()
sk_gnb.fit(X_train, y_train)

GaussianNB()

In [14]:
sk_pred = sk_gnb.predict(X_test)
acc = (y_test == sk_pred).sum() / len(y_test)
print("Accuracy on test set:", acc)

Accuracy on test set: 0.9666666666666667


In [15]:
print("Similarity with our model:", (sk_pred == our_pred).sum() / len(sk_pred))

Similarity with our model: 1.0


We visualize the iris dataset by parallel coordinates plot, each line coresponding to one record, its color represent the class label. 

In [60]:
import plotly.express as px
df = px.data.iris()
fig = px.parallel_coordinates(df, color="species_id",
                              dimensions=['sepal_width', 'sepal_length', 'petal_width',
                                          'petal_length'],
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              color_continuous_midpoint=2)
fig.show()

# **2. Breast Cancer Dataset**

Below are breast cancer dataset. The dataset contains 30 numeric features variables, and response variabe has two value, which is diagnosis, malignan and benign. 

In [16]:
breast_data = sklearn.datasets.load_breast_cancer()

breast_df = {}

for i in range(len(breast_data.feature_names)):
  breast_df[breast_data.feature_names[i]] = breast_data.data[:, i]
breast_df['diagnosis'] = [breast_data.target_names[breast_data.target[i]] for i in range(len(breast_data.target))]

breast_df = pd.DataFrame(breast_df)
breast_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,1.0950,0.9053,8.589,153.40,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,malignant
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.01860,0.01340,0.01389,0.003532,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,malignant
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.006150,0.04006,0.03832,0.02058,0.02250,0.004571,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,0.4956,1.1560,3.445,27.23,0.009110,0.07458,0.05661,0.01867,0.05963,0.009208,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,malignant
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.011490,0.02461,0.05688,0.01885,0.01756,0.005115,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,malignant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,1.1760,1.2560,7.673,158.70,0.010300,0.02891,0.05198,0.02454,0.01114,0.004239,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,malignant
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,0.7655,2.4630,5.203,99.04,0.005769,0.02423,0.03950,0.01678,0.01898,0.002498,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,malignant
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,0.4564,1.0750,3.425,48.55,0.005903,0.03731,0.04730,0.01557,0.01318,0.003892,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,malignant
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,0.7260,1.5950,5.772,86.22,0.006522,0.06158,0.07117,0.01664,0.02324,0.006185,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,malignant


In [17]:
print("Number of features:", len(breast_data.feature_names))
print("Target name:", breast_data.target_names)

Number of features: 30
Target name: ['malignant' 'benign']


We can see that the target value is not equally distributed, there are more records which are benign than malignant. Like iris dataset, we use stratificationi to split train and test set.

In [18]:
breast_df.groupby('diagnosis').count().iloc[:, 0]

diagnosis
benign       357
malignant    212
Name: mean radius, dtype: int64

In [19]:
X = breast_data.data
y = breast_data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, stratify= y)

## **2.1 Our Model**

In [20]:
our_gnb = naivebayes.GaussianNB()
our_gnb.fit(X_train, y_train)

In [21]:
our_pred = our_gnb.predict(X_test)
acc = (y_test == our_pred).sum() / len(y_test)
print("Accuracy on test set:", acc)

Accuracy on test set: 0.9210526315789473


  prob_of_class[:, i] += np.log2(likelihood)


## **2.2 Scikit learn Model**

In [22]:
sk_gnb = GaussianNB()
sk_gnb.fit(X_train, y_train)

GaussianNB()

In [23]:
sk_pred = sk_gnb.predict(X_test)
acc = (y_test == sk_pred).sum() / len(y_test)
print("Accuracy on test set:", acc)

Accuracy on test set: 0.956140350877193


In [24]:
print("Similarity with our model:", (sk_pred == our_pred).sum() / len(sk_pred))

Similarity with our model: 0.9649122807017544


# **3. Wine Dataset**

The wine dataset has 13 real feature variables and it's target varibale is categorical, and has 3 values. Below is overview of the dataset.

In [25]:
wine_data = sklearn.datasets.load_wine()

wine_df = {}

for i in range(len(wine_data.feature_names)):
  wine_df[wine_data.feature_names[i]] = wine_data.data[:, i]
wine_df['class'] = [wine_data.target_names[wine_data.target[i]] for i in range(len(wine_data.target))]

wine_df = pd.DataFrame(wine_df)
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,class
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,class_0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,class_0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,class_0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,class_0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,class_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,class_2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,class_2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,class_2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,class_2


In [26]:
wine_df.groupby('class').count().iloc[:, 0]

class
class_0    59
class_1    71
class_2    48
Name: alcohol, dtype: int64

Split data to train and test set.

In [27]:
X = wine_data.data
y = wine_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, stratify= y)

## **3.1 Our Model**

In [28]:
our_gnb = naivebayes.GaussianNB()
our_gnb.fit(X_train, y_train)

In [29]:
our_pred = our_gnb.predict(X_test)
print("Accuracy on test set:", (y_test == our_pred).sum() / len(y_test))

Accuracy on test set: 0.9722222222222222


## **3.2 Scikit learn Model**

In [30]:
sk_gnb = GaussianNB()
sk_gnb.fit(X_train, y_train)

GaussianNB()

In [31]:
sk_pred = sk_gnb.predict(X_test)
print("Accuracy on test set:", (y_test == sk_pred).sum() / len(y_test))

Accuracy on test set: 0.9722222222222222


In [32]:
print("Similarity with our model:", (sk_pred == our_pred).sum() / len(sk_pred))

Similarity with our model: 1.0


# **4. Categorical NB**

In [33]:
X = np.array([['Young', 'High', 'No', 'Fair'],
              ['Young', 'High', 'No', 'Excellent'],
              ['Medium', 'High', 'No', 'Fair'],
              ['Old', 'Medium', 'No', 'Fair'],
              ['Old', 'Low', 'Yes', 'Fair'],
              ['Old', 'Low', 'Yes', 'Excellent'],
              ['Medium', 'Low', 'Yes', 'Excellent'],
              ['Young', 'Medium', 'No', 'Fair'],
              ['Young', 'Low', 'Yes', 'Fair'],
              ['Old', 'Medium', 'Yes', 'Fair'],
              ['Young', 'Medium', 'Yes', 'Excellent'],
              ['Medium', 'Medium', 'No', 'Excellent'],
              ['Medium', 'High', 'Yes', 'Fair'],
              ['Old', 'Medium', 'No', 'Excellent']])
y = np.array(['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No'])

In [34]:
nb = naivebayes.CategoricalNB()
nb.fit(X, y)

In [35]:
instance = np.array([['Young', 'Medium', 'Yes', 'Fair'],
                     ['Old', 'Medium', 'No', 'Excellent']])
pred = nb.predict(instance)
print(pred)

['Yes' 'No']
