In [126]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

In [127]:
datasets.load_iris()

# datasets module stores its datasets as dictionary objects. We're only interested in the data and target keys.

 'data': array([[ 5.1,  3.5,  1.4,  0.2],
        [ 4.9,  3. ,  1.4,  0.2],
        [ 4.7,  3.2,  1.3,  0.2],
        [ 4.6,  3.1,  1.5,  0.2],
        [ 5. ,  3.6,  1.4,  0.2],
        [ 5.4,  3.9,  1.7,  0.4],
        [ 4.6,  3.4,  1.4,  0.3],
        [ 5. ,  3.4,  1.5,  0.2],
        [ 4.4,  2.9,  1.4,  0.2],
        [ 4.9,  3.1,  1.5,  0.1],
        [ 5.4,  3.7,  1.5,  0.2],
        [ 4.8,  3.4,  1.6,  0.2],
        [ 4.8,  3. ,  1.4,  0.1],
        [ 4.3,  3. ,  1.1,  0.1],
        [ 5.8,  4. ,  1.2,  0.2],
        [ 5.7,  4.4,  1.5,  0.4],
        [ 5.4,  3.9,  1.3,  0.4],
        [ 5.1,  3.5,  1.4,  0.3],
        [ 5.7,  3.8,  1.7,  0.3],
        [ 5.1,  3.8,  1.5,  0.3],
        [ 5.4,  3.4,  1.7,  0.2],
        [ 5.1,  3.7,  1.5,  0.4],
        [ 4.6,  3.6,  1. ,  0.2],
        [ 5.1,  3.3,  1.7,  0.5],
        [ 4.8,  3.4,  1.9,  0.2],
        [ 5. ,  3. ,  1.6,  0.2],
        [ 5. ,  3.4,  1.6,  0.4],
        [ 5.2,  3.5,  1.5,  0.2],
        [ 5.2,  3.4,  1.4,  0.2],
      

In [128]:
iris = datasets.load_iris()

In [129]:
data = pd.DataFrame(data = iris.data)
data.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [130]:
target = pd.DataFrame(data = iris.target)
target.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [131]:
# concat the two together.

df = pd.concat((data, target), axis = 1) 
df.head()

Unnamed: 0,0,1,2,3,0.1
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [132]:
# set actual column names
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df.columns = col_names
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [133]:
# we're trying to predict species (0 = setosa, 1 = versicolor, 2 = vergincia). 
# map them to the actual values for easier representation (but not required for actual model)

vals_to_replace = {0: 'setosa', 1: 'versicolor', 2: 'vergincia'}
df['species'] = df['species'].map(vals_to_replace)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [134]:
# split into train and test sets
# 70% will be train and 30% will be test

train_number = int(df['sepal_length'].count() * (0.70))
print(train_number)

105


In [135]:
train = df.sample(n = train_number, random_state = 1)
train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
14,5.8,4.0,1.2,0.2,setosa
98,5.1,2.5,3.0,1.1,versicolor
75,6.6,3.0,4.4,1.4,versicolor
16,5.4,3.9,1.3,0.4,setosa
131,7.9,3.8,6.4,2.0,vergincia


In [136]:
test = df.drop(train.index)
test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa


In [137]:
train_X = np.asarray(train[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
train_y = np.asarray(train['species'])

In [138]:
test_X = np.asarray(test[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])
test_y = np.asarray(test['species'])

In [139]:
# Naive Bayes (NB) classifier works will with large datasets.

clf = GaussianNB() #naive bayes classifier
clf = clf.fit(train_X, train_y)
y_pred = clf.predict(test_X)

print(y_pred)

['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'vergincia' 'versicolor' 'versicolor' 'versicolor'
 'versicolor' 'versicolor' 'versicolor' 'versicolor' 'versicolor'
 'vergincia' 'versicolor' 'vergincia' 'vergincia' 'vergincia' 'versicolor'
 'versicolor' 'vergincia' 'vergincia' 'vergincia' 'vergincia' 'vergincia'
 'vergincia']


In [140]:
print("Number of mislabeled points out of a total {} points : {}"
      .format(len(test),(test_y != y_pred).sum()))

Number of mislabeled points out of a total 45 points : 4


In [145]:
1 - ((test_y != y_pred).sum()/len(test)) # get success rate

0.91111111111111109

In [149]:
test['evaluation'] = (test_y == y_pred)
test['pred'] = y_pred

In [151]:
test[test['evaluation'] == False] # get misclassified predictions

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,evaluation,pred
70,5.9,3.2,4.8,1.8,versicolor,False,vergincia
106,4.9,2.5,4.5,1.7,vergincia,False,versicolor
133,6.3,2.8,5.1,1.5,vergincia,False,versicolor
134,6.1,2.6,5.6,1.4,vergincia,False,versicolor
