## Sklearn GNB (Gaussian Naive Bayes) example

Reference: https://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes

In [2]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB

# Load the iris dataset and create an (unfitted) Gaussian Naive Bayes classifier.
iris = datasets.load_iris()
gnb = GaussianNB()

In [4]:
# Fit on the full iris data and predict those same samples, so any mismatch
# count derived from y_pred is a training error, not a held-out one.
y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)

In [5]:
# Total number of samples and how many of them were predicted incorrectly.
n_total = iris.data.shape[0]
n_wrong = (iris.target != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_total, n_wrong))

Number of mislabeled points out of a total 150 points : 6


In [14]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Quiz In Lesson

### Implement with `sklearn.naive_bayes.GaussianNB`

In [32]:
from sklearn.naive_bayes import GaussianNB
import json

gnb = GaussianNB()

# Load the training set. The JSON object has two keys: 'states' (one feature
# list per sample) and 'labels' (the maneuver label for each sample).
# The encoding is given explicitly so the read does not depend on the
# platform's default locale encoding.
with open('train.json', encoding='utf-8') as f:
    train_data = json.load(f)

print(train_data.keys())
X_train = train_data['states']
Y_train = train_data['labels']

dict_keys(['states', 'labels'])


In [33]:
X_train[0]

[34.76807295828066, 0.8329138124321809, 8.206638634642987, -0.9989611424577053]

In [34]:
Y_train[0]

'left'

In [35]:
len(X_train)

750

In [38]:
## Load test data
# Same layout as the training file: 'states' and 'labels'. The encoding is
# explicit so the read does not depend on the platform's default locale.
with open('test.json', encoding='utf-8') as f:
    test_data = json.load(f)

X_test = test_data['states']
Y_test = test_data['labels']

In [39]:
X_test[0]

[21.274185669072015,
 1.2821698906640548,
 10.513370658008723,
 1.2038825759131897]

In [40]:
Y_test[0]

'right'

In [41]:
len(X_test)

250

In [44]:
## train on the training split, then predict a label for every test state
gnb.fit(X_train, Y_train)
y_pred = gnb.predict(X_test)

In [45]:
y_pred[:10]

array(['right', 'keep', 'left', 'left', 'keep', 'left', 'left', 'keep',
       'left', 'right'], 
      dtype='<U5')

In [47]:
## evaluate: share of test samples whose predicted label equals the true label
score = sum(1 for predicted, label in zip(y_pred, Y_test) if predicted == label)
fraction_correct = float(score) / len(Y_test)
print ("You got {} percent correct".format(100 * fraction_correct))

You got 84.39999999999999 percent correct


### Implement from scratch

In [1]:
import numpy as np
import random
from math import sqrt, pi, exp

## gaussian function
def gaussian_prob(obs, mu, sig):
    """Evaluate the normal density with mean `mu` and standard deviation `sig` at `obs`.

    Computes 1 / sqrt(2*pi*sig^2) * exp(-(obs - mu)^2 / (2*sig^2)).
    """
    variance = sig**2
    squared_dev = (obs - mu)**2
    scale = 1 / sqrt(2*pi*variance)
    return scale * exp(-squared_dev / (2*variance))

In [3]:
gaussian_prob(1, 0, 1)

0.24197072451914337

In [26]:
## class : gaussian naive bayes

class GNB():
    """Gaussian Naive Bayes classifier for the labels 'left', 'keep' and 'right'.

    Each feature is modelled, per class, as an independent normal distribution
    whose mean and standard deviation are estimated in `train`. Class priors
    are not modelled: every class is weighted equally when scoring.
    """

    # Lower bound applied to a fitted standard deviation when scoring, so that
    # a feature that was constant within a class does not cause a division by
    # zero.
    MIN_STD = 1e-9

    def __init__(self):
        self.classes = ['left','keep','right']

    def process_vars(self, vars):
        """Turn one raw observation into the feature tuple used by the model.

        Currently the identity on (s, d, s_dot, d_dot); this is the hook for
        any feature engineering.
        """
        # could do something fancy in here, but right now
        # s, d, s_dot and d_dot alone give good results
        s, d, s_dot, d_dot = vars
        return s, d, s_dot, d_dot

    def train(self, X, Y):
        """
        Estimate the per-class, per-feature mean and standard deviation.

        X is an array of training data, each entry of which is a
        length 4 array which represents a snapshot of a vehicle's
        s, d, s_dot, and d_dot coordinates.

        Y is an array of labels, each of which is either 'left', 'keep',
        or 'right'. These labels indicate what maneuver the vehicle was
        engaged in during the corresponding training data snapshot.

        The results are stored in `self._means` and `self._stds`, each a list
        with one row per entry of `self.classes` and one value per feature.
        A label that is not in `self.classes` raises KeyError.
        """
        num_vars = 4

        # One list of observed values per (label, feature). Features are kept
        # in the order returned by process_vars:
        #
        # {
        #   "left" :[ [s1,s2,...],[d1,d2,...],[s_dot1,s_dot2,...],[d_dot1,d_dot2,...] ],
        #   "keep" :[ ... ],
        #   "right":[ ... ],
        # }
        totals_by_label = {
            label: [[] for _ in range(num_vars)] for label in self.classes
        }

        for x, label in zip(X, Y):
            for i, val in enumerate(self.process_vars(x)):
                totals_by_label[label][i].append(val)

        # Mean and (population) standard deviation of every column. These are
        # the parameters of the per-class Gaussian likelihoods.
        means = []
        stds = []
        for label in self.classes:
            columns = totals_by_label[label]
            means.append([np.mean(col) for col in columns])
            stds.append([np.std(col) for col in columns])

        self._means = means ## np.array(gnb._means).shape , (3,4)
        self._stds = stds  ## np.array(gnb._stds).shape, (3,4)

    def _predict(self, obs):
        """
        Private method used to assign a probability to each class.

        Returns a list of floats, ordered like `self.classes`, that sums to 1.
        The per-feature Gaussian likelihoods are combined in log space and
        shifted by their maximum before exponentiating, so an observation far
        from every class mean no longer underflows all scores to zero (which
        made the normalisation divide by zero).
        """
        features = np.asarray(self.process_vars(obs), dtype=float)
        means = np.asarray(self._means, dtype=float)
        # Floor the spread so a zero standard deviation cannot divide by zero.
        stds = np.maximum(np.asarray(self._stds, dtype=float), self.MIN_STD)
        variances = stds ** 2

        # log N(o | mu, sig) for every (class, feature) pair; `features`
        # broadcasts across the class rows.
        log_likelihood = (
            -0.5 * np.log(2 * np.pi * variances)
            - (features - means) ** 2 / (2 * variances)
        )
        log_joint = log_likelihood.sum(axis=1)

        # Subtracting the maximum leaves the normalised result unchanged but
        # guarantees at least one weight equals exp(0) == 1.
        weights = np.exp(log_joint - log_joint.max())
        return (weights / weights.sum()).tolist()

    def predict(self, observation):
        """Return the label from `self.classes` with the highest probability.

        Ties resolve to the class that appears first in `self.classes`.
        """
        probs = self._predict(observation)
        best_idx = max(range(len(probs)), key=probs.__getitem__)
        return self.classes[best_idx]

### class GNB() unit test

In [9]:
## Load train data
import json

# The encoding is given explicitly so the reads below do not depend on the
# platform's default locale encoding.
with open('train.json', encoding='utf-8') as f:
    train_data = json.load(f)

print (train_data.keys())
X_train = train_data['states']
Y_train = train_data['labels']

## Load test data
with open('test.json', encoding='utf-8') as f:
    test_data = json.load(f)

X_test = test_data['states']
Y_test = test_data['labels']

dict_keys(['states', 'labels'])


In [14]:
### test data sample: the first training state and its label
x, y = X_train[0], Y_train[0]

In [15]:
x

[34.76807295828066, 0.8329138124321809, 8.206638634642987, -0.9989611424577053]

In [27]:
# Fit the from-scratch classifier, then display the per-feature means it
# stored for the first class in gnb.classes ('left').
gnb = GNB()
gnb.train(X_train, Y_train)
gnb._means[0]

[19.714133843021123,
 5.0518059713027306,
 9.9141257688632702,
 -0.96708726365040631]

In [32]:
# For one test state, print the normalised class probabilities (ordered like
# gnb.classes) followed by the label that predict() selects from them.
observation = X_test[2]
print(gnb._predict(observation))
print(gnb.predict(observation))

[0.9959381405520635, 1.1522196925886314e-15, 0.004061859447935169]
left


### Final test

In [34]:
## Load train data
import json

# The encoding is given explicitly so the reads below do not depend on the
# platform's default locale encoding.
with open('train.json', encoding='utf-8') as f:
    train_data = json.load(f)

print (train_data.keys())
X_train = train_data['states']
Y_train = train_data['labels']

## Load test data
with open('test.json', encoding='utf-8') as f:
    test_data = json.load(f)

X_test = test_data['states']
Y_test = test_data['labels']

gnb = GNB()
gnb.train(X_train, Y_train)

## evaluate: share of test states whose predicted label equals the true label
score = sum(1 for coords, label in zip(X_test, Y_test) if gnb.predict(coords) == label)
fraction_correct = float(score) / len(Y_test)
print ("You got {} percent correct".format(100 * fraction_correct))

dict_keys(['states', 'labels'])
You got 84.39999999999999 percent correct
