### Naive Bayes Classification

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

In [12]:
#Load in datasets
iris = load_iris()
X= iris.data.copy()
y = np.array([iris.target_names[yi] for yi in iris.target])
n_quantiles = 3
#take a look at the description of iris dataset
print(iris['DESCR'])
print(X.shape)
print(X[:10, :4])
print(y[:10])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [29]:
#map each value to which quantile it belongs
def quantize(x, n_intervals = 3):
    p = np.percentile(x, np.linspace(0,100, num = (n_intervals + 1)))
    return np.array([max(sum(xi > p), 1) for xi in x])

In [30]:
#discrete features by their quantile
for feature in range(X.shape[1]):
    X[:, feature] = quantize(X[:, feature], n_quantiles)
X = X.astype(int)    

In [14]:
#Split data into train/test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)
print("Number of train instances:", len(X_train))
print("Number of test instances:", len(X_test))
print("Number of features:", X.shape[1])

Number of train instances: 112
Number of test instances: 38
Number of features: 4


In [31]:
classes = np.unique(y)
n,d,C = X_train.shape[0], X_train.shape[1], len(classes)

In [16]:
#in order to aviod # of instance = 0, set Laplace factor = 1
alpha = 1

In [32]:
#Store the class probabilities into a dic
class_probs = {}
#get probability of each value
for c in classes:
    class_probs[c] = (np.array(y_train == c).sum() + alpha) / (n+C*alpha)

Formula: P(Xij = x| yi = c) = (Nj,x,c + alpha)/(Nc + Qj*alpha)
Xij is jth feature for ith instance, yi = c means this instance is in class . 
Nj,x,c is # of how many times jth feature with value x falls into class c;
Nc is # of instance in class c
Qj is the # of possible values for jth feature

In [36]:
#first get Q list of possible values for each features
possible_values = [set(X_train[:, feature]) for feature in range(d)]
#store them in a dic with key (feature, class), as feature_probs[j,c] maps 
#each possible value of the jth feature to its probability
#i.e. P(Xij = x|yi = c) can be obtained from feture_probs[j,c][k]
feature_probs = {(j,c): {v: 0 for v in possible_values[j]}
                for c in classes for j in range(d)}
#now get the probabilities for each feature, given each class
for j in range(d):
    for c in classes:
        # the number of instance in class c for jth feature
        in_class_c = X_train[y_train ==c,j]
        for x in possible_values[j]:
            numerator = sum(in_class_c == x) + alpha
            denominator = len(in_class_c) + len(possible_values[j])*alpha
            feature_probs[j,c][x] = numerator / denominator

### Test

In [38]:
test_size = len(X_test)
y_pred = []
for i in range(test_size):
    posterior_prob = {c: 0 for c in classes}
    y_max = classes[0]
    
    for c in classes:
        #get the posterior p for class c
        posterior_prob[c] = class_probs[c]
        for j in range(d):
            x = X_test[i,j]
            posterior_prob[c]*=feature_probs[j,c][x]
            
        #update which class has the max posterior
        if posterior_prob[c] >= posterior_prob[y_max]:
            y_max = c
            
    y_pred.append(y_max)
        

KeyError: 3.6000000000000001

In [39]:
print(classification_report(y_test, y_pred))

ValueError: Found input variables with inconsistent numbers of samples: [38, 10]