# Lab4 - Machine Learning
## Classification with KNN and Naive Bayes

## 1. k-nearest-neighbors (kNN)

In [None]:
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
import scipy.io

In [None]:
def load_data_subset(path='data/mnist/mnistSubset5and9.mat'):
    """Load the MNIST subset that contains only the digits 5 and 9.

    The .mat file is expected to hold the arrays ``train5``, ``train9``,
    ``test5`` and ``test9`` with one flattened image per row.

    :param path: location of the .mat file; defaults to the path used by the lab.
    :return: ``X_train, X_test, y_train, y_test`` where the X arrays are the
             stacked images (all 5s first, then all 9s) divided by 255, and
             the y arrays are the matching one-hot labels with column 0
             marking a 5 and column 1 marking a 9.
    """
    data = scipy.io.loadmat(path)

    # 255.0 keeps this a true division whether or not
    # `from __future__ import division` is active.
    X_train = np.vstack([data['train5'], data['train9']]) / 255.0
    X_test = np.vstack([data['test5'], data['test9']]) / 255.0

    def one_hot(n_fives, n_nines):
        # Rows follow the stacking order above: the 5s come first, then the 9s.
        labels = np.zeros((n_fives + n_nines, 2), dtype=int)
        labels[:n_fives, 0] = 1
        labels[n_fives:, 1] = 1
        return labels

    y_train = one_hot(data['train5'].shape[0], data['train9'].shape[0])
    y_test = one_hot(data['test5'].shape[0], data['test9'].shape[0])

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Train data array size:  {}".format(X_train.shape))
    print("Train truth array size:  {}".format(y_train.shape))
    print("Test data array size:  {}".format(X_test.shape))
    print("Test truth array size:  {}".format(y_test.shape))
    return X_train, X_test, y_train, y_test

In [None]:
# Load the 5-vs-9 MNIST subset; the kNN cells below use these four arrays.
X_train, X_test, y_train, y_test = load_data_subset()

In [None]:
def sqrdist(x, y):
    """Return the Euclidean distance from every row of ``x`` to ``y``.

    Despite the name, the result is the square root of the summed squared
    differences, i.e. the distance itself and not its square.

    :param x: matrix of shape (N x D), one point per row.
    :param y: a single point, given as a (1 x D) row or a length-D vector.
    :return: length-N vector whose n-th entry is the distance between
             ``x[n]`` and ``y``.
    """
    # Broadcasting stretches y across the rows of x, so no explicit
    # N x D copy of y (np.tile) is needed.
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.square(diff).sum(axis=1))

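Here you should implement the `knn_test` function. It takes as input an $N \times D$ matrix $X$ with the training data, an $N \times C$ matrix $T$ holding the one-hot ground-truth vectors for the rows of $X$ (where $C$ is the number of classes), a matrix `Xtest` with the instances to classify, and the number of neighbors $K$. Inside the function you should compute, for each instance in `Xtest`, its $K$ nearest neighbors in $X$ and decide its class using the majority rule. The function returns `Ttest`, which holds one row of class scores (e.g. a one-hot vector) per instance of `Xtest`.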

Afterwards you can choose the best $K$ using cross-validation and then run `knn_test` on the test data.

In [None]:
def knn_test(X, T, Xtest, K):
    """Classify every row of ``Xtest`` by majority vote among its K nearest
    training points (Euclidean distance).

    :param X: (N x D) matrix with the training instances.
    :param T: (N x C) matrix with the one-hot ground truth of ``X``.
    :param Xtest: (M x D) matrix with the instances to classify.
    :param K: number of neighbours that vote; must satisfy 1 <= K <= N.
    :return: (M x C) one-hot matrix ``Ttest`` with the predicted class of
             every test instance. When several classes tie, the one with
             the lowest column index wins.
    :raises ValueError: if ``X`` and ``T`` disagree on N or K is out of range.
    """
    X = np.asarray(X, dtype=float)
    T = np.asarray(T)
    Xtest = np.asarray(Xtest, dtype=float)

    n_train = X.shape[0]
    if T.shape[0] != n_train:
        raise ValueError("X and T must have the same number of rows")
    if not 1 <= K <= n_train:
        raise ValueError("K must be between 1 and the number of training rows")

    n_test = Xtest.shape[0]
    Ttest = np.zeros((n_test, T.shape[1]), dtype=T.dtype)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; the square root is skipped
    # because it does not change the ordering of the neighbours.
    train_sq = np.square(X).sum(axis=1)

    # Work on slices of the test set so the distance matrix stays small.
    batch = 256
    for start in range(0, n_test, batch):
        chunk = Xtest[start:start + batch]
        dist_sq = (np.square(chunk).sum(axis=1)[:, np.newaxis]
                   + train_sq[np.newaxis, :]
                   - 2.0 * chunk.dot(X.T))
        # A stable sort keeps equidistant neighbours in training order,
        # which makes the result deterministic.
        nearest = np.argsort(dist_sq, axis=1, kind='mergesort')[:, :K]
        # Summing the one-hot rows of the K neighbours gives per-class votes.
        votes = T[nearest].sum(axis=1)
        winners = np.argmax(votes, axis=1)
        Ttest[np.arange(start, start + chunk.shape[0]), winners] = 1
    return Ttest

In [None]:
def cross_validation(K, numFolds, X, T):
    """Select the number of neighbours for kNN by cross-validation.

    The data are shuffled with a fixed seed and cut into ``numFolds`` folds.
    For every k in 1..K each fold is used once as validation set while the
    remaining folds form the training set for ``knn_test``. The average
    validation error per k is plotted.

    :param K: largest number of neighbours to try.
    :param numFolds: number of folds; must satisfy 2 <= numFolds <= N.
    :param X: (N x D) matrix with the input data.
    :param T: matrix with one one-hot ground-truth row per row of ``X``.
    :return: the k in 1..K with the lowest average validation error.
    :raises ValueError: if ``numFolds`` is out of range.
    """
    N = X.shape[0]
    if not 2 <= numFolds <= N:
        raise ValueError("numFolds must be between 2 and the number of rows in X")

    # Random permute the data before we split them. The seed is fixed so
    # every run produces the same folds.
    np.random.seed(10)
    perm = np.random.permutation(N)
    X = X[perm]
    T = T[perm]

    # array_split spreads any remainder over the folds, so no fold ends up
    # with a single sample when N is not a multiple of numFolds.
    Xfolds = np.array_split(X, numFolds)
    Tfolds = np.array_split(T, numFolds)

    valerr = np.zeros((K, numFolds))
    for k in range(1, K + 1):
        for j in range(numFolds):
            X_fit = np.vstack([fold for i, fold in enumerate(Xfolds) if i != j])
            T_fit = np.vstack([fold for i, fold in enumerate(Tfolds) if i != j])
            X_val = Xfolds[j]
            # prediction using kNN
            Ttest = knn_test(X_fit, T_fit, X_val, k)
            # Compute the fraction of misclassified data points
            Tval = np.argmax(Ttest, axis=1)
            Tcor = np.argmax(Tfolds[j], axis=1)
            valerr[k - 1, j] = np.count_nonzero(Tval != Tcor) / float(X_val.shape[0])
            print("kNN for k = {}, fold # {}, error = {} ".format(k, j, valerr[k - 1, j]))

    # average the validation errors
    val = valerr.sum(axis=1) / numFolds
    # select best k (row 0 of val belongs to k = 1)
    kbest = np.argmin(val) + 1

    # Plot the evolution of the validation error with respect to k
    plt.plot(range(1, K + 1), val * 100)
    plt.ylabel('Average Validation Error %')
    plt.xlabel('K Nearest Neighbors')
    plt.show()

    # kbest is 1-based while val is 0-based, hence the -1.
    print("Best k is {} with average error = {} ".format(kbest, val[kbest - 1]))
    return kbest

In [None]:
# Try k = 1..20 with 10-fold cross-validation on the training data and keep the best k.
kbest = cross_validation(20, 10, X_train, y_train)

In [None]:
# Classify the held-out test set using the k chosen by cross-validation.
Ttest = knn_test(X_train, y_train, X_test, kbest)

# Turn predictions and ground truth into class indices so they can be compared.
Tval = np.argmax(Ttest, axis=1)
Tcor = np.argmax(y_test, axis=1)

# Fraction of misclassified test points, computed once and reported twice:
# first raw, then as a percentage. Single-argument print(...) works on
# both Python 2 and Python 3.
test_error = np.count_nonzero(Tval != Tcor) / float(X_test.shape[0])
print(test_error)
print("Error is  {}  %".format(test_error * 100))

## 2. Naive Bayes

Check slides of Lec5.pdf - slides 20-56

Check Bishop - Pattern Recognition and Machine Learning Chapter 4.2.3 - Generative Models with Discrete features - Naive Bayes.

In Discriminative models given data $\mathbf{X}$ we want to calculate the probability that it belongs to a class. So we are trying to model $P(\mathcal{C_k} \mid \mathbf{X})$, which actually is finding the boundaries between different classes. 

In Generative models we want to model the joint distribution of each class $P(\mathbf{X}, \, \mathcal{C_k})$.

Given an instance $\mathbf{x}$ we want to calculate $ P(\mathcal{C_k} \mid \mathbf{x})$ for each $k \in K$ and choose the category/class that holds the highest probability.


We will use Bayes Rule to calculate what the probability is 
$$
P(\mathcal{C_k} \mid \mathbf{x}) = \frac{P(\mathbf{x} \mid \mathcal{C_k}) \, P(\mathcal{C_k})}{P(\mathbf{x})} $$




The term "Naive" is adopted because of the naive assumption that every pair of features is independent given the class. Given a class variable $\mathcal{C_k}$ and a dependent feature vector x<sub>1</sub> through x<sub>n</sub>, Bayes’ theorem states the following relationship:

$$P(C_k \mid x_1, \dots, x_n) = \frac{P(x_1, \dots x_n \mid C_k)\, P(\mathcal{C_k})}
                                 {P(x_1, \dots, x_n)}$$
                                 
Using the naive independence assumption that
$$ P(x_i \mid \mathcal{C_k}, x_1, \dots, x_{i-1}, x_{i+1}, \dots, x_n) = P(x_i \mid \mathcal{C_k})$$

for all i, this relationship is simplified to:

$$P(C_k\mid x_1, \dots, x_n) = \frac{\prod_{i=1}^{n} P(x_i \mid \mathcal{C_k}) \,  P(\mathcal{C_k})}
                                 {P(x_1, \dots, x_n)}$$


Under the above "naive" assumption, we model the class-conditional likelihood $p(\mathbf{x} \mid \mathcal{C_k})$ of each of the K classes as a product of independent Bernoulli distributions, one per feature:
$$
p(\mathbf{x} \mid  \mathcal{C_k}) = \prod_{d=1}^D \mu_{k,d}^{x_d} \, (1- \mu_{k,d})^{1 - x_d}
$$
and 
$$ P(\mathbf{x}) = \sum_{j=1}^K P(\mathbf{x} \mid \mathcal{C_j}) \, P(\mathcal{C_j}) $$

Although we want to calculate $ P(\mathcal{C_k} \mid \mathbf{x})$, Naive Bayes is a Generative Model because it captures $P(\mathbf{x} \mid \mathcal{C_k}) \, P(\mathcal{C_k})$, which is equal to $P(\mathbf{X}, \, \mathcal{C_k})$.

Since the denominator is the same for every class for a given $\mathbf{x}$, we only need to calculate the numerators and compare them. Optionally you can apply the softmax function to the per-class log scores $\ln \left(p(\mathbf{x} \mid \mathcal{C_k})P(\mathcal{C_k})\right)$ to convert them to real probabilities. To fit the parameters, the log likelihood of the training instances that belong to class $\mathcal{C_k}$ is:
$$ \mathcal{L}(\mu_k) = \sum_{n \in X_k} \ln p(\mathbf{x}_n \mid \mathcal{C_k}) = \sum_{n \in X_k} \sum_{d=1}^D \left[ x_{nd}\,\ln\mu_{k,d} + (1 - x_{nd})\ln(1- \mu_{k,d}) \right]$$

By differentiating the above log likelihood with respect to $\mu_{k,d}$ and equating to zero we obtain the optimal parameters
$$\mu_{k,d} = \frac{\sum_{n \in X_k}  x_{n,d}}{N_k}$$

In [None]:
def _read_digit_files(paths):
    """Read one text file per class and return ``(pixels, one_hot_labels)``."""
    rows = []
    labels = []
    for digit, path in enumerate(paths):
        with open(path, 'r') as fp:
            for line in fp:
                # split() without an argument tolerates the trailing newline
                # and repeated or trailing blanks.
                tokens = line.split()
                if not tokens:
                    continue
                rows.append(np.array([int(tok) for tok in tokens], dtype='int'))
                # The position of the file in `paths` is the class of the row.
                labels.append(digit)
    if not rows:
        raise ValueError("no samples found in the given files")
    pixels = np.vstack(rows)
    truth = np.zeros((len(labels), len(paths)), dtype='int')
    truth[np.arange(len(labels)), labels] = 1
    return pixels, truth


def load_data(data_dir='data/mnist'):
    """
    Loads the MNIST dataset. Reads the training files and creates matrices.

    Every digit i in 0..9 is read from ``train<i>.txt`` and ``test<i>.txt``
    inside ``data_dir``; each line of a file holds the whitespace-separated
    integer pixel values of one image. Each file is read once, producing
    both the pixel rows and their labels.

    :param data_dir: directory with the text files; defaults to the
                     location used by the lab.
    :return: train_data: the matrix with the training data, divided by 255
    test_data: the matrix with the data that will be used for testing,
                        divided by 255
    train_truth: the matrix consisting of one
                        hot vectors on each row(ground truth for training)
    test_truth: the matrix consisting of one
                        hot vectors on each row(ground truth for testing)
    :raises ValueError: if the train or the test files contain no samples.
    """
    train_files = ['%s/train%d.txt' % (data_dir, i) for i in range(10)]
    test_files = ['%s/test%d.txt' % (data_dir, i) for i in range(10)]

    # load train data in N*D array (60000x784 for MNIST)
    #                              divided by 255 to achieve normalization
    train_pixels, train_truth = _read_digit_files(train_files)
    train_data = train_pixels / 255.0
    print("Train data array size:  {}".format(train_data.shape))

    # load test data in N*D array (10000x784 for MNIST)
    #                             divided by 255 to achieve normalization
    test_pixels, test_truth = _read_digit_files(test_files)
    test_data = test_pixels / 255.0
    print("Test data array size:  {}".format(test_data.shape))

    print("Train truth array size:  {}".format(train_truth.shape))
    print("Test truth array size:  {}".format(test_truth.shape))
    return train_data, test_data, train_truth, test_truth

In [None]:
# Rebind the data names to the ten-digit data returned by load_data() for the Naive Bayes cells below.
X_train, X_test, y_train, y_test = load_data()

In [None]:
def softmax(y):
    """Apply the softmax function to every row of the 2-D array ``y``.

    The maximum of each row is subtracted before exponentiating, which
    leaves the result unchanged but avoids overflow for large entries.

    :param y: array of shape (N x K).
    :return: array of the same shape whose rows are non-negative and sum to 1.
    """
    row_peak = y.max(axis=1, keepdims=True)
    exps = np.exp(y - row_peak)
    row_total = exps.sum(axis=1, keepdims=True)
    return exps / row_total

In [None]:
def ml_nb_train(X, y):
    """
    Trains Naive Bayes classifier for binary input data.

    Maximum-likelihood estimates of a Bernoulli Naive Bayes model:
    ``m[k, d]`` is the mean of feature d over the training rows of class k,
    and ``pc[k]`` is the fraction of training rows that belong to class k.

    :param X: (N x D) matrix with the training data, entries in [0, 1].
    :param y: (N x K) matrix with the one-hot ground truth of ``X``.
    :return: ``m``, a (K x D) matrix with the Bernoulli parameter of every
             class and feature, and ``pc``, a length-K vector with the
             class priors. A class with no training rows gets an all-zero
             row in ``m`` and a prior of 0.
    :raises ValueError: if ``X`` is empty or ``X`` and ``y`` disagree on N.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("X must contain at least one row")
    if y.shape[0] != n_samples:
        raise ValueError("X and y must have the same number of rows")

    # N_k: number of training rows per class.
    class_counts = y.sum(axis=0)
    # y.T.dot(X) adds up, for each class, the feature vectors of its rows.
    feature_sums = y.T.dot(X)
    # Guard the division so a class without samples does not produce nan.
    m = feature_sums / np.maximum(class_counts, 1.0)[:, np.newaxis]
    pc = class_counts / float(n_samples)
    return m, pc

In [None]:
def log_bernoulli(X, mu):
    """Log-likelihood of every row of ``X`` under independent Bernoullis.

    For each row x this computes
    ``sum_d x_d * ln(mu_d) + (1 - x_d) * ln(1 - mu_d)``.

    :param X: (N x D) matrix with entries in [0, 1].
    :param mu: Bernoulli parameters, broadcastable against ``X`` (e.g. a
               length-D vector). The caller's array is not modified.
    :return: length-N vector with one log-likelihood per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    # Keep mu strictly inside (0, 1) so both logarithms stay finite.
    # 1 - 1e-100 rounds to exactly 1.0 in double precision, so the upper
    # bound has to be a representable value below 1.
    lowest = 1e-100
    highest = 1.0 - np.finfo(float).eps
    # np.clip returns a new array, leaving the argument untouched.
    mu = np.clip(np.asarray(mu, dtype=float), lowest, highest)
    logPr = np.sum(X * np.log(mu) + (1 - X) * np.log(1 - mu), axis=1)
    return logPr
    

In [None]:
def ml_nb_test(m, pc, X_test):
    """Predict the class of every row of ``X_test`` with Bernoulli Naive Bayes.

    For each class k the log joint ``ln p(x | C_k) + ln P(C_k)`` is
    evaluated and the class with the highest value is chosen. The evidence
    P(x) is the same for every class and is therefore left out.

    :param m: (K x D) matrix with the Bernoulli parameter of every class
              and feature.
    :param pc: length-K vector with the class priors.
    :param X_test: (N x D) matrix with entries in [0, 1].
    :return: ``ttest``, a length-N vector with the predicted class index of
             every row. On a tie the lowest class index wins.
    """
    X_test = np.asarray(X_test, dtype=float)

    # Keep the parameters strictly inside (0, 1) so both logarithms stay
    # finite; the upper bound must be a representable value below 1.
    lowest = 1e-100
    highest = 1.0 - np.finfo(float).eps
    mu = np.clip(np.asarray(m, dtype=float), lowest, highest)

    # A prior of 0 legitimately maps to -inf (that class can never win),
    # so the divide-by-zero warning of np.log is silenced here.
    with np.errstate(divide='ignore'):
        log_prior = np.log(np.asarray(pc, dtype=float))

    # Matrix products evaluate sum_d x_d ln(mu_kd) + (1 - x_d) ln(1 - mu_kd)
    # for every (row, class) pair at once; the result has shape (N x K).
    log_joint = (X_test.dot(np.log(mu).T)
                 + (1.0 - X_test).dot(np.log(1.0 - mu).T)
                 + log_prior)
    ttest = np.argmax(log_joint, axis=1)
    return ttest

In [None]:
# Fit the Naive Bayes parameters on the training data; the plotting cells below index m per class (m[k]) and reshape each row to 28x28.
m, pc = ml_nb_train(X_train, y_train)

In [None]:
# Predict the test set; the result is compared element-wise with argmax class indices below, so it is presumably one class index per test row.
ttest= ml_nb_test(m, pc, X_test)

In [None]:
# Collapse each row of the test ground truth to the index of its largest entry.
T_true = np.argmax(y_test, axis=1)

In [None]:
# Fraction of misclassified test rows, computed in two equivalent ways
# (summing the boolean mask and counting its non-zero entries).
# Single-argument print(...) works on both Python 2 and Python 3, and
# float(...) keeps the division a true division on either version.
print(np.sum(T_true != ttest) / float(X_test.shape[0]))
print(np.count_nonzero(T_true != ttest) / float(X_test.shape[0]))

In [None]:
# Draw the parameter vector of each of the ten classes as a 28x28 image,
# one panel per class in a single row.
f, ax = plt.subplots(1, 10)
f.set_figwidth(15)
f.set_figheight(15)
for k, panel in enumerate(ax):
    # Scale the values by 255 before handing them to PIL.
    im = Image.fromarray(m[k].reshape(28, 28) * 255)
    panel.axis('off')
    panel.imshow(im)

In [None]:
# 10x10 grid: row k shows ten images obtained by multiplying the parameter
# vector of class k element-wise with fresh uniform random noise.
# NOTE(review): presumably meant to visualise samples from the model - confirm.
f, ax = plt.subplots(10, 10)
f.set_figwidth(15)
f.set_figheight(15)
# ndenumerate walks the grid row by row, so the random draws happen in the
# same order as with nested k / i loops.
for (k, i), panel in np.ndenumerate(ax):
    noisy = m[k] * np.random.rand(784)
    im = Image.fromarray(noisy.reshape(28, 28) * 255)
    panel.axis('off')
    panel.imshow(im)