## Lab 6: Learning an NBC with EM

### Datasets load and split

In [1]:
import numpy as np
from sklearn import metrics, datasets, cluster
import scipy.stats as st

In [2]:
digits = datasets.load_digits()
digits_data = digits.data
digits_split = int(len(digits_data)*0.7)
x_train = digits_data[:digits_split]
x_test = digits_data[digits_split:]
digits_target = digits.target
y_train = digits_target[:digits_split]
y_test = digits_target[digits_split:]
print('Training data:', len(x_train), '\nTraining Labels:', len(y_train), '\nTesting Data:', 
      len(x_test), '\nTesting Labels:', len(y_test), '\nCheck:', 
      len(digits_data) == len(x_train) + len(x_test))
print(x_train.shape)
print(y_train.shape)
x_train /= 16
x_test /= 16

Training data: 1257 
Training Labels: 1257 
Testing Data: 540 
Testing Labels: 540 
Check: True
(1257, 64)
(1257,)


### EM Algorithm

#### Initialization

In [3]:
def initialize():
    indexes = np.random.randint(len(digits_data), size=int(len(digits_data)*0.1))
    theta = dict()
    temp = dict()
    for i in indexes:
        k = digits_target[i]
        pixels = digits_data[i]
        if k not in temp:
            temp[k] = list()
        temp[k].append(pixels)
        
    for k in temp:
        prior = 0.1
        values = np.array(temp[k])
        means = np.zeros(digits_data.shape[1])
        var = np.zeros(digits_data.shape[1])
        for i in range(len(values[0])):
            means[i] = np.mean(values[:,i])
            var[i] = np.var(values[:,i]) + epsilon
        theta[k] = np.array([prior, means, var])
    return theta

classes = 10
epsilon = 0.01
theta_init = initialize()

#### E-Step

In [4]:
def E_step(X, theta):
    r = np.zeros((X.shape[0],classes)) 
    for i in range(len(X)):
        prob = np.prod([st.norm.pdf(X[i], theta[k][1], np.sqrt(theta[k][2])) for k in range(classes)], axis = 1)
        prod = [theta[k][0]*prob[k] for k in range(classes)]
        den = np.sum(prod)
        r[i,:] = prod/den    
    return r

#### M-Step

In [5]:
def M_step(X, r):
    r_k = {k:np.sum(r[:,k]) for k in range(10)}
    theta = dict()
    for k in r_k:
        prior = r_k[k]/len(X)
        means = np.sum([r[i][k]*X[i] for i in range(len(X))], axis=0)/r_k[k]
        vars_ = np.sum([r[i][k]*(X[i]**2) for i in range(len(X))], axis=0)/r_k[k] - means**2 + epsilon
        theta[k] = np.array([prior, means, vars_])
    return theta

#### Termination

In [6]:
e = E_step(x_train, theta_init)
m = M_step(x_train, e)
for i in range(100):
    e = E_step(x_train, m)
    m = M_step(x_train, e)

#### Predict Function

In [7]:
def predict(X,theta):
    probs = np.zeros((len(X), classes))
    for k in theta:
        prior = theta[k][0]
        mean = theta[k][1]
        var = theta[k][2]
        probs[:,k] = np.sum(np.log(st.norm.pdf(X, mean, var)) + np.log(prior), axis=1)
    return np.argmax(probs,axis=1)

#### Prediction Over Train Data

In [8]:
y_pred_train = predict(x_train,m)

print("Classification report EM:\n%s\n" % 
      (metrics.classification_report(y_train, y_pred_train)))
print("Confusion matrix EM:\n%s" % metrics.confusion_matrix(y_train, y_pred_train))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_train, y_pred_train)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])

Classification report EM:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       125
           1       1.00      0.60      0.75       129
           2       0.75      0.76      0.76       124
           3       0.99      0.70      0.82       130
           4       0.97      0.89      0.93       124
           5       0.98      0.63      0.76       126
           6       1.00      0.97      0.98       127
           7       0.78      1.00      0.87       125
           8       0.63      0.98      0.77       122
           9       0.62      0.84      0.72       125

    accuracy                           0.83      1257
   macro avg       0.87      0.84      0.83      1257
weighted avg       0.87      0.83      0.83      1257


Confusion matrix EM:
[[124   0   0   0   1   0   0   0   0   0]
 [  1  77  27   0   0   1   0   2  16   5]
 [  0   0  94   0   0   0   0   0  30   0]
 [  0   0   2  91   0   0   0   5  10  22]
 [  1   0   0   0 110

  import sys


#### Prediction Over Test Data

In [9]:
y_pred_test = predict(x_test,m)

print("Classification report EM:\n%s\n" % 
      (metrics.classification_report(y_test, y_pred_test)))
print("Confusion matrix EM:\n%s" % metrics.confusion_matrix(y_test, y_pred_test))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_test, y_pred_test)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])

Classification report EM:
              precision    recall  f1-score   support

           0       0.96      0.87      0.91        53
           1       1.00      0.57      0.72        53
           2       0.92      0.85      0.88        53
           3       1.00      0.58      0.74        53
           4       0.92      0.86      0.89        57
           5       0.93      0.48      0.64        56
           6       1.00      0.93      0.96        54
           7       0.71      0.89      0.79        54
           8       0.52      0.90      0.66        52
           9       0.49      0.82      0.62        55

    accuracy                           0.77       540
   macro avg       0.84      0.77      0.78       540
weighted avg       0.85      0.77      0.78       540


Confusion matrix EM:
[[46  0  0  0  4  0  0  1  2  0]
 [ 0 30  2  0  0  1  0  0  9 11]
 [ 1  0 45  0  0  0  0  0  6  1]
 [ 0  0  0 31  0  0  0  2 12  8]
 [ 0  0  0  0 49  0  0  7  1  0]
 [ 0  0  0  0  0 27  0  0  3

  import sys


#### Repair

In [10]:
def repair(y_true, y_pred):
    k_map = dict()
    for k in range(classes):
        idxs = [i for i in range(len(y_test)) if y_pred[i]==k]
        unique, counts = np.unique(y_true[idxs], return_counts=True)
        k_map[k] = unique[np.argmax(counts)]
    y_real = list()
    for y in y_pred:
        y_real.append(k_map[y])
    return y_real

In [12]:
y_pred_repair = repair(y_train, y_pred_train)
print("Classification report SKLearn K-Means:\n%s\n" % 
      (metrics.classification_report(y_train, y_pred_repair)))
print("Confusion matrix SKLearn EM:\n%s" % metrics.confusion_matrix(y_train, y_pred_repair))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_train, y_pred_repair)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])
print()

y_pred_test_repair = repair(y_test, y_pred_test)
print("Classification report SKLearn k-Means:\n%s\n" % 
      (metrics.classification_report(y_test, y_pred_test_repair)))
print("Confusion matrix SKLearn EM:\n%s" % metrics.confusion_matrix(y_test, y_pred_test_repair))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_test, y_pred_test_repair)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])

Classification report SKLearn K-Means:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       125
           1       1.00      0.60      0.75       129
           2       0.75      0.76      0.76       124
           3       0.99      0.70      0.82       130
           4       0.97      0.89      0.93       124
           5       0.98      0.63      0.76       126
           6       1.00      0.97      0.98       127
           7       0.78      1.00      0.87       125
           8       0.63      0.98      0.77       122
           9       0.62      0.84      0.72       125

    accuracy                           0.83      1257
   macro avg       0.87      0.84      0.83      1257
weighted avg       0.87      0.83      0.83      1257


Confusion matrix SKLearn EM:
[[124   0   0   0   1   0   0   0   0   0]
 [  1  77  27   0   0   1   0   2  16   5]
 [  0   0  94   0   0   0   0   0  30   0]
 [  0   0   2  91   0   0   0   5  10  22]


### SciKitLearn k-Means

#### Predict

In [13]:
k_means = cluster.KMeans(n_clusters=10).fit(x_train)
y_pred_kmeans = k_means.predict(x_train)
y_pred_kmeans_real = repair(y_train, y_pred_kmeans)
print("Classification report SKLearn K-Means:\n%s\n" % 
      (metrics.classification_report(y_train, y_pred_kmeans_real)))
print("Confusion matrix SKLearn EM:\n%s" % metrics.confusion_matrix(y_train, y_pred_kmeans_real))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_train, y_pred_kmeans_real)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])
print()

y_pred_kmeans_test = k_means.predict(x_test)
y_pred_kmeans_test_real = repair(y_test, y_pred_kmeans_test)
print("Classification report SKLearn k-Means:\n%s\n" % 
      (metrics.classification_report(y_test, y_pred_kmeans_test_real)))
print("Confusion matrix SKLearn EM:\n%s" % metrics.confusion_matrix(y_test, y_pred_kmeans_test_real))
print()
h_c_v = metrics.homogeneity_completeness_v_measure(y_test, y_pred_kmeans_test_real)
print('Homogenity:',h_c_v[0])
print('Completeness:',h_c_v[1])
print('V-measure:',h_c_v[2])

Classification report SKLearn K-Means:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       125
           1       0.61      0.30      0.40       129
           2       0.79      0.85      0.82       124
           3       0.93      0.87      0.90       130
           4       0.99      0.90      0.94       124
           5       0.90      0.75      0.81       126
           6       0.98      0.98      0.98       127
           7       0.86      0.99      0.92       125
           8       0.45      0.53      0.49       122
           9       0.54      0.78      0.63       125

    accuracy                           0.79      1257
   macro avg       0.80      0.79      0.79      1257
weighted avg       0.80      0.79      0.79      1257


Confusion matrix SKLearn EM:
[[125   0   0   0   0   0   0   0   0   0]
 [  0  39  26   0   0   0   1   0  63   0]
 [  0   1 105   3   0   0   0   4  10   1]
 [  0   0   0 113   0   2   0   2   0  13]


  'precision', 'predicted', average, warn_for)
