In [1]:
import pandas as pd
import collections
import numpy as np
import matplotlib.pyplot as plt
np.seterr(divide='ignore', invalid='ignore')

from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

## Define functions

The function for training `QDA` :
- takes the inputs : $X_{app}$ (training input matrix $n, p$), $z_{app}$ (binary class indicator matrix $n, K$), $m_{prior}$ (Gaussian prior expectation matrix $K, p$), $df_{exp}$ (Gaussian prior shrinkage parameter), $S_{prior}$ (covariance matrix $p, p, K$ for the inverse -Wishart prior), $df_{cov}$ (degree of freedom of the inverse-Wishart prior)
- provides : $pi$ (vector $K, 1$ of class proportions), $mu$ (estimated expectation matrix $K, p$), $Sig$ (estimated covariance matrices $p, p, K$)

In [2]:
def QDA(Xapp, zapp, mprior, df_exp, Sprior, df_cov):
    n, p, K =  Xapp.shape[0], Xapp.shape[1], zapp.shape[1]
    pi = np.zeros(K)
    mu = np.zeros((K, p))
    B = np.zeros((p, p, n))
    Sig = np.zeros((p, p, K))
    
    pi = np.mean(zapp, axis=0)
    for j in range(K):
        mu[j, :] = (sum(zapp[i, j] * Xapp[i, :] for i in range(n)) 
                    + df_exp * mprior[j, :]) / (sum(zapp[:, j]) + df_exp)
        
        for i in range(n):
            temp = (Xapp[i, :] - mprior[j, :]).reshape((p, -1))
            B[:, :, i] = temp @ temp.T
        
        Sig[:, :, j] = (sum((zapp[i, j] * B[:, :, i]) for i in range(n)) + 
                        df_exp * (mu[j, :] - mprior[j, :]) @ (mu[j, :] - mprior[j, :]).T + 
                        Sprior[:, :, j].T) / (sum(zapp[:, j]) + df_cov + p + 2)
    return pi, mu, Sig

In [3]:
def QDA_without_prior(Xapp, zapp):
    n, p, K =  Xapp.shape[0], Xapp.shape[1], zapp.shape[1]
    pi = np.zeros(K)
    mu = np.zeros((K, p))
    B = np.zeros((p, p, n))
    Sig = np.zeros((p, p, K))
    
    pi = np.mean(zapp, axis=0)
    for j in range(K):
        mu[j, :] = sum(zapp[i, j] * Xapp[i, :] for i in range(n))
        
        for i in range(n):
            temp = (Xapp[i, :] - mu[j, :]).reshape((p, -1))
            
            B[:, :, i] = temp @ temp.T
        
        Sig[:, :, j] = sum((zapp[i, j] * B[:, :, i]) for i in range(n)) / sum(zapp[:, j])
    return pi, mu, Sig

The function for evaluating test set `evaluation` :
- takes the inputs : $X_{tst}$ (test input matrix $n, p$), $pi$ (vector $K, 1$ of class proportions), $mu$ (estimated expectation matrix $K, p$), $Sig$ (estimated covariance matrices $p, p, K$)
- provides : $prob$ (matrix $n, K$ of estimated class posterior probabilities) and $pred$ (vector $n, 1$ of corresponding decisions)

In [4]:
def evaluation(Xtst, pi, mu, Sig):
    n, p, K = Xtst.shape[0], Xtst.shape[1], len(pi)
    prob = np.zeros((n, K))
    pred = np.zeros(n)
    
    f = lambda x, mu, Sig: np.linalg.det(Sig)**(-1/2) * np.exp((-1/2) *(x - mu).T
                                                               @ np.linalg.inv(Sig)
                                                               @ (x - mu))
   
    for j in range(K):
        for i in range(n):
            total = sum(pi[k] * f(Xtst[i, :], mu[k, :], Sig[:, :, k]) for k in range(K))
            prob[i, j] = pi[j] * f(Xtst[i, :], mu[j, :], Sig[:, :, j]) / total
        
    pred = np.argmax(prob, axis=1)
    
    return prob, pred

We define a `data_processing` function to test different dataset. The function does respectively :
- Read the dataset by the `pandas` library and transform it to the one in form of the `numpy` library in order to facilitate our next works.
- Apply PCA filtering process to the dataset if necessary.
- Sort all classes of the dataset and display their enumeration
- Separate the dataset into training and test ones by `cut_off` coefficient predefined at `0.9`
- Calculate the prior expectation and variance based on the training set

In [5]:
def data_processing(file_url, n_components_pca=None, test_size=0.3):
    data = pd.read_csv(file_url, header=0)
    data = data.to_numpy()
    
    if n_components_pca is not None:
        data_y = data[:, -1]
        data_X = data[:, :-1]
        pca = PCA(n_components=n_components_pca)
        data_X = pca.fit_transform(data_X)
        data = np.hstack((data_X, data_y.reshape((-1,1))))
    
    classes = sorted(collections.Counter(list(data[:, -1])).keys())
    classes_with_index = {}
    
    K = len(classes)
    n, p = data.shape[0], data.shape[1] - 1
    for i_class in range(K):
        classes_with_index[i_class] = classes[i_class]
        data[data[:, -1] == classes[i_class], -1] = i_class
    print("Ours classes are enumerated as bellow : \n", classes_with_index)
    
    if test_size > 1 or test_size < 0:
        raise IndexError("invalid value of test_size")
    else:
        Xapp, Xtst, yapp, ytst = train_test_split(data[:, :-1], data[:, -1], test_size=test_size)

        Zapp = np.zeros((len(yapp), K))
        Ztst = np.zeros((len(ytst), K))
        for i in range(len(yapp)):
            Zapp[i, int(yapp[i])] = 1
        for i in range(len(ytst)):
            Ztst[i, int(ytst[i])] = 1
    
    mprior = np.zeros((K, p))
    Sprior = np.zeros((p, p, K))
    for j in range(K):
        temp = data[data[:, -1] == j, :-1]
        temp = temp.astype(float)
        mprior[j, :] = temp.mean(axis = 0)
        Sprior[:, :, j] = np.cov(temp, rowvar=False)
    return Xapp, Zapp, Xtst, Ztst, mprior, Sprior

## Optdigits test

The `optdigits` dataset has many zero elements that make itselft correlated. That's why the result from the inital training datatset is badly evaluated. We then apply a PCA filtering process with number of principal components depending on the `cut_off` coefficient (rank of training set) in order to eliminate the correlated ones. As a result, we get much better performance. <br>
For the choice of the shrinkage parameter $df_{exp}$ and the degree of freedom $df_{co}v$, we a priori choose $df_{exp} = 10$ and $df_{cov} = 300$ for this dataset and also the next ones.

In [6]:
df_exp_opt = 0.1
df_cov_opt = 65

Xapp_opt, Zapp_opt, Xtst_opt, Ztst_opt, mprior_opt, Sprior_opt = data_processing(
                                            "dataset/optdigits.csv", n_components_pca=20)
pi_opt, mu_opt, Sig_opt = QDA(Xapp_opt, Zapp_opt, mprior_opt, df_exp_opt, Sprior_opt, df_cov_opt)
prob_opt, pred_opt = evaluation(Xtst_opt, pi_opt, mu_opt, Sig_opt)

print('\n\n', confusion_matrix(np.argmax(Ztst_opt, axis=1), pred_opt))
print('\n\n', classification_report(np.argmax(Ztst_opt, axis=1), pred_opt))
print('\n\n', precision_score(np.argmax(Ztst_opt, axis=1), pred_opt, average='weighted'))

Ours classes are enumerated as bellow : 
 {0: 0.0, 1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0, 7: 7.0, 8: 8.0, 9: 9.0}


 [[154   0   0   0   0   1   0   0   0   0]
 [  0 161   0   1   1   0   0   0   0   1]
 [  0   1 180   1   0   0   0   0   0   1]
 [  0   0   0 165   0   0   0   1   0   0]
 [  0   0   0   0 170   0   0   0   0   2]
 [  0   0   0   1   1 149   0   0   2   3]
 [  0   0   0   0   1   0 180   0   0   0]
 [  0   0   0   0   0   0   0 175   0   0]
 [  0   2   0   0   0   0   0   0 159   0]
 [  0   0   0   0   1   0   0   1   2 169]]


               precision    recall  f1-score   support

           0       1.00      0.99      1.00       155
           1       0.98      0.98      0.98       164
           2       1.00      0.98      0.99       183
           3       0.98      0.99      0.99       166
           4       0.98      0.99      0.98       172
           5       0.99      0.96      0.97       156
           6       1.00      0.99      1.00       181
       

In [7]:
pi_opt_2, mu_opt_2, Sig_opt_2 = QDA_without_prior(Xapp_opt, Zapp_opt)
prob_opt_2, pred_opt_2 = evaluation(Xtst_opt, pi_opt_2, mu_opt_2, Sig_opt_2)

print('\n\n', confusion_matrix(np.argmax(Ztst_opt, axis=1), pred_opt_2))
print('\n\n', classification_report(np.argmax(Ztst_opt, axis=1), pred_opt_2))
print('\n\n', precision_score(np.argmax(Ztst_opt, axis=1), pred_opt_2, average='weighted'))



 [[154   0   0   0   0   1   0   0   0   0]
 [  0 160   0   1   1   0   0   0   1   1]
 [  0   2 178   0   0   1   0   1   0   1]
 [  0   0   0 166   0   0   0   0   0   0]
 [  0   0   0   0 169   0   0   0   0   3]
 [  0   0   0   2   0 147   0   1   2   4]
 [  0   0   0   0   0   1 179   0   0   1]
 [  0   1   0   0   0   0   0 174   0   0]
 [  0   3   0   1   0   0   0   0 157   0]
 [  0   0   0   0   1   0   0   1   2 169]]


               precision    recall  f1-score   support

           0       1.00      0.99      1.00       155
           1       0.96      0.98      0.97       164
           2       1.00      0.97      0.99       183
           3       0.98      1.00      0.99       166
           4       0.99      0.98      0.99       172
           5       0.98      0.94      0.96       156
           6       1.00      0.99      0.99       181
           7       0.98      0.99      0.99       175
           8       0.97      0.98      0.97       161
           9       0.9

df_exp_opt = 0.1
df_cov_opt = 65
cuts_off = np.array([0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
prediction = np.zeros(7)
prediction_without_prior = np.zeros(7)

for i in range(len(cuts_off)):
    Xapp_opt, Zapp_opt, Xtst_opt, Ztst_opt, mprior_opt, Sprior_opt = data_processing(
                                            "dataset/optdigits.csv", n_components_pca=10, cut_off=cuts_off[i])
    
    pi_opt, mu_opt, Sig_opt = QDA(Xapp_opt, Zapp_opt, mprior_opt, df_exp_opt, Sprior_opt, df_cov_opt)
    prob_opt, pred_opt = evaluation(Xtst_opt, pi_opt, mu_opt, Sig_opt)
    prediction[i] = precision_score(np.argmax(Ztst_opt, axis=1), pred_opt, average='weighted')
    
    pi_opt_2, mu_opt_2, Sig_opt_2 = QDA_without_prior(Xapp_opt, Zapp_opt)
    prob_opt_2, pred_opt_2 = evaluation(Xtst_opt, pi_opt_2, mu_opt_2, Sig_opt_2)
    prediction_without_prior[i] = precision_score(np.argmax(Ztst_opt, axis=1), pred_opt_2, average='weighted')

plt.plot(cuts_off, prediction)
plt.plot(cuts_off, prediction_without_prior)

In [8]:
Xapp_pag, Zapp_pag, Xtst_pag, Ztst_pag, mprior_pag, Sprior_pag = data_processing("dataset/page-blocks.csv")

df_exp_pag = 0.1
df_cov_pag = 65

pi_pag, mu_pag, Sig_pag = QDA(Xapp_pag, Zapp_pag, mprior_pag, df_exp_pag, Sprior_pag, df_cov_pag)
prob_pag, pred_pag = evaluation(Xtst_pag, pi_pag, mu_pag, Sig_pag)

print('\n\n')
print(confusion_matrix(np.argmax(Ztst_pag, axis=1), pred_pag))
print('\n\n')
print(classification_report(np.argmax(Ztst_pag, axis=1), pred_pag))
print('\n\n')
print(precision_score(np.argmax(Ztst_pag, axis=1), pred_pag, average='weighted'))

Ours classes are enumerated as bellow : 
 {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0}



[[424   5   0   3  18]
 [ 10  56   0   6   0]
 [  0   0   4   0   0]
 [  0   0   0  16   1]
 [  1   0   1   0   3]]



              precision    recall  f1-score   support

           0       0.97      0.94      0.96       450
           1       0.92      0.78      0.84        72
           2       0.80      1.00      0.89         4
           3       0.64      0.94      0.76        17
           4       0.14      0.60      0.22         5

    accuracy                           0.94      1642
   macro avg       0.74      0.73      0.71      1642
weighted avg       0.95      0.94      0.94      1642




0.9499717966943255


In [9]:
pi_pag_2, mu_pag_2, Sig_pag_2 = QDA_without_prior(Xapp_pag, Zapp_pag)
prob_pag_2, pred_pag_2 = evaluation(Xtst_pag, pi_pag_2, mu_pag_2, Sig_pag_2)


print('\n\n', confusion_matrix(np.argmax(Ztst_pag, axis=1), pred_pag_2))
print('\n\n', classification_report(np.argmax(Ztst_pag, axis=1), pred_pag_2))
print('\n\n', precision_score(np.argmax(Ztst_pag, axis=1), pred_pag_2, average='weighted'))



 [[1380   26    2   22   32]
 [   6   83    0   16    5]
 [   0    0    9    0    0]
 [   0    0    0   29    4]
 [  12    0    0    1   15]]


               precision    recall  f1-score   support

           0       0.99      0.94      0.97      1462
           1       0.76      0.75      0.76       110
           2       0.82      1.00      0.90         9
           3       0.43      0.88      0.57        33
           4       0.27      0.54      0.36        28

    accuracy                           0.92      1642
   macro avg       0.65      0.82      0.71      1642
weighted avg       0.95      0.92      0.93      1642



 0.9475484770587973


## Satimage test

In [10]:
Xapp_sat, Zapp_sat, Xtst_sat, Ztst_sat, mprior_sat, Sprior_sat = data_processing("dataset/satimage.csv",
                                                                                n_components_pca=30)

df_exp_sat = 0.1
df_cov_sat = 38

pi_sat, mu_sat, Sig_sat = QDA(Xapp_sat, Zapp_sat, mprior_sat, df_exp_sat, Sprior_sat, df_cov_sat)
prob_sat, pred_sat = evaluation(Xtst_sat, pi_sat, mu_sat, Sig_sat)

print('\n\n')
print(confusion_matrix(np.argmax(Ztst_sat, axis=1), pred_sat))
print('\n\n')
print(classification_report(np.argmax(Ztst_sat, axis=1), pred_sat))
print('\n\n')
print(precision_score(np.argmax(Ztst_sat, axis=1), pred_sat, average='weighted'))

Ours classes are enumerated as bellow : 
 {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0, 4: 5.0, 5: 7.0}



[[442   1   6   0   3   0]
 [  0 214   0   0   3   0]
 [  5   2 361  12   2   7]
 [  0   5  53  50   3  67]
 [  4  10   0   2 189  12]
 [  1   5  17  10  15 428]]



              precision    recall  f1-score   support

           0       0.98      0.98      0.98       452
           1       0.90      0.99      0.94       217
           2       0.83      0.93      0.87       389
           3       0.68      0.28      0.40       178
           4       0.88      0.87      0.88       217
           5       0.83      0.90      0.86       476

    accuracy                           0.87      1929
   macro avg       0.85      0.82      0.82      1929
weighted avg       0.86      0.87      0.86      1929




0.8640099226624413


In [11]:
pi_sat_2, mu_sat_2, Sig_sat_2 = QDA_without_prior(Xapp_sat, Zapp_sat)
prob_sat_2, pred_sat_2 = evaluation(Xtst_sat, pi_sat_2, mu_sat_2, Sig_sat_2)


print('\n\n', confusion_matrix(np.argmax(Ztst_sat, axis=1), pred_sat_2))
print('\n\n', classification_report(np.argmax(Ztst_sat, axis=1), pred_sat_2))
print('\n\n', precision_score(np.argmax(Ztst_sat, axis=1), pred_sat_2, average='weighted'))



 [[442   1   4   1   2   2]
 [  0 212   0   0   5   0]
 [ 10   2 351  17   6   3]
 [  0   7  47  64   6  54]
 [ 15  12   1   8 165  16]
 [  1   8  22  19  15 411]]


               precision    recall  f1-score   support

           0       0.94      0.98      0.96       452
           1       0.88      0.98      0.92       217
           2       0.83      0.90      0.86       389
           3       0.59      0.36      0.45       178
           4       0.83      0.76      0.79       217
           5       0.85      0.86      0.85       476

    accuracy                           0.85      1929
   macro avg       0.82      0.81      0.81      1929
weighted avg       0.84      0.85      0.84      1929



 0.8425287154985674


## Segment test

In [16]:
Xapp_seg, Zapp_seg, Xtst_seg, Ztst_seg, mprior_seg, Sprior_seg = data_processing("dataset/segment.csv",
                                                                                n_components_pca=18)

df_exp_seg = 0.1
df_cov_seg = 20

pi_seg, mu_seg, Sig_seg = QDA(Xapp_seg, Zapp_seg, mprior_seg, df_exp_seg, Sprior_seg, df_cov_seg)
prob_seg, pred_seg = evaluation(Xtst_seg, pi_seg, mu_seg, Sig_seg)

print('\n\n')
print(confusion_matrix(np.argmax(Ztst_seg, axis=1), pred_seg))
print('\n\n')
print(classification_report(np.argmax(Ztst_seg, axis=1), pred_seg))
print('\n\n')
print(precision_score(np.argmax(Ztst_seg, axis=1), pred_seg, average='weighted'))

Ours classes are enumerated as bellow : 
 {0: 'brickface', 1: 'cement', 2: 'foliage', 3: 'grass', 4: 'path', 5: 'sky', 6: 'window'}



[[106   0   1   0   0   0   2]
 [  0 104   0   0   0   0   5]
 [  1   3  57   0   0   0  28]
 [  0   0   0  90   0   0   0]
 [  0   5   0   0  92   0   0]
 [  0   0   0   0   0  94   0]
 [  1   9   6   0   0   0  89]]



              precision    recall  f1-score   support

           0       0.98      0.97      0.98       109
           1       0.86      0.95      0.90       109
           2       0.89      0.64      0.75        89
           3       1.00      1.00      1.00        90
           4       1.00      0.95      0.97        97
           5       1.00      1.00      1.00        94
           6       0.72      0.85      0.78       105

    accuracy                           0.91       693
   macro avg       0.92      0.91      0.91       693
weighted avg       0.92      0.91      0.91       693




0.918175988629886


In [17]:
pi_seg_2, mu_seg_2, Sig_seg_2 = QDA_without_prior(Xapp_seg, Zapp_seg)
prob_seg_2, pred_seg_2 = evaluation(Xtst_seg, pi_seg_2, mu_seg_2, Sig_seg_2)


print('\n\n', confusion_matrix(np.argmax(Ztst_seg, axis=1), pred_seg_2))
print('\n\n', classification_report(np.argmax(Ztst_seg, axis=1), pred_seg_2))
print('\n\n', precision_score(np.argmax(Ztst_seg, axis=1), pred_seg_2, average='weighted'))



 [[107   0   1   0   0   0   1]
 [  0  99   2   0   0   0   8]
 [  1   3  55   2   0   0  28]
 [  0   0   0  90   0   0   0]
 [  0   3   0   0  94   0   0]
 [  0   0   0   0   0  94   0]
 [  0   8   7   1   0   0  89]]


               precision    recall  f1-score   support

           0       0.99      0.98      0.99       109
           1       0.88      0.91      0.89       109
           2       0.85      0.62      0.71        89
           3       0.97      1.00      0.98        90
           4       1.00      0.97      0.98        97
           5       1.00      1.00      1.00        94
           6       0.71      0.85      0.77       105

    accuracy                           0.91       693
   macro avg       0.91      0.90      0.90       693
weighted avg       0.91      0.91      0.91       693



 0.9106168097112367


In [None]:
# 0.9766310415063318 1;p+20 ll
# 0.9758926103474012 10;p+20
# 0.9671482548426473 10;p+300 
# v reprensent the least informative -> v high -> cov lower -> more "confident" ????
# high df_cov -> Sig =~~ sampling covariance matrix (in this case Sprior)