## <font color='green'> <div align="center">In the name of God </div></font>

### <font color='red'> Author: Sayed Kamaledin Ghiasi-Shirazi <a href="http://profsite.um.ac.ir/~k.ghiasi">(http://profsite.um.ac.ir/~k.ghiasi)</a> </font>

# Experiments on UCI datasets

In [1]:
import time

import matplotlib as mpl
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import scipy.stats as st
import sklearn
import sklearn.cluster
from matplotlib.patches import Ellipse

In [2]:
from LinearDiscriminantAnalysis import LinearDiscriminantAnalysis
from LDATrainingDataPreparation import TrainingData
from NearestNeighbor import NearestNeighbor

In [3]:
# Number of cross-validation folds iterated over by compute_average_accuracy.
nFolds = 10
# Value assigned to the KMeans clusterer's max_iter inside compute_accuracy.
maxVqIteration = 100

In [4]:
def loadDataset(dataset):
    """Load a dataset by name and return its features and integer labels.

    For every name except 'singular_iris' the file
    '../datasets/<name>/<name>.processed' is read. Its layout is:

        line 1: number of samples N
        line 2: number of features dim
        next N lines: dim tab-separated feature values followed by an
                      integer class label

    Each feature column is then standardized to zero mean and unit
    (population) standard deviation. A column with zero standard deviation
    is only centered, so it becomes all zeros instead of NaN.

    'singular_iris' is the standardized 'iris' data with the label column
    appended, unscaled, as one extra feature (presumably to produce a
    degenerate variant of iris, as the name suggests).

    Parameters
    ----------
    dataset : str
        Dataset name; also the directory and file stem under '../datasets/'.

    Returns
    -------
    X : numpy.ndarray of float, shape (N, dim) or (N, dim + 1) for 'singular_iris'
    y : numpy.ndarray of int, shape (N,)
    """
    if dataset == 'singular_iris':
        X, y = loadDataset('iris')
        X = np.hstack([X, np.atleast_2d(y).T])
    else:
        file = '../datasets/' + dataset + '/' + dataset + '.processed'
        # The context manager closes the file even if parsing raises.
        with open(file, 'r') as f:
            N = int(f.readline())
            dim = int(f.readline())
            X = np.zeros([N, dim])
            y = np.zeros([N], dtype=int)
            for n in range(N):
                features = f.readline().strip('\n').split('\t')
                X[n, :] = [float(value) for value in features[0:dim]]
                y[n] = int(features[dim])
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # Constant columns have std == 0; dividing by 1 leaves them centered
        # at zero rather than turning them into NaN (0 / 0).
        std[std == 0] = 1.0
        X = (X - mean) / std

    return X, y

In [5]:
def compute_accuracy(XTrain, yTrain, XTest, yTest, K, solver, metric_learning, S1, S2, regularization, min_svd):
    """Fit the LDA feature extractor on one train/test split and score it.

    Steps:
      1. Wrap the training data in TrainingData and split it into subclasses
         using a KMeans clusterer whose max_iter is maxVqIteration.
      2. Build LinearDiscriminantAnalysis with the given options and fit its
         feature extractor.
      3. Evaluate lda.objective2(). For the 'orthogonal_centroid' solver the
         objective is evaluated with lda.solver temporarily set to 'eigen'.
      4. Run NearestNeighbor on features from lda.transform and, separately,
         on features from lda.transformByProjection.

    Parameters
    ----------
    XTrain, yTrain : training samples and labels.
    XTest, yTest : test samples and labels.
    K : passed unchanged to TrainingData.findSubclasses.
    solver, metric_learning, S1, S2, regularization, min_svd :
        passed unchanged to the LinearDiscriminantAnalysis constructor.

    Returns
    -------
    obj : value returned by lda.objective2().
    accNN : result of NearestNeighbor on lda.transform features
        (presumably a classification accuracy; see NearestNeighbor).
    accProjNN : result of NearestNeighbor on lda.transformByProjection
        features.
    """
    td = TrainingData(XTrain, yTrain)

    clusAlg = sklearn.cluster.KMeans(max_iter=maxVqIteration)
    td.findSubclasses(K, clusAlg)

    lda = LinearDiscriminantAnalysis(td, solver, metric_learning, S1, S2, regularization, min_svd)
    lda.fitFeatureExtractor()

    if solver == 'orthogonal_centroid':
        # The objective is evaluated as if the solver were 'eigen'. The
        # original code tried to undo this with `lda.solver == true_solver`,
        # a comparison with no effect, so the transforms below ran with the
        # wrong solver. Restore it by assignment, even if objective2 raises.
        lda.solver = 'eigen'
        try:
            obj = lda.objective2()
        finally:
            lda.solver = solver
    else:
        obj = lda.objective2()

    XTrainFtr = lda.transform(XTrain)
    XTestFtr = lda.transform(XTest)
    accNN = NearestNeighbor(XTrainFtr, yTrain, XTestFtr, yTest)

    XTrainFtr = lda.transformByProjection(XTrain)
    XTestFtr = lda.transformByProjection(XTest)
    accProjNN = NearestNeighbor(XTrainFtr, yTrain, XTestFtr, yTest)

    return obj, accNN, accProjNN

In [6]:
def compute_average_accuracy (X, y, K, solver, metric_learning, S1, S2):
    """Run compute_accuracy on nFolds class-wise train/test splits.

    The rows of each class are held in their incoming order. For fold f and a
    class with n rows, the test window starts at f * (n // nFolds) and is
    ceil(n / nFolds) rows wide; the remaining rows of that class go to the
    training set. The global NumPy random seed is set to 1 once before the
    fold loop.

    Parameters
    ----------
    X : 2-D array of samples (rows) by features (columns).
    y : 1-D array of non-negative integer class labels.
    K, solver, metric_learning, S1, S2 : forwarded to compute_accuracy, which
        is also given regularization 0.0 and min_svd 0.00001.

    Returns
    -------
    obj, accNN, accProjNN : three arrays of length nFolds holding the
        per-fold values returned by compute_accuracy.
    """
    n_samples, n_features = X.shape
    n_classes = max(y) + 1
    class_sizes = np.bincount(y)

    # Pack the rows of each class into its own zero-padded slab.
    per_class = np.zeros((n_classes, max(class_sizes), n_features))
    for label in range(n_classes):
        per_class[label, :class_sizes[label], :] = X[y == label, :]

    fold_obj = np.zeros((nFolds,))
    fold_nn = np.zeros((nFolds,))
    fold_proj_nn = np.zeros((nFolds,))

    np.random.seed(1)
    for fold in range(nFolds):
        n_test = np.sum((class_sizes + nFolds - 1) // nFolds)
        n_train = n_samples - n_test
        XTrain = np.zeros((n_train, n_features))
        yTrain = np.zeros((n_train,))
        XTest = np.zeros((n_test, n_features))
        yTest = np.zeros((n_test,))

        train_pos = 0
        test_pos = 0
        for label in range(n_classes):
            size = class_sizes[label]
            start = fold * (size // nFolds)
            stop = start + (size + nFolds - 1) // nFolds
            if stop > size:
                # Clamp the window to the end of the class.
                stop = size
                start -= 1

            held_out = stop - start
            kept = size - held_out

            # Training rows are everything before and after the test window.
            kept_rows = np.concatenate(
                (per_class[label, :start, :], per_class[label, stop:size, :]),
                axis=0,
            )
            XTrain[train_pos:train_pos + kept, :] = kept_rows
            yTrain[train_pos:train_pos + kept] = label
            train_pos += kept

            XTest[test_pos:test_pos + held_out, :] = per_class[label, start:stop, :]
            yTest[test_pos:test_pos + held_out] = label
            test_pos += held_out

        fold_obj[fold], fold_nn[fold], fold_proj_nn[fold] = compute_accuracy(
            XTrain, yTrain, XTest, yTest, K, solver, metric_learning, S1, S2, 0.0, 0.00001
        )

    return fold_obj, fold_nn, fold_proj_nn

In [7]:
# Run every solver configuration on every dataset and collect, per
# (dataset, method), the mean and standard deviation over folds of three
# quantities: the LDA objective, the NN result and the projection-NN result.
datasets = ['banknote', 'breast_tissue', 'forest_types',     'iris',    'leaf', 
            'rwq',      'seeds',         'urban_land_cover', 'vehicle', 'wdbc', 'singular_iris']

# Each entry is (solver, metric_learning, S1, S2). The disabled
# 'orthogonal_centroid' entry has only three fields and would need a
# metric_learning value before it could be re-enabled.
solvers =  [('svd', '', 'Sb', 'St'), 
            ('ghiasi_svd', 'none', 'Sb', 'St'), ('ghiasi_svd', 'eig', 'Sb', 'St'), 
            ('ghiasi_lstsq', 'none', 'Sb', 'Sw')] #, ('orthogonal_centroid', 'Sb', 'St')

# Table sizes follow the lists above so they cannot drift out of sync
# (the previous hard-coded 12 did not match the 11 datasets).
noDatasets = len(datasets)
no_methods = len(solvers)
mean_table = np.zeros([noDatasets, no_methods, 3])
std_table = np.zeros([noDatasets, no_methods, 3])

for i, dataset in enumerate (datasets):
    X,y = loadDataset(dataset)
    N = X.shape[0]
    # One subclass per class. A size-dependent alternative that was tried:
    #   K = np.maximum(np.bincount(y) // 20, 1)
    K = 1
    C = max(y)+1
    arrayK = np.ones (C, dtype = int) * K

    # Shuffle the samples with a fixed seed so the folds are reproducible.
    order = list(range(N))
    np.random.seed(11)
    np.random.shuffle(order)
    X = X[order,:]
    y = y[order]
    print ('Number of clusters for dataset {} is {}'.format(dataset, K))

    for j, (solver, metric_learning, S1, S2) in enumerate (solvers):
        print ('---------------------------------------------------------------------------------------')
        print ('Experiment on dataset {} using solver {} with metric_learning={}, S1={} and S2={}'.format(dataset, solver, metric_learning, S1, S2))
        obj, accNN, accProjNN = compute_average_accuracy (X, y, arrayK, solver, metric_learning, S1, S2)
        results = [('obj', obj), ('NN', accNN), ('ProjNN', accProjNN)]
        for k, (name, acc) in enumerate(results):
            if name != 'obj':
                # Report the NN results as percentages.
                acc = acc * 100
            # The printed spread is the standard error over the nFolds folds.
            print (name , '=', '%.2f'% np.mean (acc), '+-', '%.2f'% (np.std (acc)/np.sqrt(nFolds)), end='\t')
            # Round to two decimals (half up) before storing.
            mean_table[i,j, k] = (np.mean (acc)  * 10000 + 50) //100 / 100
            # NOTE(review): this stores the plain standard deviation, while the
            # line printed above shows std / sqrt(nFolds) -- confirm which one
            # the LaTeX table is meant to report.
            std_table[i,j,k] = (np.std (acc) * 10000 + 50) //100 / 100

        print ('')
    print ('***************************************************************************************')

Number of clusters for dataset banknote is 1
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver svd with metric_learning=, S1=Sb and S2=St
obj = 0.86 +- 0.00	NN = 99.57 +- 0.15	ProjNN = 99.57 +- 0.15	
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver ghiasi_svd with metric_learning=none, S1=Sb and S2=St
obj = 0.86 +- 0.00	NN = 99.57 +- 0.15	ProjNN = 99.57 +- 0.15	
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver ghiasi_svd with metric_learning=eig, S1=Sb and S2=St
obj = 0.86 +- 0.00	NN = 99.93 +- 0.07	ProjNN = 99.57 +- 0.15	
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver ghiasi_lstsq with metric_learning=none, S1=Sb and S2=Sw
obj = 0.86 +- 0.00	NN = 99.57 +- 0.

obj = 5.27 +- 0.01	NN = 75.00 +- 1.85	ProjNN = 56.39 +- 1.70	
---------------------------------------------------------------------------------------
Experiment on dataset urban_land_cover using solver ghiasi_svd with metric_learning=eig, S1=Sb and S2=St
obj = 5.28 +- 0.01	NN = 77.92 +- 1.66	ProjNN = 56.39 +- 1.70	
---------------------------------------------------------------------------------------
Experiment on dataset urban_land_cover using solver ghiasi_lstsq with metric_learning=none, S1=Sb and S2=Sw
obj = 5.28 +- 0.01	NN = 79.17 +- 1.94	ProjNN = 56.39 +- 1.70	
***************************************************************************************
Number of clusters for dataset vehicle is 1
---------------------------------------------------------------------------------------
Experiment on dataset vehicle using solver svd with metric_learning=, S1=Sb and S2=St
obj = 1.51 +- 0.00	NN = 75.93 +- 1.51	ProjNN = 75.70 +- 1.35	
---------------------------------------------------------

In [8]:
# Emit three LaTeX tables, one per quantity stored along the last axis of
# mean_table / std_table (index 0 is printed as a bare value, indices 1 and 2
# as "mean \pm std").
txt1='''
\\begin{table}[!t]
\\renewcommand{\\arraystretch}{1.0}
\\caption{UCI}
\\label{tbl:uci}
  \\centering
  \\begin{tabular}{l|c|c|c|c}
    \\hline
    \\multicolumn{1}{c|}{\\small\\textbf{Dataset}} &
    \\multicolumn{1}{c|}{\\small\\textbf{EIG-LDA}} &
    \\multicolumn{1}{c|}{\\small\\textbf{LDA++}} &
    \\multicolumn{1}{c|}{\\small\\textbf{EIG-LDA++}}&
    \\multicolumn{1}{c}{\\small{$\\mathbf{S_w^\\dagger M}$}} \\\\
    \\hline'''
txt2 = '''
    \\hline
  \\end{tabular}  
\\end{table}
'''

# Row labels with underscores escaped for LaTeX. Raw strings keep the
# backslash literal without relying on Python passing the invalid escape
# sequence '\_' through unchanged (which triggers a warning).
datasetNames = ['banknote', r'breast\_tissue', r'forest\_types',     'iris',    'leaf', 
            'rwq',      'seeds',         r'urban\_land', 'vehicle', 'wdbc', r'singular\_iris']
for k in range (3):
    print (txt1)
    print ('\n')
    for i, dataset in enumerate (datasetNames):
        print ('%s'%dataset, end=' ')
        for j in range (no_methods):
            print (' & ', end = '')
            if k == 0:
                print ('$', "{:.2f}".format(mean_table[i,j,k]), '$', end=' ')
            else:
                # '\\pm ' is the same text the invalid escape '\pm ' produced.
                print ('$', "{:.2f}".format(mean_table[i,j,k]), '\\pm ', "{:.2f}".format(std_table[i,j,k]), '$', end=' ')
        print ('\\\\')
    print (txt2)


\begin{table}[!t]
\renewcommand{\arraystretch}{1.0}
\caption{UCI}
\label{tbl:uci}
  \centering
  \begin{tabular}{l|c|c|c|c}
    \hline
    \multicolumn{1}{c|}{\small\textbf{Dataset}} &
    \multicolumn{1}{c|}{\small\textbf{EIG-LDA}} &
    \multicolumn{1}{c|}{\small\textbf{LDA++}} &
    \multicolumn{1}{c|}{\small\textbf{EIG-LDA++}}&
    \multicolumn{1}{c}{\small{$\mathbf{S_w^\dagger M}$}} \\
    \hline


banknote  & $ 0.86 $  & $ 0.86 $  & $ 0.86 $  & $ 0.86 $ \\
breast\_tissue  & $ 2.13 $  & $ 2.13 $  & $ 2.13 $  & $ 2.13 $ \\
forest\_types  & $ 1.86 $  & $ 1.86 $  & $ 1.86 $  & $ 1.86 $ \\
iris  & $ 1.19 $  & $ 1.19 $  & $ 1.19 $  & $ 1.19 $ \\
leaf  & $ 7.90 $  & $ 7.90 $  & $ 7.90 $  & $ 7.90 $ \\
rwq  & $ 0.50 $  & $ 0.50 $  & $ 0.50 $  & $ 0.50 $ \\
seeds  & $ 1.61 $  & $ 1.61 $  & $ 1.61 $  & $ 1.61 $ \\
urban\_land  & $ 5.28 $  & $ 5.27 $  & $ 5.28 $  & $ 5.28 $ \\
vehicle  & $ 1.51 $  & $ 1.51 $  & $ 1.51 $  & $ 1.51 $ \\
wdbc  & $ 0.78 $  & $ 0.78 $  & $ 0.78 $  & $ 0.78 $ \\

### <font color='red'> Author: Sayed Kamaledin Ghiasi-Shirazi <a href="http://profsite.um.ac.ir/~k.ghiasi">(http://profsite.um.ac.ir/~k.ghiasi)</a> </font>