## <font color='green'> <div align="center">In the name of God </div></font>

### <font color='red'> Author: Sayed Kamaledin Ghiasi-Shirazi <a href="http://profsite.um.ac.ir/~k.ghiasi">(http://profsite.um.ac.ir/~k.ghiasi)</a> </font>

# Experiments on UCI datasets

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import sklearn
import scipy.stats as st
import scipy.io as sio
import time
import matplotlib.image as mpimg
import matplotlib as mpl

In [2]:
from LinearDiscriminantAnalysis import LinearDiscriminantAnalysis
from LDATrainingDataPreparation import TrainingData
from NearestNeighbor import NearestNeighbor

In [3]:
# Number of cross-validation folds used by compute_average_accuracy.
nFolds = 10
# Iteration cap handed to the KMeans clusterer (its max_iter) in compute_accuracy.
maxVqIteration = 100

In [4]:
def loadDataset(dataset):
    """Load a tab-separated '.processed' dataset and standardize its features.

    The file '../datasets/<dataset>/<dataset>.processed' is read as follows:
    the first line holds the number of samples N, the second line the number
    of features dim, and each of the next N lines holds dim feature values
    followed by an integer class label, separated by tabs.

    Parameters
    ----------
    dataset : str
        Name of the dataset; used both as directory and as file base name.

    Returns
    -------
    X : numpy.ndarray of shape (N, dim), float
        Features with each column shifted to zero mean and scaled by its
        (population) standard deviation. Columns with zero standard deviation
        are only centered, so they come out as zeros instead of NaN.
    y : numpy.ndarray of shape (N,), int
        Class labels as read from the file.
    """
    file = '../datasets/' + dataset + '/' + dataset + '.processed'
    # The context manager closes the file even if parsing raises.
    with open(file, 'r') as f:
        N = int(f.readline())
        dim = int(f.readline())
        X = np.zeros ([N, dim])
        y = np.zeros ([N], dtype = int)
        for n in range (N):
            features = f.readline().strip('\n').split('\t')
            X[n,:] = [float(value) for value in features[0:dim]]
            y[n]   = int(features[dim])

    mean = np.mean(X, axis=0)
    std  = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by 1 keeps it at 0 after
    # centering instead of producing NaN/inf.
    std[std == 0] = 1.0
    X = (X - mean) / std

    return X, y

In [5]:
def compute_accuracy(XTrain, yTrain, XTest, yTest, K, solver, S1, S2, regularization, min_svd):
    """Fit the LDA feature extractor on one train/test split and score it.

    Parameters
    ----------
    XTrain, yTrain : training features and labels.
    XTest, yTest : test features and labels.
    K : passed to TrainingData.findSubclasses together with a KMeans
        clusterer (presumably the number of subclasses per class -- confirm
        against TrainingData).
    solver, S1, S2, regularization, min_svd : forwarded unchanged to the
        LinearDiscriminantAnalysis constructor.

    Returns
    -------
    obj : value returned by lda.objective2().
    accNN : result of NearestNeighbor on the features from lda.transform.
    accProjNN : result of NearestNeighbor on the features from
        lda.transformByProjection.
    """
    # Imported here because a bare `import sklearn` does not guarantee that
    # the `sklearn.cluster` submodule has been loaded.
    from sklearn.cluster import KMeans

    td = TrainingData(XTrain, yTrain)

    clusAlg = KMeans(max_iter = maxVqIteration)
    td.findSubclasses(K, clusAlg)

    lda = LinearDiscriminantAnalysis(td, solver, S1, S2, regularization, min_svd)
    lda.fitFeatureExtractor()

    if solver == 'orthogonal_centroid':
        # The objective is evaluated with the 'eigen' solver setting; the
        # previous setting is put back afterwards. The original code used
        # `==` instead of `=` here, so the override was never undone.
        true_solver = lda.solver
        lda.solver = 'eigen'
        try:
            obj = lda.objective2()
        finally:
            lda.solver = true_solver
    else:
        obj = lda.objective2()

    XTrainFtr = lda.transform (XTrain)
    XTestFtr  = lda.transform (XTest)
    accNN = NearestNeighbor (XTrainFtr, yTrain, XTestFtr, yTest)

    XTrainFtr = lda.transformByProjection(XTrain)
    XTestFtr = lda.transformByProjection(XTest)
    accProjNN = NearestNeighbor(XTrainFtr, yTrain, XTestFtr, yTest)

    return obj, accNN, accProjNN

In [6]:
def compute_average_accuracy (X, y, K, solver, S1, S2, regularization = 0.0, min_svd = 0.00001):
    """Run compute_accuracy on nFolds per-class train/test splits.

    For every class c with Ki[c] samples, fold number `fold` uses the window
    [fold * (Ki[c] // nFolds), fold * (Ki[c] // nFolds) + ceil(Ki[c] / nFolds))
    of that class (in the order the samples appear in X) as test data and the
    remaining samples of the class as training data.

    Note: when Ki[c] is not a multiple of nFolds the window is one sample
    longer than the stride, so windows of consecutive folds overlap and the
    trailing samples of the class may never be used for testing.

    Parameters
    ----------
    X : numpy.ndarray of shape (N, dim)
        Feature matrix.
    y : numpy.ndarray of shape (N,), non-negative int
        Class labels (used with np.bincount and as class indices).
    K, solver, S1, S2 : forwarded unchanged to compute_accuracy.
    regularization, min_svd : forwarded unchanged to compute_accuracy; the
        defaults are the values that used to be hard-coded here.

    Returns
    -------
    obj, accNN, accProjNN : numpy.ndarray of shape (nFolds,)
        The three values returned by compute_accuracy, one entry per fold.
    """
    accNN = np.zeros ([nFolds])
    accProjNN = np.zeros ([nFolds])
    obj = np.zeros ([nFolds])

    N, dim = X.shape
    Ki = np.bincount (y)
    C = len (Ki)              # equals max(y) + 1
    # Samples grouped by class, keeping their order within each class.
    XByClass = [X[y == c,:] for c in range (C)]

    # Split sizes do not depend on the fold, so compute them once.
    foldStride = Ki // nFolds
    foldSize = (Ki + nFolds - 1) // nFolds
    NTest = int (np.sum (foldSize))
    NTrain = N - NTest

    np.random.seed(1)
    for fold in range (nFolds):
        XTrain = np.zeros ([NTrain, dim])
        yTrain = np.zeros ([NTrain])
        XTest = np.zeros ([NTest, dim])
        yTest = np.zeros ([NTest])
        idxTrn = 0
        idxTst = 0
        for c in range (C):
            # Since fold <= nFolds - 1, idx2 never exceeds Ki[c]:
            # (nFolds-1)*floor + ceil <= Ki[c]. Hence no clamping is needed.
            idx1 = fold * foldStride[c]
            idx2 = idx1 + foldSize[c]

            length = Ki[c] - foldSize[c]
            XTrain[idxTrn:idxTrn+length,:] = np.concatenate(
                (XByClass[c][:idx1,:], XByClass[c][idx2:,:]), axis = 0)
            yTrain[idxTrn:idxTrn+length] = c
            idxTrn += length

            length = foldSize[c]
            XTest[idxTst:idxTst+length,:] = XByClass[c][idx1:idx2,:]
            yTest[idxTst:idxTst+length] = c
            idxTst += length
        (obj[fold], accNN[fold], accProjNN[fold]) = \
                compute_accuracy(XTrain, yTrain, XTest, yTest, K, solver, S1, S2,
                                 regularization, min_svd)

    return obj, accNN, accProjNN

In [9]:
# Run the cross-validated experiment for every dataset and solver, print the
# per-fold summary and collect the rounded means / standard deviations in
# mean_table and std_table (indexed as [dataset, solver, metric]).
datasets = ['banknote', 'breast_tissue', 'forest_types',     'iris',    'leaf', 
            'rwq',      'seeds',         'urban_land_cover', 'vehicle', 'wdbc']
# One table row per listed dataset (a hard-coded 11 left an all-zero extra row).
noDatasets = len(datasets)
mean_table = np.zeros([noDatasets, 2, 3])
std_table = np.zeros([noDatasets, 2, 3])

for i, dataset in enumerate (datasets):
    X,y = loadDataset(dataset)
    N = X.shape[0]
    # A single subclass per class; a per-class alternative that was tried is
    # np.maximum(np.bincount(y) // 20, 1).
    K = 1
    C = max(y)+1
    arrayK = np.ones (C, dtype = int) * K
    # Shuffle the samples with a fixed seed so the folds are reproducible.
    perm = list(range(N))
    np.random.seed(11)
    np.random.shuffle(perm)
    X = X[perm,:]
    y = y[perm]
    print ('Number of clusters for dataset {} is {}'.format(dataset, K))
    solvers =  [('svd', 'Sb', 'St'), ('ghiasi_svd', 'Sb', 'St')] #, ('orthogonal_centroid', 'Sb', 'St')
    
    for j, (solver, S1, S2) in enumerate (solvers):
        print ('---------------------------------------------------------------------------------------')
        print ('Experiment on dataset {} using solver {} with S1={} and S2={}'.format(dataset, solver, S1, S2))
        obj, accNN, accProjNN = compute_average_accuracy (X, y, arrayK, solver, S1, S2)
        results = [('obj', obj), ('NN', accNN), ('ProjNN', accProjNN)]
        for k, (name, acc) in enumerate(results):
            if name != 'obj':
                # The two NearestNeighbor results are reported in percent.
                acc = acc * 100
            # The printed spread is the standard error over the folds ...
            print (name , '=', '%.2f'% np.mean (acc), '+-', '%.2f'% (np.std (acc)/np.sqrt(nFolds)), end='\t')
            # ... while the tables keep mean and plain standard deviation,
            # both rounded to two decimals.
            mean_table[i,j, k] = (np.mean (acc)  * 10000 + 50) //100 / 100
            std_table[i,j,k] = (np.std (acc) * 10000 + 50) //100 / 100

        print ('')
    print ('***************************************************************************************')

Number of clusters for dataset banknote is 1
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver svd with S1=Sb and S2=St
obj = 0.86 +- 0.00	NN = 99.57 +- 0.15	ProjNN = 99.57 +- 0.15	
---------------------------------------------------------------------------------------
Experiment on dataset banknote using solver ghiasi_svd with S1=Sb and S2=St
obj = 0.86 +- 0.00	NN = 99.57 +- 0.15	ProjNN = 99.57 +- 0.15	
***************************************************************************************
Number of clusters for dataset breast_tissue is 1
---------------------------------------------------------------------------------------
Experiment on dataset breast_tissue using solver svd with S1=Sb and S2=St
obj = 2.13 +- 0.02	NN = 67.14 +- 3.94	ProjNN = 67.86 +- 4.20	
---------------------------------------------------------------------------------------
Experiment on dataset breast_tissue using solver ghiasi_svd 

In [10]:
# Print one LaTeX table per metric index k of mean_table / std_table, with one
# row per dataset and one "mean \pm std" column per solver.
# NOTE: all three tables share the same \caption and \label; adjust them by
# hand if more than one table goes into the same document.
txt1='''
\\begin{table}[!t]
\\renewcommand{\\arraystretch}{1.3}
\\caption{UCI}
\\label{tbl:uci}
  \\centering
  \\begin{tabular}{l|l|l}
    \\hline
    \\multicolumn{1}{c|}{\\textbf{dataset}} &
    \\multicolumn{1}{c|}{\\textbf{Traditional LDA}} &
    \\multicolumn{1}{c}{\\textbf{Proposed Method}} \\\\
    \\hline'''
txt2 = '''
    \\hline
  \\end{tabular}  
\\end{table}
'''
datasetNames = ['banknote', 'breast tissue', 'forest types',     'iris',    'leaf', 
            'rwq',      'seeds',         'urban land cover', 'vehicle', 'wdbc']
for k in range (3):
    print (txt1)
    print ('\n')
    for i, dataset in enumerate (datasetNames):
        print ('%s'%dataset, end=' ')
        for j in range (2):
            print (' & ', end = '')
            # Escaped backslash ('\p' is not a valid escape sequence) and a
            # fixed two-decimal format so every cell has the same width.
            print ('$ %.2f \\pm %.2f $' % (mean_table[i,j,k], std_table[i,j,k]), end=' ')
        print ('\\\\')
    print (txt2)


\begin{table}[!t]
\renewcommand{\arraystretch}{1.3}
\caption{UCI}
\label{tbl:uci}
  \centering
  \begin{tabular}{l|l|l}
    \hline
    \multicolumn{1}{c|}{\textbf{dataset}} &
    \multicolumn{1}{c|}{\textbf{Traditoinal LDA}} &
    \multicolumn{1}{c}{\textbf{Proposed Method}} \\
    \hline


banknote  & $ 0.86 \pm  0.0 $  & $ 0.86 \pm  0.0 $ \\
breast tissue  & $ 2.13 \pm  0.06 $  & $ 2.13 \pm  0.06 $ \\
forest types  & $ 1.86 \pm  0.01 $  & $ 1.86 \pm  0.01 $ \\
iris  & $ 1.19 \pm  0.02 $  & $ 1.19 \pm  0.02 $ \\
leaf  & $ 7.9 \pm  0.06 $  & $ 7.9 \pm  0.06 $ \\
rwq  & $ 0.5 \pm  0.01 $  & $ 0.5 \pm  0.01 $ \\
seeds  & $ 1.61 \pm  0.01 $  & $ 1.61 \pm  0.01 $ \\
urban land cover  & $ 5.28 \pm  0.04 $  & $ 5.27 \pm  0.04 $ \\
vehicle  & $ 1.51 \pm  0.01 $  & $ 1.51 \pm  0.01 $ \\
wdbc  & $ 0.78 \pm  0.0 $  & $ 0.78 \pm  0.0 $ \\

    \hline
  \end{tabular}  
\end{table}


\begin{table}[!t]
\renewcommand{\arraystretch}{1.3}
\caption{UCI}
\label{tbl:uci}
  \centering
  \begin{tabular}{l|

### <font color='red'> Author: Sayed Kamaledin Ghiasi-Shirazi <a href="http://profsite.um.ac.ir/~k.ghiasi">(http://profsite.um.ac.ir/~k.ghiasi)</a> </font>