###### Loading the Data

In [1]:
import numpy as np
import time

In [2]:
def loaddata(a, b):
    import h5py
    with h5py.File(a+'.h5','r') as H:
        data = np.copy(H['data'])
    with h5py.File(b+'.h5','r') as H:
        label = np.copy(H['label'])

    X = np.reshape(data,(data.shape[0],-1))
    X=X.T
    Y=np.reshape(label, (label.shape[0],1))
    X=X/255
    return X, Y

###### Validation with 20% data

In [103]:
def Validation_dataset(X, Y, percent):
    k=int((percent/100)*X.shape[1])
    rand= np.random.permutation(X.shape[1])
    rand= rand[0:k]
    All_index=np.arange(X.shape[1])
    a=set(rand)
    b=set(All_index)
    c=b-a
    a=sorted(a)
    c=sorted(c)

    Train_Data_mat=np.zeros((X.shape[0],len(c)))
    Train_labels= np.zeros((len(c)))
    Val_Data_mat=np.zeros((X.shape[0], len(a)))
    Val_labels= np.zeros((len(a)))
    for i in range(len(c)):
        Train_Data_mat[:,i]=X[:,c[i]]
        Train_labels[i]= Y[c[i]]
    for i in range(len(a)):
        Val_Data_mat[:,i]=X[:,a[i]]
        Val_labels[i]= Y[a[i]] 
        
    return Train_Data_mat, Train_labels, Val_Data_mat, Val_labels

###### Data Preprocessing:  Normalisation & SVD for rank reduction

In [104]:
def SVD(X, var_desired):
    U, s, Vt= np.linalg.svd(X, full_matrices=False)
    sum_ev_sq=np.sum(s**2)
    s_square=np.square(s)
    cumsum_s=np.cumsum(s_square)
    var_arr=cumsum_s/sum_ev_sq
    k=np.argmax(var_arr>var_desired)
    k=k+1
    S=np.diag(s)
    X_hat_reconst=U[:,0:k].dot(S[0:k,0:k]).dot(Vt[0:k,:])
    n=k
    comp_ratio = (X.shape[1]*n + n + X.shape[0]*n)/(X.shape[1] * X.shape[0])
    return X_hat_reconst, k, comp_ratio      

###### Logistic Regression Classifier using Gradient Descent 

In [105]:
#binary Logisitic Regression classifier
def binry_classifier(X, Y, iterations, Eta, lamda):
    import time
    row_all_one=np.ones((1,len(X.T)))     #adding a row of all one
    X=np.concatenate((row_all_one, X), 0)
    Wk=np.zeros(len(X))
    Wk1=Wk
    for j in range(iterations):
        labels_pred=[]
        Wk=Wk1
        for i in range(len(X.T)):
            exp_part=np.dot((X[:,i]), Wk)
            exp_part=-1*exp_part
            exp_part=np.exp(exp_part)
            if(exp_part<=1):
                labels_pred.append(1)
            else:
                labels_pred.append(0)
        labels_pred=np.asarray(labels_pred)
        Comp_Grad_1=np.sum(X*labels_pred, axis=1)
        Comp_Grad_2=np.sum(X*Y, axis=1)
        Grad=Comp_Grad_1-Comp_Grad_2
        Grad=Grad/len(X.T) 
        Grad=Grad+ lamda*Wk
        Wk1= Wk1 - Eta*Grad
        if(abs((np.linalg.norm(Wk)-np.linalg.norm(Wk1)))<1e-12):   #convergence
            diff=abs(np.linalg.norm(Wk)-np.linalg.norm(Wk1))
            print("The value of norm is", diff)
            break
    diff=np.linalg.norm(Wk)-np.linalg.norm(Wk1)
    return Wk1, labels_pred      

In [106]:
def Confusn_mat(labels_pred, Y):
    lst=[]
    true_pos=0
    false_posit=0
    false_negat=0
    true_neg=0
    for i in range(len(labels_pred)):
        if (labels_pred[i]== 1 and Y[i]==1):
            true_pos=true_pos +1
        elif (labels_pred[i]==1 and Y[i]==0):
            false_posit=false_posit+1
        elif (labels_pred[i]==0 and Y[i]==1):
            false_negat=false_negat+1
        else:
            true_neg=true_neg+1
    lst=[true_pos, false_posit, false_negat, true_neg]
    lst=np.asarray(lst)
    return lst

In [107]:
# Performance Metrics
def Parameters(true_pos, false_posit, false_negat, true_neg, which_data_dataset):
    if(which_data_dataset == "testing" or which_data_dataset =="validation" or which_data_dataset == "training_data"):
        print(true_pos, false_posit, false_negat, true_neg)
    lst=[]
    Accuracy=((true_pos+true_neg)/(true_pos+true_neg+false_posit+false_negat))*100
    Accuracy = (np.round(Accuracy,2))/100
    Precision= true_pos/(true_pos+false_posit) 
    Precision = np.round(Precision,2)
    Recall= true_pos/(true_pos+false_negat)
    Recall = np.round(Recall,2)
    F_measure=(2*true_pos)/(2*true_pos + false_negat+ false_posit)
    F_measure = np.round(F_measure,2)
    TPR=true_pos/(true_pos + false_negat)
    TPR = np.round(TPR,2)
    FPR= false_posit/(false_posit + true_neg)
    FPR = np.round(FPR,2)
    lst=[Accuracy, Precision, Recall, F_measure, TPR, FPR]
    return lst

In [108]:
# Learning One Vs All implementation
def one_vs_All_train(X, Y):
    unique, counts=np.unique(Y, return_counts=True)
    opt_W=np.zeros((len(unique), X.shape[0]+1))
    labels_pred=np.zeros((len(unique), len(Y)))
    for j in range(len(unique)):
        Y_bin=np.zeros(len(Y))
        lst=[]
        for i in range(len(Y)):
            if (Y[i]==j):
                Y_bin[i]=1
            else:
                Y_bin[i]=0
        opt_W[j], labels_pred[j] =binry_classifier(X, Y_bin, 200, 1e-5, np.exp(-50))
        lst=Confusn_mat(labels_pred[j], Y_bin)
    return opt_W, labels_pred

###### Training the Classifier One Vs All

In [109]:
start_time=time.time()
X, Y=loaddata("images_training", "labels_training")
variance=0.96
sample_times=3
unique, counts=np.unique(Y, return_counts=True)
opt_W_Avg=np.zeros((len(unique), X.shape[0]+1))
for i in range(sample_times):
    X_post_Val_data, Y_post_Val_labels, Val_Data, Val_labels=Validation_dataset(X, Y, 20)
    X_post_Val_data, n_components, comp_ratio = SVD(X_post_Val_data,variance)
    opt_W, labels_pred = one_vs_All_train(X_post_Val_data, Y_post_Val_labels)
    opt_W_Avg= opt_W_Avg+ opt_W
opt_W_Avg=opt_W_Avg/sample_times
print("The SVD has number of components as {} with compression ratio of {}".format(n_components,comp_ratio) )
print("Running time: "+ str(int((time.time()-start_time)/60))+" minutes")

The SVD has number of components as 89 with compression ratio of 0.11723347151360544
Running time: 38 minutes


###### Testing Procedure

In [None]:
def testing_All(X, W):
    row_all_one=np.ones((1,len(X.T)))
    X=np.concatenate((row_all_one, X), 0)
    probabilities=np.dot(W, X)
    pred_labels=np.argmax(probabilities, axis=0)
    return pred_labels

In [None]:
def FindConfMat(pred_labels, Y_test, which_data_set):
    unique, counts=np.unique(Y_test, return_counts=True)
    Conf_mat4All= np.zeros((len(unique), 4))
    Param_mat4All= np.zeros((len(unique), 6))
    for j in range(len(unique)):
        pred_labels_bin=[]
        Y_test_bin=[]
        for i in range(len(Y_test)):
            if (pred_labels[i]==j):
                pred_labels_bin.append(1)
            else:
                pred_labels_bin.append(0)
            if(Y_test[i]==j):
                Y_test_bin.append(1)
            else:
                Y_test_bin.append(0)
        Conf_mat4All[j]=Confusn_mat(pred_labels_bin, Y_test_bin)
        Param_mat4All[j]=Parameters(Conf_mat4All[j][0], Conf_mat4All[j][1], Conf_mat4All[j][2], Conf_mat4All[j][3], which_data_set)
    return Conf_mat4All, Param_mat4All

In [110]:
#Testing over Training Data
pred_labels = testing_All(X_post_Val_data, opt_W_Avg)
acc=0
for i in range(len(Y_post_Val_labels)):
    if(Y_post_Val_labels[i]==pred_labels[i]):
        acc=acc+1
print("The accuracy is {} %".format((acc/(len(Y_post_Val_labels)))*100))
print("The confusion matrix for Training data for all classes showing TP, FP, FN, TN respectively:")
Conf_mat4All_train, Param_mat4All_train= FindConfMat(pred_labels, Y_post_Val_labels, "training_data")
print ("The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:")
for j in range (len(Param_mat4All_train)):
    print(Param_mat4All_train[j][0]*100, Param_mat4All_train[j][1],Param_mat4All_train[j][2],Param_mat4All_train[j][3],Param_mat4All_train[j][4],Param_mat4All_train[j][5])

The accuracy is 76.11250000000001 %
The confusion matrix for Training data for all classes showing TP, FP, FN, TN respectively:
1864.0 652.0 514.0 20970.0
2259.0 131.0 121.0 21489.0
1000.0 160.0 1449.0 21391.0
2275.0 2250.0 121.0 19354.0
1509.0 641.0 902.0 20948.0
2095.0 109.0 329.0 21467.0
561.0 80.0 1808.0 21551.0
2147.0 396.0 170.0 21287.0
2283.0 1135.0 106.0 20476.0
2274.0 179.0 213.0 21334.0
The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:
95.14 0.74 0.78 0.76 0.78 0.03
98.95 0.95 0.95 0.95 0.95 0.01
93.3 0.86 0.41 0.55 0.41 0.01
90.12 0.5 0.95 0.66 0.95 0.1
93.57 0.7 0.63 0.66 0.63 0.03
98.18 0.95 0.86 0.91 0.86 0.01
92.13 0.88 0.24 0.37 0.24 0.0
97.64 0.84 0.93 0.88 0.93 0.02
94.83 0.67 0.96 0.79 0.96 0.05
98.37 0.93 0.91 0.92 0.91 0.01


In [111]:
#Testing over Validation Data
pred_labels = testing_All(Val_Data, opt_W_Avg)
acc=0
for i in range(len(Val_labels)):
    if(Val_labels[i]==pred_labels[i]):
        acc=acc+1
print("The accuracy is {} %".format((acc/(len(Val_labels)))*100))
print("The confusion matrix for Validation data for all classes showing TP, FP, FN, TN respectively:")
Conf_mat4All_valid, Param_mat4All_valid= FindConfMat(pred_labels, Val_labels, "validation")
print ("The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:")
for j in range (len(Param_mat4All_train)):
    print(Param_mat4All_train[j][0]*100, Param_mat4All_train[j][1],Param_mat4All_train[j][2],Param_mat4All_train[j][3],Param_mat4All_train[j][4],Param_mat4All_train[j][5])

The accuracy is 75.73333333333333 %
The confusion matrix for Validation data for all classes showing TP, FP, FN, TN respectively:
487.0 171.0 146.0 5196.0
532.0 41.0 44.0 5383.0
239.0 49.0 332.0 5380.0
577.0 604.0 29.0 4790.0
370.0 135.0 248.0 5247.0
533.0 41.0 71.0 5355.0
142.0 24.0 456.0 5378.0
534.0 85.0 44.0 5337.0
584.0 268.0 29.0 5119.0
546.0 38.0 57.0 5359.0
The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:
95.14 0.74 0.78 0.76 0.78 0.03
98.95 0.95 0.95 0.95 0.95 0.01
93.3 0.86 0.41 0.55 0.41 0.01
90.12 0.5 0.95 0.66 0.95 0.1
93.57 0.7 0.63 0.66 0.63 0.03
98.18 0.95 0.86 0.91 0.86 0.01
92.13 0.88 0.24 0.37 0.24 0.0
97.64 0.84 0.93 0.88 0.93 0.02
94.83 0.67 0.96 0.79 0.96 0.05
98.37 0.93 0.91 0.92 0.91 0.01


In [112]:
# import testing data
X_test, Y_test=loaddata("images_testing", "labels_testing_2000")
X_test, n_components_Test, comp_ratio_Test =SVD(X_test, variance)

#Testing on 2000 Test Data
pred_labels = testing_All(X_test, opt_W_Avg)
acc=0
for i in range(len(Y_test)):
    if(Y_test[i]==pred_labels[i]):
        acc=acc+1
print("THe accuracy for test is {} %".format((acc/(len(Y_test)))*100))
print("The Confusion Matrix for Test data for all classes showing TP, FP, FN, TN respectively:")
Conf_mat4All_test, Param_mat4All_test= FindConfMat(pred_labels, Y_test, "testing")
print ("The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:")
for j in range (len(Param_mat4All_train)):
    print(Param_mat4All_train[j][0]*100, Param_mat4All_train[j][1],Param_mat4All_train[j][2],Param_mat4All_train[j][3],Param_mat4All_train[j][4],Param_mat4All_train[j][5])

THe accuracy for test is 75.6 %
The Confusion Matrix for Test data for all classes showing TP, FP, FN, TN respectively:
136.0 50.0 42.0 1772.0
181.0 9.0 10.0 1800.0
86.0 20.0 124.0 1770.0
185.0 191.0 6.0 1618.0
129.0 64.0 83.0 1724.0
177.0 6.0 37.0 1780.0
46.0 7.0 154.0 1793.0
188.0 27.0 10.0 1775.0
208.0 91.0 11.0 1690.0
176.0 23.0 11.0 1790.0
The Performance matrix for all the classes involving Accuracy, Precision, Recall, F-measure, TPR and FPR:
95.14 0.74 0.78 0.76 0.78 0.03
98.95 0.95 0.95 0.95 0.95 0.01
93.3 0.86 0.41 0.55 0.41 0.01
90.12 0.5 0.95 0.66 0.95 0.1
93.57 0.7 0.63 0.66 0.63 0.03
98.18 0.95 0.86 0.91 0.86 0.01
92.13 0.88 0.24 0.37 0.24 0.0
97.64 0.84 0.93 0.88 0.93 0.02
94.83 0.67 0.96 0.79 0.96 0.05
98.37 0.93 0.91 0.92 0.91 0.01


In [130]:
#Code to output the data
output=pred_labels
import numpy as np 
import h5py
with h5py.File('predicted_labels.h5','w') as H: 
    H.create_dataset('label',data=output)