# DATASET-Devanagri



In [1]:
import pandas as pd
import numpy as np
import scipy.stats as s
import os
import matplotlib.pyplot as plt

# TRAINING DATA

In [2]:
list_of_folder_names = os.listdir("./Train")

In [3]:
list_of_images = []

for each_folder in list_of_folder_names:
    
    base_path = os.path.join("./Train",each_folder)
    
    list_of_images_in_folder = os.listdir(base_path)
    
    list_of_images.extend(map(lambda x: plt.imread(os.path.join(base_path,x)).reshape(1024,),list_of_images_in_folder))

In [4]:
stacked_up_images=np.array(list_of_images)

In [5]:
raw_data=pd.DataFrame(data=stacked_up_images)

In [6]:
labels=[]
for i in range(0,46):
    for j in range(0,1700):
        labels.append(i)
train_labels=np.array(labels)

In [7]:
train_labels=pd.DataFrame(data=train_labels)

In [8]:
train_labels=train_labels.rename(columns={0:'labels'})

In [9]:
data=pd.concat([train_labels,raw_data],axis=1)

# DATA PREPROCESSING

# z normalisation of the data


In [10]:
#computing mean of every column and standard deviation of every column of the dataframe,except data['labels']
raw_data_mean=[]
raw_data_standard_dev=[]
columns=list(raw_data.columns)
for i in columns:
    raw_data_mean.append(raw_data[i].mean())
    raw_data_standard_dev.append(raw_data[i].std())

now we will subtract the mean of every column of the data from every value of the column and divide it with its standard deviation

In general, you'll only want to normalize your data if you're going to be using a machine learning or statistics technique that assumes your data is normally distributed. Some examples of these include t-tests, ANOVAs, linear regression, linear discriminant analysis (LDA) and Gaussian naive Bayes

In [11]:
#here,we will normalize our data to zero mean and one standard deviation
for i in range(len(columns)):
        if (raw_data_standard_dev[i]==0):
            raw_data[i]=raw_data[i]-raw_data_mean[i]
        else:
            raw_data[i]=(raw_data[i]-raw_data_mean[i])/raw_data_standard_dev[i]

In [12]:
#now we will calculate covariance matrix of our dataset
covariance_matrix=raw_data.cov()

In [13]:
#now we will get the determinant to conclude whether the our covariance matrix is singular or not
covariance_mat=np.linalg.det(covariance_matrix)

In [14]:
raw_data=np.array(raw_data)

since in our dataset each class covariance matrix is singular now,we have to reduce the dimensionality of our dataset.we will first use PCA(principal component analysis)

# PCA

STEPS TO REDUCE DIMENSION USING PCA

1.COMPUTE EIGEN VECTOR AND EIGEN VALUES OF THE DATASET FROM COVARIANCE MATRIX.

2.SET VARIANCE YOU WANT IN THE DATASET,RANGE(95%-99%)

3.CREATE A SET OF EIGEN VECTORS WHOSE EIGEN VALUES SUM IS EQUAL TO THE PRESET VARIANCE

4.TAKE A PROJECTION OF THE LOWER DIMENSION DATASET BY TAKING  THE DOT PRODUCT OF THE DATASET WITH THE SELECTED EIGEN VECTOR MATRIX.


In [15]:
#STEP 1. TO COMPUTE EIGEN VALUES AND EIGEN VECTOR WE WILL USE SVD(Singular value decomposition)
eigen_vector,eigen_value,eigen_vector_transpose=np.linalg.svd(covariance_matrix.T)

In [16]:
#sort the eigen vector and eigen value in the descending order
#for sorting we are using quick sort,since quick sort will sort the array in ascending order we will reverse the solution
#since we are using singular value decomposition there is no need to use this step,use when you are using different technique for getting eigen value and eigen vector
sorted_eigen_vector=np.sort(eigen_vector,axis=0)
sorted_eigen_value=np.sort(eigen_value)
sorted_eigen_vector=sorted_eigen_vector[::-1]
sorted_eigen_value=sorted_eigen_value[::-1]



In [17]:
#STEP 2. WE WILL HAVE VARIANCE OF 98%
#STEP 3. WE WILL COMPUTE THE INDICES OF ALL EIGEN VECTOR BY WHICH OUR VARIANCE IS EQUAL TO 98%
principal_components=[]
total=np.sum(eigen_value)
current_sum=0
variance_to_preserve=0.98
for i in range(1024):
    current_sum+=eigen_value[i]
    if((current_sum/total)>variance_to_preserve):
        break
    principal_components.append(eigen_vector[:,i])

In [18]:
#these are those eigen vectors which contribute to the 99% of the variance of the dataset.
important_eigen_vectors=np.array(principal_components)

In [19]:
len(important_eigen_vectors)

300

the length function used on the calculated eigen vectors from the pca tells us that 297 eigen_vectors out of 1024 are having 98 percent variance of the total dataset.

In [20]:
#we will now project the lower dimensions of this dataset via multiplying the dataset with eigen vectors we computed above having 99% of the variance.
new_data=np.matmul(raw_data,important_eigen_vectors.T)

In [21]:
#calculating the covariance of the new dataset for each classes.
cov_mats_reduced_of_each_class=[]
for i in range(46):
    cov_mats_reduced_of_each_class.append(np.cov(new_data[i*1700:(i+1)*1700,:],rowvar=False))


In [22]:
#determinant of the new dataset for each class
det=[]
for i in range(46):
    det.append(np.linalg.det(cov_mats_reduced_of_each_class[i]))


In [23]:
#lower dimension dataset.
new_data=pd.DataFrame(data=new_data)

In [24]:
for i in range(46):
    print(det[i])

8.662810895808441e-152
5.5896908648348955e-192
1.7836929593040674e-161
2.3790903174931527e-136
2.849775189487269e-148
1.486286998157952e-181
3.064896876384309e-176
1.3889392679070817e-126
3.562947546188586e-128
2.462335681254776e-122
3.879679915492212e-111
3.199407100078146e-138
5.8862876443065354e-182
2.9563777857393568e-145
2.628362037937387e-108
2.268547340179328e-115
6.182244038371368e-121
8.863660209661719e-130
1.0550352764986484e-157
2.738994823706418e-147
2.447462039207096e-134
1.7733658377221643e-78
1.4892686907976684e-129
5.13926698700883e-150
4.2649340522743084e-96
1.2473873107899989e-114
5.201731288699862e-109
1.5784890056011085e-143
1.5987737159818723e-109
1.1318937884043946e-174
4.620638108333877e-123
4.299961040664245e-110
1.4941163972585244e-154
1.8025557623136834e-94
2.2131856089244031e-150
1.4904396169046249e-89
0.0
2.9670249831695477e-261
5.29354903192228e-193
1.538307801546597e-157
2.400706179259682e-206
6.931997625425209e-172
1.3793204983847247e-144
1.69901328910483

since our new dataset's covariance is still very zero,hence we will use regularized discriminant analysis

# RDA 

In [25]:
#calculating the covariance of each class in the pca transform dataset.
cov_m=[]
for i in range(46):
    cov_m.append(new_data.iloc[i*1700:(i+1)*1700].cov())

In [26]:
#calculating pooled covariance matrix.
pool=0
for i in cov_m:
    pool=pool+1699*i
pool=pool/(78200-46)

In [27]:
#calculating sigma 
sigma=np.trace(pool)/pool.shape[0]
sigma

2.1615001816987016

In [28]:
def rda(cov_m,pool,sigma):
    #using the maximum alpha and gamma value obtained above to get the optimum rda covariance matrix for our dataset. 
    alpha=np.random.uniform(0,1)
    gamma=np.random.uniform(0,1)
    final_cov_rda=[]
    for i in cov_m:
          final_cov_rda.append((alpha*i)+(1-alpha)*pool)
    #using the optimum rda covariance matrix to get the modified rda covariance matrix
    final_mod_rda=[]
    for i in final_cov_rda:
        final_mod_rda.append(((1-gamma)*i)+(gamma*sigma*np.identity(cov_m[0].shape[0])))
    return final_mod_rda,alpha,gamma

In [29]:
def mean(data):
    #calculating mean of each class of the pca transformed data ie.(new data)
    mean_rda=[]
    for i in range(46):
        mean_rda.append(new_data.iloc[i*1700:(i+1)*1700].mean())
    return mean_rda

In [30]:
def posterior_probab(data,rda):
    #calculating posterior probablity using probablity distribution function of the normal distribution,as we assume our data is coming from gauusian distribution.
    #this will give us likelihood probablity but we maximize the likelihood probability by using -log hence getting posterior probability.
    mean_rda1=mean(new_data)
    posterior_train_probabilities=[]
    for i in range(46):
        posterior_train_probabilities.append(-np.log(s.multivariate_normal.pdf(data,mean_rda1[i],rda[i])))
    return posterior_train_probabilities

In [31]:
def predictions_cal(posterior_train_probabilities):
    #there are 46 arrays in the likelihood probabilities each representing a class
    #each array has 78200 probabilities that signifies the probabilities of each image
    ans_train=[]
    for i in range(78200):
        m=[]
        for j in range(46):
            m.append((posterior_train_probabilities[j][i],j))
        ans_train.append(min(m))
    return ans_train

In [32]:
def best():
    accuracy=[]
    alphalist=[]
    gammalist=[]
    for i in range(20):
        final_mod_rda,alpha,gamma=rda(cov_m,pool,sigma)
        posterior_train_probabilities=posterior_probab(new_data,final_mod_rda)
        ans_train=predictions_cal(posterior_train_probabilities)
        predicted_labels=[]
        for j in range(78200):
            predicted_labels.append(ans_train[j][1])
        predicted_labels=np.array(predicted_labels).reshape(78200,1)
        boolean_mask=(predicted_labels == train_labels)
        accuracy.append((np.count_nonzero(boolean_mask)/78200)*100)
        alphalist.append(alpha)
        gammalist.append(gamma)
    ind=np.argmax(accuracy)
    best_accuracy=accuracy[ind]
    best_alpha=alphalist[ind]
    best_gamma=gammalist[ind]
    
    return best_accuracy,best_alpha,best_gamma
        

In [33]:
accuracy,alpha,gamma=best()

  import sys


In [34]:
accuracy

95.5306905370844

In [35]:
def final_rda(cov_m,pool,sigma,alpha,gamma):
    best_rda=[]
    for i in cov_m:
          best_rda.append((alpha*i)+(1-alpha)*pool)
    #using the optimum rda covariance matrix to get the modified rda covariance matrix
    best_rda_dash=[]
    for i in best_rda:
        best_rda_dash.append(((1-gamma)*i)+(gamma*sigma*np.identity(cov_m[0].shape[0])))
    return best_rda_dash
    

In [36]:
rda_final=final_rda(cov_m,pool,sigma,alpha,gamma)

# testing data

In [37]:
list_of_testing_folder_names = os.listdir("./Test/")

In [38]:
list_of_testing_images = []

for each_folder in list_of_testing_folder_names:
    
    base_path = os.path.join("./Test/",each_folder)
    
    list_of_testing_images_in_folder = os.listdir(base_path)
    
    list_of_testing_images.extend(map(lambda x: plt.imread(os.path.join(base_path,x)).reshape(1024,),list_of_testing_images_in_folder))

In [39]:
testing_stacked_up_images=np.array(list_of_testing_images)

In [40]:
testing_raw_data=pd.DataFrame(data=testing_stacked_up_images)

In [41]:
testing_raw_data.shape[0]

13798

# normalizing the data

In [42]:
for i in range(len(columns)):
        if (raw_data_standard_dev[i]==0):
            testing_raw_data[i]=testing_raw_data[i]-raw_data_mean[i]
        else:
            testing_raw_data[i]=(testing_raw_data[i]-raw_data_mean[i])/raw_data_standard_dev[i]

In [43]:
#now we will use the above pca computed eigen vectors to project the lower dimension of the testing dataset.
new_testing_data=testing_raw_data.dot(important_eigen_vectors.T)

# PCA Transform data

In [44]:
#computing the mean of each class of the new_testing_data.
pca_transform_data_mean=[]
for i in range(46):
    pca_transform_data_mean.append(new_testing_data.iloc[i*300:(i+1)*300].mean())

# Prediction on testing data

In [45]:
#calculating posterior probability of the testing dataset
testing_posterior_probability=[]
for i in range(46):
    testing_posterior_probability.append(-np.log(s.multivariate_normal.pdf(new_testing_data,pca_transform_data_mean[i],rda_final[i])))

In [46]:
#there are 46 arrays in the likelihood probabilities each representing a class
#each array has 78200 probabilities that signifies the probabilities of each image
ans_test=[]
for i in range(new_testing_data.shape[0]):
    n=[]
    for j in range(46):
        n.append((testing_posterior_probability[j][i],j))
    ans_test.append(min(n))

In [47]:
predicted_test_labels=[]
for i in range(new_testing_data.shape[0]):
    predicted_test_labels.append(ans_test[i][1])

In [48]:
predicted_test_labels=np.array(predicted_test_labels)

In [49]:
predicted_test_labels=predicted_test_labels.reshape(13798,)

In [50]:
listi_test=[]
for i in range(0,46):
    for j in range(0,300):
        listi_test.append(i)
test_labels=np.array(listi_test)

In [51]:
test_labels=test_labels[0:13798]


In [52]:
test_labels.shape

(13798,)

In [53]:
boolean_mask_test=np.count_nonzero(predicted_test_labels == test_labels)

In [54]:
boolean_mask_test

12488

# classification accuracy
AC=TN+TP/TN+FP+FN+TP

In [59]:
#Accuracy-proportion or percentage of correctly predicted labels over all predictions
accuracy=(boolean_mask_test/new_testing_data.shape[0])*100
print(accuracy)

90.50587041600232
