# Question 2

# Implement logistic regression to classify the images provided in the dataset.

# Importing Libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from PIL import Image
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Appying the principal component analysis to get the transformed trainning set where the number of principal component is 74. Returnning the transformed traning set and the resultant eigenvectors which will be used to tranformed the test dataset also.

In [2]:
def apply_pca(train):
    n_components = 74
    x_mean = np.mean(train, axis =0)
    x_center = train - x_mean
    x_cov = np.cov(x_center.T)
    x_eigenvalues, x_eigenvectors = np.linalg.eig(x_cov)
    indexes = x_eigenvalues.argsort()[::-1]   
    eigenvalues = x_eigenvalues[indexes]
    eigenvectors = x_eigenvectors[:,indexes]
    red_eigenvec = eigenvectors[:,:n_components]
    x_pca = red_eigenvec.T.dot(train.T)
    global_eigenvector = eigenvectors
    return x_pca.T,eigenvectors

# Transforming the test data set by applying the principal component analysis.

In [3]:
def transform_pca(eigenvectors, test):
    n_components = 74
    red_eigenvec = eigenvectors[:,:n_components]
    test_pca = red_eigenvec.T.dot(test.T)
    return test_pca.T

# Code to calculate accuracy

In [4]:
def accuracy(predictions, y):
    return ((predictions == y).mean()*100)

# Sigmoid function 
\begin{align}
h_\theta(x) = \frac{1}{1 + e^{\theta^{\top} x}}
\end{align}

In [5]:
def sigmoid_function(x):
    g = 1/(1 + np.exp(-x))
    return g

# Calculation of the cost:
\begin{align}
J(\theta) & = \dfrac{1}{m} \sum_{i=1}^m \mathrm{Cost}(h_\theta(x^{(i)}),y^{(i)}) \\
& = - \dfrac{1}{m} [\sum_{i=1}^{m} y^{(i)} \log(h_\theta(x^{(i)})) + (1 - y^{(i)}) \log(1-h_\theta(x^{(i)}))] \\
\end{align}

In [6]:
def calculate_cost(x,y,w,h):
    total_cost = np.sum(-y * np.log(h) - (1 - y) * np.log(1 - h))
    cost = total_cost/len(y)
    return cost

# Calulation of the gradient
\begin{align}
\frac{\partial}{\partial \theta_j} J(\theta) = \dfrac{1}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)}
\end{align}

In [7]:
def claculate_gradient(x,y,h):
    gradient = (y-h).dot(x)/len(y)
    return gradient

# Trainning the data :-  One vs Rest classification technique is used. One-vs-Rest classification is a method which involves training N distinct binary classifiers, each designed for recognizing a particular class. Then those N classifiers are collectively used for multi-class classification. I take values of one class and turn them into one, and the rest of classes - into zeros. And everytime cosidering only one class, it converges by running through the number of iterations and calculate the optimized weight corrrsponding to each class.

In [8]:
def train_data(xtrain,y,numofiter,learningrate):
    x = np.ones(shape=(xtrain.shape[0], xtrain.shape[1] + 1),dtype=complex)
    x[:, 1:] = xtrain
    classes = np.unique(y)
    weights =[]
    for cls in classes:
        weight = np.zeros(x.shape[1],dtype=complex) #dtype=complex
        y_map = []
        for i in y:
            if(i==cls):
                y_map.append(1)
            else:
                y_map.append(0)
        #print("ymap ",y_map)
        for i in range(numofiter):
            h = sigmoid_function(x.dot(weight))
            gradient = claculate_gradient(x,y_map,h)
            weight = weight + learningrate*gradient
        weights.append(weight)
    return weights

# Predict class: Taking each sample  and calculating
\begin{align}
h_{\theta}(x) = [h_{\theta}^{(1)}(x), h_{\theta}^{(2)}(x), h_{\theta}^{(3)}(x),.........]
\end{align}
# and taking the maximum value and its corresponding class as the predicted class.

In [18]:
def predict_class(xval,y,weights):
    x = np.ones(shape=(xval.shape[0], xval.shape[1] + 1),dtype=complex)
    x[:, 1:] = xval
    temp_predictions = []
    for i in x:
        hypothesis = np.zeros(shape=(len(weights)),dtype=complex) #dtype=complex
        k =0
        for weight in weights:
            h = sigmoid_function(i.dot(weight))
            #print("h===== ",h)
            hypothesis[k] = h
            k = k+1
        temp_predictions.append(np.argmax(hypothesis))
    predictions = []
    #print("t_pred ",temp_predictions)
    for indx in temp_predictions:
        #print("y_indx",indx)
        predictions.append(y[indx])
    return predictions

# Loading the tranning data and applying the own PCA on it taking 74 principal components and applying min-max scaler over it.

In [260]:
path = r"G:\second_sem\SMAI\Assignment_3\q2\dataset"
images = []
labels = []
for f in os.listdir(path):
    label = f[1:f.find("_")]
    images.append(np.asarray(Image.open(path +'/'+f).convert('L').resize((64, 64))).flatten())
    labels.append(label)
train = np.array(images)
scalar = MinMaxScaler()
train = scalar.fit_transform(train)
train,eigenvects = apply_pca(train)
labels = np.array(labels)

In [261]:
xtrain = train[:,:]
ytrain = labels[:]

# Loading the test data and transforming the test data by applying the own PCA on it taking 74 principal components and applying min-max scaler over it.

In [262]:
vpath = r"G:\second_sem\SMAI\Assignment_3\q2\A3\test"
vimages = []
vlabels = []
for f in os.listdir(vpath):
    label = f[1:f.find("_")]
    vimages.append(np.asarray(Image.open(vpath +'/'+f).convert('L').resize( (64, 64))).flatten())
    vlabels.append(label)
vtrain = np.array(vimages)
vtrain = scalar.transform(vtrain)
vtrain = transform_pca(eigenvects,vtrain)
vlabels = np.array(vlabels)
xvalidation = vtrain[:,:]
yvalidation = vlabels[:]

# Experiment 1:- The number of iterations = 100000 and the learning rate = 0.0001

In [78]:
numofiter = 100000
learningrate = 0.0001
weights = train_data(xtrain, ytrain, numofiter,learningrate)
classes = np.unique(ytrain)
predictions1 = predict_class(xvalidation,classes,weights)

# Accuracy

In [79]:
acc1 = accuracy(predictions1, yvalidation)
acc1

68.75

# Confusion matrix

In [80]:
print(confusion_matrix(predictions1,yvalidation))

[[ 0  1  0  0  0  0]
 [ 0 13  0  0  0  0]
 [ 0  3  9  0  0  0]
 [ 0  1  2  0  0  0]
 [ 0  0  1  0  0  0]
 [ 0  2  0  0  0  0]]


# Classification report

In [81]:
print(classification_report(predictions1,yvalidation))

              precision    recall  f1-score   support

          01       0.00      0.00      0.00         1
          03       0.65      1.00      0.79        13
          04       0.75      0.75      0.75        12
          05       0.00      0.00      0.00         3
          06       0.00      0.00      0.00         1
          07       0.00      0.00      0.00         2

    accuracy                           0.69        32
   macro avg       0.23      0.29      0.26        32
weighted avg       0.55      0.69      0.60        32



  'precision', 'predicted', average, warn_for)


# Experiment 2:- The number of iterations = 50000 and the learning rate = 0.0005

In [82]:
numofiter = 50000
learningrate = 0.0005
weights = train_data(xtrain, ytrain, numofiter,learningrate)
classes = np.unique(ytrain)
predictions2 = predict_class(xvalidation,classes,weights)

# Accuracy

In [83]:
acc2 = accuracy(predictions2, yvalidation)
acc2

78.125

# Confusion Matrix

In [84]:
print(confusion_matrix(predictions2,yvalidation))

[[ 0  0  1  0  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0 15  0  0  0]
 [ 0  0  2 10  0  0]
 [ 0  0  0  1  0  0]
 [ 0  0  2  0  0  0]]


# Classification report

In [85]:
print(classification_report(predictions2,yvalidation))

              precision    recall  f1-score   support

          00       0.00      0.00      0.00         1
          02       0.00      0.00      0.00         1
          03       0.75      1.00      0.86        15
          04       0.83      0.83      0.83        12
          05       0.00      0.00      0.00         1
          07       0.00      0.00      0.00         2

    accuracy                           0.78        32
   macro avg       0.26      0.31      0.28        32
weighted avg       0.66      0.78      0.71        32



# Experiment 3:- The number of iterations = 70000 and the learning rate = 0.005

In [86]:
numofiter = 70000
learningrate = 0.005
weights = train_data(xtrain, ytrain, numofiter,learningrate)
classes = np.unique(ytrain)
predictions3 = predict_class(xvalidation,classes,weights)

# Accuracy

In [87]:
acc3 = accuracy(predictions3, yvalidation)
acc3

84.375

# Confusion Matrix

In [88]:
print(confusion_matrix(predictions3,yvalidation))

[[16  0  0  0]
 [ 2 11  0  0]
 [ 0  1  0  0]
 [ 2  0  0  0]]


# Classification report

In [89]:
print(classification_report(predictions3,yvalidation))

              precision    recall  f1-score   support

          03       0.80      1.00      0.89        16
          04       0.92      0.85      0.88        13
          06       0.00      0.00      0.00         1
          07       0.00      0.00      0.00         2

    accuracy                           0.84        32
   macro avg       0.43      0.46      0.44        32
weighted avg       0.77      0.84      0.80        32



# Experiment 4:- The number of iterations = 100000 and the learning rate = 0.01

In [90]:
numofiter = 100000
learningrate = 0.01
weights = train_data(xtrain, ytrain, numofiter,learningrate)
classes = np.unique(ytrain)
predictions4 = predict_class(xvalidation,classes,weights)

# Accuracy

In [91]:
acc4 = accuracy(predictions4, yvalidation)
acc4

87.5

# Confusion matrix

In [92]:
print(confusion_matrix(predictions4,yvalidation))

[[17  0  0  0]
 [ 1 11  0  0]
 [ 0  1  0  0]
 [ 2  0  0  0]]


# Samle Test And Train

In [10]:
train_path = r"G:\second_sem\SMAI\Assignment_3\q2\sample_train.txt"
train_images = []
train_labels = []
train_file = open(train_path,"r")
for train_line in train_file:
    path_label = train_line.split(" ")
    train_f = path_label[0]
    train_label = path_label[1].replace('\n', '')
    train_images.append(np.asarray(Image.open(train_f).convert('L').resize((64, 64))).flatten())
    train_labels.append(train_label)
train_train = np.array(train_images)
train_scalar = MinMaxScaler()
train_train = train_scalar.fit_transform(train_train)
train_train,train_eigenvects = apply_pca(train_train)
train_labels = np.array(train_labels)
train_file.close()

In [11]:
train_xtrain = train_train[:,:]

In [12]:
label_to_ordinal = {}
ordinal_to_label = {}
uniq_label = np.unique(train_labels)
for i in range(uniq_label.shape[0]):
    label_to_ordinal[uniq_label[i]] = i
for j in range(uniq_label.shape[0]):
    ordinal_to_label[j] = uniq_label[j]
train_ytrain = np.zeros(len(train_labels))
for i in range(len(train_labels)):
    train_ytrain[i] = label_to_ordinal[train_labels[i]]

In [13]:
test_path = r"G:\second_sem\SMAI\Assignment_3\q2\sample_test.txt"
test_images = []
test_file = open(test_path,"r")
for test_line in test_file:
    test_line= test_line.replace('\n','')
    test_images.append(np.asarray(Image.open(test_line).convert('L').resize( (64, 64))).flatten())
test_test = np.array(test_images)
test_test = train_scalar.transform(test_test)
test_test = transform_pca(train_eigenvects,test_test)
test_xtest = test_test[:,:]
test_file.close()

In [19]:
numofiter = 100000
learningrate = 0.01
t_weights = train_data(train_xtrain, train_ytrain, numofiter,learningrate)
t_classes = np.unique(train_ytrain)
predictions5 = predict_class(test_xtest,t_classes,t_weights)

In [20]:
predicted_labels = []
for i in predictions5:
    predicted_labels.append(ordinal_to_label[i])
predicted_labels

['abc', 'abc', 'alice', 'bob', 'bob']