In [1]:
import numpy as np
import glob
import imageio as magic
import pandas as pd
import collections
from sklearn.model_selection import train_test_split

In [2]:

class LogisticRegression:

    def __init__(self):
        print("Logistic Regression")

    def get_sigmoid(self, X, W):
        return 1 / (1 + np.exp(- self.get_hypothesis(X, W)))

    def get_hypothesis(self, X, W):
        return X @ W.T

    def __get_cost(self, X, Y, W, lamda):
        return -(1.0 / len(X)) * (np.sum((Y * np.log(self.get_sigmoid(X, W))) + ((1 - Y) * np.log(1 - self.get_sigmoid(X, W)))) - lamda)

    def __get_gradient(self, X, Y, W, lamda):
        return (1.0 / len(X)) * (np.sum(X * (self.get_hypothesis(X,W) - Y), axis=0) + (lamda * np.sum(W)))

    def __logistic_regression(self, X, Y, W, alpha, max_iterations, lamda):
        
        for i in range(max_iterations):
            
            W = W - alpha * self.__get_gradient(X, Y, W, lamda)
            cost = self.__get_cost(X, Y, W, lamda)
            
            if i % 100 == 0:
                print("Cost: ", cost)
            
        return W, cost

    def train(self, X, Y, W, alpha, max_iterations, lamda=0):
        return self.__logistic_regression(X, Y, W, alpha, max_iterations, lamda)

    def validate(self, X, Y, W):
        return self.__get_cost(X, Y, W, 0)

    def test(self, X, Y, W, lamda=0):
        return self.__get_cost(X, Y, W, 0)
    
    def predict(self,X,W):
        return self.get_sigmoid(X,W)

In [3]:

image_data = []
label = []
for file_name in glob.iglob('/home/lognod/Desktop/nhcd/numerals/**/*.jpg', recursive=True):
    image_array = magic.imread(file_name,as_gray=True)
    label=int(file_name[-12:-11])
    pixel_data = (255.0-image_array.flatten())/255.0
    pixel_data = np.append(label,pixel_data)
    image_data.append(pixel_data)


image_data = np.array(image_data)
np.random.shuffle(image_data)
image_data_pd = pd.DataFrame(image_data)
image_data_pd.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,7.0,0.0,0.05098,0.007843,0.0,0.007843,0.023529,0.023529,0.0,0.0,...,0.027451,0.039216,0.062745,0.027451,0.0,0.023529,0.054902,0.015686,0.011765,0.031373
1,8.0,0.023529,0.031373,0.011765,0.023529,0.058824,0.003922,0.0,0.007843,0.035294,...,0.011765,0.015686,0.023529,0.027451,0.015686,0.003922,0.035294,0.0,0.0,0.035294
2,9.0,0.003922,0.011765,0.007843,0.047059,0.062745,0.007843,0.0,0.043137,0.066667,...,0.003922,0.0,0.043137,0.0,0.0,0.003922,0.0,0.0,0.0,0.0
3,0.0,0.062745,0.0,0.039216,0.0,0.0,0.027451,0.0,0.023529,0.05098,...,0.086275,0.011765,0.0,0.0,0.003922,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.019608,0.0,0.0,0.007843,0.0,0.0,0.023529,0.043137,...,0.0,0.023529,0.05098,0.0,0.0,0.105882,0.301961,0.317647,0.156863,0.019608


In [4]:
X = image_data_pd.iloc[:,1:]
ones = np.ones([len(X),1])
X = np.concatenate((ones,X), axis = 1)
Y = image_data_pd.iloc[:,0:1].values
print(X.shape)
print(Y)
X_train,X_rest,Y_train,Y_rest =  train_test_split(X,Y,test_size=0.4)
X_validate,X_test,Y_validate,Y_test = train_test_split(X_rest,Y_rest,test_size=0.5)

(2880, 785)
[[7.]
 [8.]
 [9.]
 ...
 [5.]
 [9.]
 [7.]]


In [5]:
print(X_train.shape[0])
W = np.zeros((1,len(X_train[0,:])))
print(W.shape)

1728
(1, 785)


In [26]:
logistic_regression = LogisticRegression()
weight_list =[]
cost_list = []

for i in range(10):
    W = np.zeros((1,len(X_train[0,:])))
    print("Learning: ", float(i))
    Y_train_one = (Y_train == float(i)).astype(int)
    weight,cost = logistic_regression.train(X_train,Y_train_one,W,0.01,300,0.01)
    weight_list.append(weight.flatten())
    cost_list.append(cost)

Logistic Regression
Learning:  0.0
Cost:  0.7112669426043642
Cost:  0.6822786553232426
Cost:  0.6792615586015494
Cost:  0.6780761537670915
Learning:  1.0
Cost:  0.710111432440517
Cost:  0.6737549066782935
Cost:  0.6716885234818998
Cost:  0.6709051897954605
Learning:  2.0
Cost:  0.7081731466552008
Cost:  0.702094505625266
Cost:  0.698825565880226
Cost:  0.6973888988849288
Learning:  3.0
Cost:  0.7096396298177997
Cost:  0.7112105675462583
Cost:  0.7071910342262131
Cost:  0.7052697868441494
Learning:  4.0
Cost:  0.7085806395018179
Cost:  0.6990266427031622
Cost:  0.6947982657683732
Cost:  0.6932019943367592
Learning:  5.0
Cost:  0.7073278372742765
Cost:  0.7035979771118896
Cost:  0.7000423842429321
Cost:  0.6985020898276292
Learning:  6.0
Cost:  0.7079251821106157
Cost:  0.6966139970910629
Cost:  0.691777944592922
Cost:  0.6901296136579679
Learning:  7.0
Cost:  0.7093863794352512
Cost:  0.6891380583761313
Cost:  0.6855259121549591
Cost:  0.684238551378084
Learning:  8.0
Cost:  0.706657442

In [27]:
weights = np.array(weight_list)
weights.shape

(10, 785)

In [28]:
weights_data = pd.DataFrame(weights)
weights_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,-0.017949,-0.003,-0.00492,-0.00238,-0.000367,-0.000958,0.000208,0.003088,0.005196,0.00789,...,-0.001189,-0.003185,-0.004328,-0.003749,-0.003844,-0.003653,-0.002709,-0.001738,0.000456,0.001991
1,0.020113,0.001586,0.000428,-0.002097,-0.003105,-0.002292,-0.001406,0.000509,0.002366,0.001386,...,0.001184,0.000917,-0.000746,-0.000594,-0.000204,-0.00069,-0.001089,-0.00066,-0.001121,-0.001867
2,0.009234,-0.002823,-0.004967,-0.005391,-0.005814,-0.006919,-0.004537,0.00036,-0.000579,-0.000819,...,-0.00292,-0.004015,-0.003853,-0.003008,-0.00497,-0.006323,-0.000276,-0.000939,0.004748,0.005652
3,0.007966,-0.001643,-0.004017,-0.001746,0.000261,0.000927,0.002873,0.006653,0.008146,0.005505,...,-0.002357,-0.000611,0.001283,-0.001419,-0.000109,0.00121,-0.000305,0.009905,0.020892,0.01285
4,0.013393,0.000336,-4.3e-05,0.001852,0.005945,0.005029,0.001264,0.00178,0.001645,-0.002979,...,0.007261,0.005225,0.005211,0.004119,0.002151,0.001463,-0.00095,-0.004775,-0.007005,-0.004558


In [29]:
weights_data.to_csv("/home/lognod/MiniML/mini_logistic_with_regularization.csv")

In [30]:
print(weights.shape)
logistic_regression.validate(X_validate,Y_validate,weights)

(10, 785)


2.891084572788009

In [31]:
logistic_regression = LogisticRegression()
weights_2 = pd.read_csv("/home/lognod/MiniML/mini_logistic_with_regularization.csv")
y = np.zeros(10)

# image_array = magic.imread("/home/lognod/Desktop/nhcd/numerals/5/042_02.jpg",as_gray=True)
# pixel_data = (255.0-image_array.flatten())/255.0

prediction = []
for image in X_validate:
    for i in range(10):
        y[i]=logistic_regression.predict(image,weights_2.iloc[i,1:])
    p = np.where(y == np.amax(y))
    prediction.append(int(p[0]))
    


Logistic Regression


In [32]:
Y= Y_validate.flatten().tolist()
Y = list(map(int, Y))


In [33]:
count = 0
for i in range(len(Y)):
    if(prediction[i] == Y[i]):
        count+=1
    

In [34]:
accuracy = (count/len(Y)) * 100

In [35]:
accuracy

83.15972222222221