In [69]:
import numpy as np
import pandas as pd
import PIL.Image as Image

In [70]:
IMAGE_SIZE = (28, 28)
EPOCH = 5
LEARNING_RATE = 0.000000001

# Import data
data = pd.read_csv(r"../digit-recognizer/train.csv")
data_test = pd.read_csv(r"../digit-recognizer/test.csv")    

# Get labels and image array from data
labels : np.ndarray = data.values[:, 0]
images : np.ndarray = data.values[:, 1:].astype('uint8')

images_test : np.ndarray = data_test.values.astype('uint8')

![title](image.png)

In [71]:
class Model:
    def __init__(self, input, hidden, output, lr):
        # Initialize weight and bias
        self.w_input = np.random.normal(size=((input, output)))
        # self.b_input = np.zeros(output)
        # self.w_hidden = np.random.rand(hidden, output)
        # self.b_hidden = np.random.rand(output)
        
        self.lr = lr

    def sigmoid(self, x, derive=False):
        if derive:
            return self.sigmoid(x) * (1-self.sigmoid(x))
        return 1 / (1 + np.exp(-x))

    def softmax(self, x : np.ndarray):
        new_x : np.ndarray = np.zeros(10)
        # x = np.subtract(x, np.max(x))
        for i in range(10):
            new_x[i] = np.exp(x[i]) / np.sum(np.exp(x), axis=0)
            # print(np.exp(x[i]))
        # print(new_x)
        return new_x 

    def forward(self, x):
        x = np.dot(x, self.w_input)
        # x = self.sigmoid(x)
        x = self.softmax(x)
        
        return x

    def train(self, input: np.ndarray, pred: np.ndarray, label: np.ndarray, yes=False):
        # ============================================
        # # sigmoid with MSELoss (Mean Squared Error)
        # z = weight * input
        # pred = sigmoid(z) {a(L)}
        # cost = (pred-label)**2
        # z = np.dot(input, self.w_input)
        # d_pred = self.sigmoid(z, derive=True)     
        # d_cost = -2*(pred-label)
        
        # # Weight input gradient descent
        # # y1 = w_input * input = z
        # step = z * d_pred  * d_cost * self.lr
        # np.subtract(self.w_input, step, out=self.w_input)
        # ============================================
        
        # ============================================
        # softmax with Cross Entropy Loss
        
        # label only have one 1 value (the correct one), so technically, -ln(pred) is also correct
        # cost = -sum(label * ln(pred)) = -ln(pred)
        # pred = softmax(out)
        # out = w * input
        # d_cost = -1 / (pred)
        # d_pred = pred * (1 - pred) # From StatQuest
        # d_cost * d_pred = pred[i] - 1
        # self.w_input[j, i] -= step

        d_cost = -1 / pred # d_cost
        d_pred = 0
        for i, p in enumerate(pred): # d_pred (derivative of softmax)
            # d_softmax/d_y1 = (e**y1 * (e**y2 + e**y3 + ...)) / (e**y1 + e**y2 + e**y3 + ...)**2
            d_pred = np.exp(p) * np.sum(np.exp(pred[pred != p])) / np.sum(np.exp(pred))**2
            for j in range(len(input)):
                d_x = input[j]
                self.w_input[j, i] -= d_cost[i] * d_pred * d_x * self.lr

        if (yes):
            print("Cost: ", np.log(pred[np.argmax(label)]))
            # print("Step: ", step)
            # print("d_pred: ", d_pred)
            # print("d_cost: ", d_cost)
            
        # ============================================

In [72]:
model = Model(IMAGE_SIZE[0]*IMAGE_SIZE[1], 16, 10, LEARNING_RATE)

for i in range(EPOCH):
    wrong = 0
    for j, d in enumerate(images):
        # High input breaks the neural network
        # Problem such as NaN, inf, etc, because e sucks
        d = d / 255
        
        pred = model.forward(d)
        
        # 0 will also break log e, because e sucks
        # Clip pred so the value only ranges around -1e7 and 1e7
        pred = np.clip(pred, 1e-5, 1e5)

        if (np.argmax(pred) != labels[j]):
            wrong += 1
            
        label = np.zeros(10)
        label[labels[j]] = 1
        model.train(d, pred, label)

        if ((j+1) % 200 == 0):
            print(model.w_input)
            model.train(d, pred, label, True)
            print(f"Data {j+1}: Wrong = {wrong}, Accuracy: {100-wrong/j*100}%")

    print(f"Epoch: {i+1} --> Wrong: {wrong}, Accuracy: {100-wrong / labels.size * 100}%\n")    

[[ 1.17410333 -0.2019409  -0.73265736 ...  0.41431706  1.24183869
   0.45326292]
 [ 0.70090448 -0.0928412  -1.18342639 ... -0.69657885 -0.76910717
  -0.02050563]
 [-1.57341618  0.07243978  0.33982243 ... -0.75974629 -0.33878717
  -1.57016816]
 ...
 [-0.86086794 -1.56644917 -0.67923678 ...  0.57748175 -0.7218526
  -0.08903769]
 [ 0.29320947  1.87318492 -1.10875534 ... -0.22560674  1.35670438
  -1.48744519]
 [-0.0594869  -1.21783297  0.69436731 ... -1.34097032  0.60382342
   2.21919131]]
Cost:  11.512925464970229
d_pred:  0.027076976878997527
d_cost:  [-1.00000000e+05 -1.00000000e+05 -1.00000000e+05 -1.00000000e+05
 -1.00017092e+00 -5.96252885e+03 -1.00000000e+05 -1.00000000e+05
 -1.00000000e+05 -1.00000000e+05]
Data 200: Wrong = 193, Accuracy: 3.0150753768844254%
[[ 1.17410333 -0.2019409  -0.73265736 ...  0.41431706  1.24183869
   0.45326292]
 [ 0.70090448 -0.0928412  -1.18342639 ... -0.69657885 -0.76910717
  -0.02050563]
 [-1.57341618  0.07243978  0.33982243 ... -0.75974629 -0.33878717

KeyboardInterrupt: 

In [None]:
# testing = np.array([
#     [1, 1, 1],
#     [1, 0, 1],
#     [0, 1, 1],
#     [0, 0, 1],
#     [0, 0, 0],
#     [1, 0, 0],
#     [1, 1, 0]
# ])

# label = np.array(
#     [1, 1, 1, 0, 0, 0, 1]
# )

# model = Model(3, 0, 1, LEARNING_RATE)
# for i, d in enumerate(testing):
#     res = float(model.forward(d))
    
#     print(res)
#     model.train(d, res, label[i])