In [1]:
import numpy as np
import pandas as pd

In [2]:
def relu(x):
    return np.maximum(0,x)

def softmax(x):
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis= 1, keepdims = True)

In [3]:
def forward(X, W1, W2):
    z1 = np.dot(X,W1)
    a1 = relu(z1)
    z2 = np.dot(a1, W2)
    y_hat = softmax(z2)
    return y_hat, a1, z1

In [4]:
def compute_loss(y_hat, y):
    loss = -np.mean(y * np.log(y_hat))
    return loss

In [5]:
def backprop(X, y, y_hat, a1, z1, W1, W2):
    m = X.shape[0]
    dL_dz2 = (y_hat - y) / m
    dL_dW2 = np.dot(a1.T, dL_dz2)
    dL_da1 = np.dot(dL_dz2, W2.T)
    dL_dz1 = dL_da1 * (z1 > 0)
    dL_dW1 = np.dot(X.T, dL_dz1)
    return dL_dW1, dL_dW2

In [6]:
import random

input_size = 10
hidden_size = 100
output_size = 3
learning_rate = 0.01

W1 = np.random.randn(input_size, hidden_size)
W2 = np.random.randn(hidden_size, output_size)
print('Weights W1')
print(W1)
print('Weights W2')
print(W2)

X_train = np.random.randn(1000, 10)
y_train = np.random.randint(0,3, size=(1000,))
print('X_train')
print(X_train)
print('y_train')
print(y_train)

num_classes =3
y_train_one_hot = np.eye(num_classes)[y_train]
print('y_train_one_hot')
print(y_train_one_hot)

Weights W1
[[-1.00304098e+00 -2.28711019e-01 -2.53147788e-01  2.19449290e+00
   3.11255086e-01 -5.46425593e-01  1.44922005e+00  1.55776227e+00
   1.38279340e+00  1.52318253e+00 -8.35736065e-01 -1.15515399e+00
   5.49900361e-01  2.27910519e-01 -8.93099715e-01 -1.14986616e+00
   2.65460772e-01 -1.28966833e+00 -3.11862359e+00  1.02247315e+00
  -9.61626828e-01 -1.70041808e-01 -4.29094322e-02  5.94001559e-01
  -4.86282179e-01  3.76047407e-01 -7.50654239e-01 -1.01574115e-02
  -5.63713339e-01  8.46817958e-01 -1.22863993e-01 -2.02535333e-01
  -1.69348745e+00  1.67251218e-01  2.34900076e+00 -1.25709509e+00
   1.01514538e+00  1.23449285e+00  1.27360854e+00 -3.82778749e-02
   5.74380065e-01 -1.55331842e+00  3.55116601e-02  1.21909230e+00
   4.80750987e-01  2.25176201e-01  3.30920026e-01 -1.40103670e+00
  -1.68935230e+00 -2.17768370e+00 -1.35464964e+00 -1.06016032e+00
  -3.80749892e-01 -5.09897599e-01 -4.06194767e-01  8.69151004e-01
   6.69350036e-01  1.90342191e-01 -6.45953867e-01  2.00356155e-01

In [8]:
for epoch in range (1000):
    y_hat, a1, z1 = forward(X_train, W1, W2)
    #print("Predictions")
    #print(y_hat)
    
    loss = compute_loss(y_hat, y_train_one_hot)
    
    if epoch % 50 == 0:
        print('Epoch: ', epoch, 'Loss: ', loss)
        
    dL_dw1, dL_dw2 = backprop(X_train, y_train_one_hot, y_hat, a1, z1, W1, W2)
    
    W1 -= learning_rate * dL_dw1
    W2 -= learning_rate * dL_dw2
    
    predictions = np.argmax(y_hat, axis = 1)
    #print('Prediction Class Labels')
    #print(predictions)

Epoch:  0 Loss:  1.4009319656697152
Epoch:  50 Loss:  1.2375005536551198
Epoch:  100 Loss:  1.0999452218365293
Epoch:  150 Loss:  0.9849406836606013
Epoch:  200 Loss:  0.8904651458900351
Epoch:  250 Loss:  0.8153684726183686
Epoch:  300 Loss:  0.7552359278139846
Epoch:  350 Loss:  0.70707016238362
Epoch:  400 Loss:  0.6674982802681101
Epoch:  450 Loss:  0.6348078092753713
Epoch:  500 Loss:  0.6077511084170252
Epoch:  550 Loss:  0.5852771759410581
Epoch:  600 Loss:  0.5661838409612262
Epoch:  650 Loss:  0.5498386904904298
Epoch:  700 Loss:  0.53567844910454
Epoch:  750 Loss:  0.5231632954595764
Epoch:  800 Loss:  0.5120892534369176
Epoch:  850 Loss:  0.5020509773821786
Epoch:  900 Loss:  0.49283601392487714
Epoch:  950 Loss:  0.4844287141577714
