In [200]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [201]:
data = pd.read_csv('train.csv')

In [202]:
data = np.array(data)
m, n = data.shape

np.random.shuffle(data)

# cross validation data
data_dev = data[0:1000].T

Y_dev = data_dev[0]
X_dev = data_dev[1:n]

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]

# Normalize the training and validation data
X_train = X_train / 255.0
X_dev = X_dev / 255.0


In [203]:
def __init__params():
    W1 = np.random.randn(10, 784) * .1
    b1 = np.random.rand(10, 1) * .1
    W2 = np.random.randn(10, 10)
    b2 = np.random.rand(10, 1)
    return W1, b1, W2, b2
    
def forward_prop(W1, b1, W2, b2, X):
    z1 = np.dot(W1, X) + b1
    A1 = ReLu(z1)
    z2 = np.dot(W2, A1) + b2
    A2 = softmax(z2)
    return z1, A1, z2, A2

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # Ensure numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)
    
def ReLu(z):
    return np.maximum(0, z)

def one_hot(y):
    one_hot_y = np.zeros((y.size, y.max() + 1)) # Create a zero matrix with size (number of samples, number of classes)
    one_hot_y[np.arange(y.size), y] = 1 # Set the corresponding class indices to 1
    return one_hot_y.T
    
def deriv_ReLu(z):
    return z > 0

def back_prop(z1, A1, z2, A2, W2, x, y):
    m = y.size
    one_hot_y = one_hot(y)
    dz2 = A2 - one_hot_y
    dW2 = 1 / m * np.dot(dz2, A1.T)
    db2 = 1 / m * np.sum(dz2, axis=1, keepdims=True)
    dz1 = np.dot(W2.T, dz2) * deriv_ReLu(z1)
    dW1 = 1 / m * np.dot(dz1, x.T)
    db1 = 1 / m * np.sum(dz1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

In [214]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

# def gradient_descent(X, Y, iterations, alpha):
#     W1, b1, W2, b2 = __init__params()
#     for i in range(iterations):
#         z1, A1, z2, A2 = forward_prop(W1, b1, W2, b2, X)
#         dw1, db1, dw2, db2 = back_prop(z1, A1, z2, A2, W2, X, Y)
#         W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dw1, db1, dw2, db2, alpha)
#         if i % 200 == 0:
#             print("Iteration: ", i)
#             print("Accuracy: ", get_accuracy(get_predictions(A2), Y))
#     return W1, b1, W2, b2

def gradient_descent(X_train, Y_train, X_dev, Y_dev, iterations, alpha):
    # Initialize parameters (weights and biases)
    W1, b1, W2, b2 = __init__params()
    
    for i in range(iterations):
        # Forward propagation on training data
        z1, A1, z2, A2 = forward_prop(W1, b1, W2, b2, X_train)
        
        # Backward propagation to compute gradients
        dW1, db1, dW2, db2 = back_prop(z1, A1, z2, A2, W2, X_train, Y_train)
        
        # Update parameters using the gradients
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        
        # Print accuracies every 50 iterations
        if i % 200 == 0:
            # Training set accuracy
            print(f"Iteration: {i}")
            train_predictions = get_predictions(A2)
            train_accuracy = get_accuracy(train_predictions, Y_train)
            print(f"Training Accuracy: {train_accuracy}")
            
            # Evaluate on development set (dev set) without updating weights
            _, _, _, A2_dev = forward_prop(W1, b1, W2, b2, X_dev)
            dev_predictions = get_predictions(A2_dev)
            dev_accuracy = get_accuracy(dev_predictions, Y_dev)
            print(f"Dev Accuracy: {dev_accuracy}")
            
    return W1, b1, W2, b2

# Assuming your data is preprocessed and split into X_train, Y_train, X_dev, Y_dev
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, X_dev, Y_dev, 2000, 0.1)


        

Iteration: 0
[9 9 9 ... 1 9 9] [4 7 1 ... 6 0 8]
Training Accuracy: 0.09875609756097561
[3 0 3 3 3 3 3 3 3 3 0 4 3 3 3 3 3 3 3 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 8 3 3 8 3 2 3 3 0 3 3 0 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 7 3 3 3 3 3
 3 0 3 3 3 3 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3
 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 2 3 1 3 3 2
 3 3 0 3 3 3 3 3 2 8 2 3 3 3 3 3 3 3 3 3 3 3 3 4 1 3 3 3 3 3 3 3 3 0 3 3 3
 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 0
 3 0 3 3 3 0 3 0 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 2 3 3 0 2 0 3 3 3 1 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 7 0 3 3 3 3 2 3 9 3 3 3 3 3 3 3 3
 3 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 3 7 2 8 3 3 3 4 3 3 2 3 3 2 8 3 3 2 3
 3 7 3 3 3 3 3 3 0 3 0 3 3 3 3 3 3 3 3 3 2 3 2 3 3 3 2 3 3 3 3 2 3 3 3 3 3
 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1
 3 3 3 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 0 3 3 3 3 3
 3 7 3 0 3 3

Iteration: 400
[4 7 1 ... 6 0 8] [4 7 1 ... 6 0 8]
Training Accuracy: 0.8664634146341463
[2 6 0 1 0 0 7 0 5 4 4 4 7 6 5 3 0 3 7 2 1 3 2 8 9 7 5 6 4 6 0 8 3 7 3 7 0
 7 8 6 1 4 3 7 3 2 6 6 8 8 1 1 4 8 9 7 6 4 1 9 6 6 1 5 3 9 2 0 7 5 8 7 3 1
 7 6 4 0 1 1 9 2 9 2 7 5 8 7 0 1 2 0 1 4 1 3 3 7 5 6 0 1 7 0 1 2 3 2 7 6 1
 6 8 4 5 3 1 7 9 3 8 8 8 5 6 0 3 0 9 7 5 5 4 0 6 8 4 4 6 3 7 3 7 4 7 6 6 2
 6 1 2 9 1 1 1 6 8 9 8 6 5 1 6 2 5 2 5 3 4 3 0 9 4 1 2 2 3 7 6 4 2 8 4 0 8
 6 8 0 6 3 8 7 6 3 3 2 5 5 7 3 5 0 0 5 1 5 2 7 3 8 4 8 6 0 5 9 3 6 4 1 9 9
 5 4 0 1 9 4 8 9 5 0 5 1 8 0 1 4 0 1 8 8 0 6 0 4 3 0 8 3 1 9 5 5 2 3 3 6 3
 7 5 1 1 2 0 4 2 6 4 1 9 6 9 0 0 5 6 6 3 9 9 3 0 8 5 8 0 4 6 3 9 0 4 3 6 1
 3 4 8 8 5 6 8 2 2 0 8 4 1 7 1 5 1 4 1 9 5 9 7 4 5 4 4 2 1 2 9 8 3 2 9 7 2
 2 3 7 5 5 9 9 1 3 3 6 0 9 4 0 0 6 0 5 4 3 9 8 1 3 0 7 1 4 4 6 8 6 2 8 4 0
 3 5 7 7 3 0 9 8 0 9 0 2 5 0 8 9 4 3 0 1 9 9 2 8 1 2 6 4 0 6 3 7 2 2 8 2 9
 1 3 0 1 9 0 5 9 3 7 9 0 2 9 4 5 6 8 1 7 6 5 6 0 1 0 4 9 6 9 0 6 3 6 9 7 5
 7 2 8 0 6 

Iteration: 800
[4 7 1 ... 6 0 8] [4 7 1 ... 6 0 8]
Training Accuracy: 0.8862682926829268
[2 6 0 1 0 0 7 0 5 4 4 4 7 6 5 3 0 3 7 2 1 3 2 8 9 7 5 6 4 6 0 8 3 7 3 7 0
 7 8 6 1 4 3 7 3 2 6 6 8 5 1 1 4 8 9 7 4 4 1 9 6 6 1 5 3 9 2 0 7 5 8 7 3 1
 7 6 4 0 1 1 9 2 9 2 7 8 8 7 0 1 2 0 1 4 1 5 3 7 5 6 0 1 7 0 1 2 5 2 7 6 1
 6 8 4 5 3 1 7 9 5 8 8 8 5 6 0 3 0 9 7 5 5 8 0 6 8 4 4 6 3 7 3 7 4 7 6 6 2
 6 1 2 9 1 1 1 6 8 9 8 6 5 1 6 2 5 2 5 3 4 3 0 9 4 1 2 2 3 7 6 4 2 8 4 0 8
 6 8 0 6 3 8 7 6 7 3 2 5 5 7 3 3 0 0 5 1 5 2 7 3 8 6 8 6 0 5 9 3 6 4 1 9 9
 5 9 0 1 9 4 8 9 5 0 5 1 8 0 1 4 0 1 8 8 0 6 5 4 3 0 8 3 1 9 5 9 2 3 3 6 3
 7 5 1 1 2 0 4 2 6 4 1 9 6 9 0 0 5 6 6 3 9 9 3 0 8 5 8 0 4 6 3 9 7 4 3 6 1
 3 4 8 8 5 6 8 2 2 0 8 4 1 7 1 5 1 4 1 9 5 9 7 4 5 7 4 2 1 2 9 5 3 2 9 7 2
 2 3 7 5 5 4 9 1 3 3 6 0 9 4 0 0 6 0 5 4 3 9 8 1 3 0 7 1 4 4 6 8 6 2 8 4 0
 5 5 7 7 3 0 9 8 6 9 0 2 5 0 8 9 4 3 0 1 9 9 2 8 1 2 6 4 0 6 3 7 2 2 8 2 9
 1 3 0 1 9 0 5 9 3 7 9 0 2 9 4 5 6 8 1 7 6 5 6 0 1 0 4 9 6 9 9 6 3 6 9 7 5
 7 2 8 0 6 

Iteration: 1200
[4 7 1 ... 6 0 8] [4 7 1 ... 6 0 8]
Training Accuracy: 0.899219512195122
[2 6 0 1 0 0 7 0 5 4 4 4 7 6 5 3 0 3 7 2 1 3 2 8 9 7 5 6 4 6 0 8 3 7 3 7 0
 7 8 6 1 4 3 7 3 2 2 6 8 5 1 1 4 8 1 7 4 4 1 9 6 6 1 5 3 9 2 0 7 5 8 7 3 1
 7 6 4 0 1 1 9 2 9 2 7 8 8 7 0 1 2 0 1 4 1 5 3 7 5 6 0 1 7 0 1 2 5 2 7 8 1
 6 8 4 5 3 1 7 9 3 8 8 8 5 6 0 3 0 9 7 5 5 8 0 6 8 4 4 6 5 7 3 7 4 7 6 6 2
 6 1 2 9 1 1 1 6 8 9 8 6 5 1 6 2 5 2 5 3 4 3 0 9 6 3 2 2 3 7 6 4 2 8 4 0 8
 6 8 0 6 3 8 7 6 3 3 2 5 5 7 3 3 0 0 5 1 5 2 7 3 8 6 8 6 0 5 9 3 6 4 1 9 9
 5 4 0 1 9 4 8 9 5 0 5 1 8 0 1 4 0 1 8 8 0 6 5 4 3 0 8 3 1 9 5 9 2 3 3 6 1
 7 5 1 1 2 0 4 2 6 4 1 9 6 9 0 0 5 6 6 3 9 9 3 0 8 5 8 0 4 6 3 9 7 4 8 6 1
 3 4 8 8 5 6 8 2 2 0 8 4 1 7 1 5 1 4 1 9 5 9 7 4 5 4 4 2 1 2 9 5 3 2 9 7 2
 2 3 7 5 5 4 9 1 3 3 6 0 9 4 0 0 6 0 5 4 3 9 8 1 3 0 7 1 4 4 6 8 6 2 8 4 0
 5 5 7 7 3 0 9 8 6 9 0 2 5 0 8 9 4 3 0 1 9 8 2 8 1 2 6 4 0 6 3 7 2 2 8 2 9
 1 3 0 1 9 0 5 9 3 7 9 0 2 9 4 5 6 8 1 7 6 5 6 0 1 0 4 9 6 9 9 6 3 6 9 7 5
 7 2 8 0 6 

Iteration: 1600
[4 7 1 ... 6 0 8] [4 7 1 ... 6 0 8]
Training Accuracy: 0.9057317073170732
[2 6 0 1 0 0 7 0 5 4 4 4 7 6 5 3 0 3 7 2 1 3 2 8 9 7 5 6 4 6 0 8 3 7 3 7 0
 7 8 6 1 4 3 7 3 2 2 6 8 5 1 1 4 8 1 7 4 4 1 9 6 6 1 5 3 9 2 0 7 5 8 7 3 1
 7 6 4 0 1 1 9 2 9 2 7 8 8 7 0 1 2 0 1 4 1 5 3 7 5 6 0 1 7 0 1 2 5 2 7 8 1
 6 8 4 5 3 1 7 9 3 5 8 8 5 6 0 3 0 9 7 5 5 8 0 6 8 4 4 6 5 7 3 7 4 7 6 6 2
 6 1 2 9 1 1 1 6 8 9 8 6 5 1 6 2 5 2 5 3 4 3 0 9 6 3 2 2 3 7 6 4 2 8 4 0 2
 6 8 0 6 3 8 7 6 3 3 2 5 5 7 3 3 0 0 5 1 5 2 7 3 8 6 8 6 0 5 9 3 6 4 1 9 9
 5 4 0 1 9 4 8 9 5 0 5 1 8 0 1 4 0 1 8 8 0 6 5 4 3 0 8 3 1 9 5 9 2 3 3 6 1
 7 5 1 1 2 0 4 2 6 4 1 9 6 9 0 0 5 6 6 3 9 9 3 0 8 5 8 0 4 6 3 9 7 4 8 6 1
 3 4 8 8 5 6 8 2 2 0 8 4 1 7 1 8 1 4 1 9 5 9 7 4 5 4 4 2 1 2 9 8 3 2 9 7 2
 2 3 7 5 5 4 9 1 3 3 6 0 9 4 0 0 6 0 5 4 3 9 8 1 3 0 7 1 4 4 6 8 6 2 8 4 0
 5 5 7 7 3 0 9 8 6 9 0 2 5 0 8 9 4 3 0 1 9 9 2 8 1 2 6 4 0 6 3 7 2 2 8 2 9
 1 3 0 1 9 0 5 9 3 7 9 0 2 9 4 5 6 8 1 7 6 5 6 0 1 0 4 9 6 9 9 6 3 6 9 7 5
 7 2 8 0 6

In [207]:
W1, b1, W2, b2, = gradient_descent(X_train, Y_train, 2000, .1)

TypeError: gradient_descent() missing 2 required positional arguments: 'iterations' and 'alpha'

In [112]:
# class Layer_Dense:
#     def __init__(self, n_inputs, n_neurons):
#         self.weights = np.random.randn(n_inputs, n_neurons)
#         self.biases = np.zeros((1, n_neurons))
#     def forward(self, inputs):
#         self.inputs = inputs
#         self.outputs = np.dot(inputs, self.weights) + self.biases
        
#     def backward(self, dvalues):
#         # Gradients on parameters
#         self.dweights = np.dot(self.inputs.T, dvalues)
#         self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
#         # Gradient on the values to pass to the previous layer
#         self.dinputs = np.dot(dvalues, self.weights.T)
        
# class ReLu:
#     def forward(self, inputs):
#         self.inputs =inputs
#         self.outputs = np.maximum(0, inputs)
        
#     def backward(self, dvalues):
#         self.dinputs = dvalues.copy()

# class Loss:
#     def calculate(self, outputs, y):
#         sample_losses = self.forward(outputs, y)
#         batch_loss = np.mean(sample_losses)
#         return batch_loss
    
# class SoftMax:
#     def forward(self, inputs):
#         self.inputs = inputs
#         exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
#         self.outputs = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        
#     def backward(self, dvalues):
#         samples = len(dvalues)
#         self.dinputs = self.outputs.copy()
#         self.dinputs[range(samples), dvalues] -= 1
#         self.dinputs /= samples

# # Modify the backward pass to use categorical cross-entropy loss gradient
# class Loss_CategoricalCrossentropy(Loss):
#     def forward(self, y_pred, y_true):
#         samples = len(y_pred)
#         y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)
        
#         if len(y_true.shape) == 1:
#             correct_confidences = y_pred_clipped[range(samples), y_true]
#         elif len(y_true.shape) == 2:
#             correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)
        
#         negative_log_likelihoods = -np.log(correct_confidences)
#         return negative_log_likelihoods

#     def backward(self, y_pred, y_true):
#         samples = len(y_pred)
#         y_pred_clipped = np.clip(y_pred, 1e-7, 1-1e-7)

#         if len(y_true.shape) == 1:
#             self.dinputs = y_pred_clipped.copy()
#             self.dinputs[range(samples), y_true] -= 1
#         elif len(y_true.shape) == 2:
#             self.dinputs = y_pred_clipped - y_true
        
#         self.dinputs /= samples

# class Optimizer_Adam:
#     def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
#         self.learning_rate = learning_rate
#         self.beta_1 = beta_1
#         self.beta_2 = beta_2
#         self.epsilon = epsilon
#         self.iterations = 0
#         self.m = None
#         self.v = None

#     def update_params(self, layer):
#         # If m and v are not initialized, initialize them with zeros
#         if self.m is None:
#             self.m = {'weights': np.zeros_like(layer.weights), 'biases': np.zeros_like(layer.biases)}
#             self.v = {'weights': np.zeros_like(layer.weights), 'biases': np.zeros_like(layer.biases)}

#         # Update time step
#         self.iterations += 1

#         # Momentum (moving average of the gradients)
#         self.m['weights'] = self.beta_1 * self.m['weights'] + (1 - self.beta_1) * layer.dweights
#         self.m['biases'] = self.beta_1 * self.m['biases'] + (1 - self.beta_1) * layer.dbiases

#         # RMSprop (moving average of squared gradients)
#         self.v['weights'] = self.beta_2 * self.v['weights'] + (1 - self.beta_2) * layer.dweights**2
#         self.v['biases'] = self.beta_2 * self.v['biases'] + (1 - self.beta_2) * layer.dbiases**2

#         # Bias correction
#         m_hat_weights = self.m['weights'] / (1 - self.beta_1**self.iterations)
#         m_hat_biases = self.m['biases'] / (1 - self.beta_1**self.iterations)
#         v_hat_weights = self.v['weights'] / (1 - self.beta_2**self.iterations)
#         v_hat_biases = self.v['biases'] / (1 - self.beta_2**self.iterations)

#         # Update weights and biases
#         layer.weights -= self.learning_rate * m_hat_weights / (np.sqrt(v_hat_weights) + self.epsilon)
#         layer.biases -= self.learning_rate * m_hat_biases / (np.sqrt(v_hat_biases) + self.epsilon)


In [113]:
# # Initialize layers and optimizer
# dense1 = Layer_Dense(X_train.shape[0], 64)
# activation1 = ReLu()
# dense2 = Layer_Dense(64, 10)
# softmax = SoftMax()
# loss_function = Loss_CategoricalCrossentropy()
# optimizer = Optimizer_Adam(learning_rate=0.001)

# # Number of epochs (iterations)
# epochs = 100

# for epoch in range(epochs):
#     # Forward pass
#     dense1.forward(X_train.T)
#     activation1.forward(dense1.outputs)
#     dense2.forward(activation1.outputs)
#     softmax.forward(dense2.outputs)
    
#     # Loss calculation
#     loss = loss_function.calculate(softmax.outputs, Y_train)
    
#     # Accuracy
#     predictions = np.argmax(softmax.outputs, axis=1)
#     accuracy = np.mean(predictions == Y_train)
    
#     print(f'Epoch: {epoch}, Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')
    
#     # Backward pass
#     loss_function.backward(softmax.outputs, Y_train)
#     softmax.backward(loss_function.dinputs)
#     dense2.backward(softmax.dinputs)
#     activation1.backward(dense2.dinputs)
#     dense1.backward(activation1.dinputs)
    
#     # Update weights and biases
#     optimizer.update_params(dense1)
#     optimizer.update_params(dense2)

Epoch: 0, Loss: 14.563770923734609, Accuracy: 9.63%


IndexError: arrays used as indices must be of integer (or boolean) type