In [1]:
## Python Package Imports
import numpy as np
import pandas as pd
import pickle

## Custom Module Imports
from activation_functions.SoftMax import SoftMax
from activation_functions.ReLU import ReLU
from loss_functons.mean_square_error import mean_square_error
from activation_functions.LeakyReLU import LeakyReLU
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [11]:
class nn_layer:
    """
    Represents a weight matrix (rows, cols) = (num_neurons, input_size)
    num_neurons is the number of neurons we wish to put in this layer
    input_size is the fixed value defined by the last layer's outputs

    The relationship between input size and number of neurons for multiple layers is ---
    input_size = num_neurons_prev
    input_size_next/output_size = num_neurons
    ... etc
    """
    def __init__(self, input_size, num_neurons, activ_func, scale_inputs):
        self.num_in_n = input_size
        self.num_out_n = num_neurons
        # self.weight_matrix = np.array([np.random.rand(input_size) for _ in range(num_neurons)])
        # Above line has been upgraded to line below
        self.weight_matrix = np.random.randn(self.num_out_n, self.num_in_n)/2
        self.bias = np.random.randn(self.num_out_n)/2
        self.activation_func = activ_func
        self.scale_inputs = scale_inputs


    def batch_input(self, input_matrix):
        if(self.scale_inputs):
            scaler = StandardScaler()
            self.input_matrix = scaler.fit_transform(input_matrix)
        else:
            self.input_matrix = input_matrix
        self.batch_size = input_matrix.shape[0]
        """
        Returns the matrix product [input_matrix] * [weight_matrix]^T of dimensions
        (batch_size, num_in_neurons) * (num_in_neurons, num_out_neurons) = (batch_size, num_out_neurons)
        Where the output columns of the matrix are the output of the i^{th} layer of neurons

        
        (batch_size, num_out_neurons) + (num_out_neurons) is XW^T + bias, where the bias is added row-wise (to each row/neuron layer)
        """     
        self.raw_output = np.dot(self.input_matrix, self.weight_matrix.T) + self.bias
        self.activation_output = self.activation_func.forward(self.raw_output)
        return self.activation_output
    

    def backward(self, error_vector):
        """
        Given the error vector dC/da^(l), returns the new error vector for the next layer, dC/da^(l-1)
        C = cost func
        a^(l) = vector of activation functions at layer l, dim(a^(l))=num_neurons
        z = w*x + b

        Individual parials:
        dC/da_i = error_vector_i = (1)
        da_i/dz_i = self.activation_func.derivative(raw_output[:, i]) = (batch_size,)
        dz_i/dw_ij = (X_1j, X_2j, ..., X_num_inputsj) = self.input_matrix[:, j] = (batch_size,)
        
        dC/da = error_vector = (1,)
        da/dz = a_prime = self.activation_func.derivative(raw_output) = (batch_size, num_out_neurons)
        dz/dw = z_prime = self.input_matrix = (batch_size, num_in_neurons)

        np.outer()

        col_avg ( dC/da * da/dz * dz/dw ) = Grad = (num_out_n)
        """
        # print('batch_size', self.batch_size)
        # print('num_n_in', self.num_in_n)
        # print('num_n_out', self.num_out_n)
        # print('raw output shape', self.raw_output.shape)
        # print('input matrix shape', self.input_matrix.shape)
        # print('activation output shape', self.activation_output.shape)

        eta = 0.0001
        self.num_in_n
        self.num_out_n
        self.batch_size
        dC_da_1 = error_vector # derivative of cost wrt activation function at current layer, a vector indicating the change in cost at this (num_out_n)
        da_dz = self.activation_func.derivative(self.raw_output) # (batch_size, num_out_n)
        dC_dz = da_dz * dC_da_1 # (batch_size, num_out_n)
        dz_dw = self.input_matrix # (batch_size, num_in_n)
        # print('da_dz of (batch_size, num_out_n)', da_dz.shape)
        # print('dz_dw of (batch_size, num_in_n)', dz_dw.shape)
        
        # Below computes tensor dot product along specified axes, here we compute the dot product of tensors along (axis 0,axis 0), then sum along the axis.
        # Note these axis have to be the same length
        # Description of np.tensordot
        # axes=0 gives outer product
        # axes=1 gives inner product
        # axes=2 gives tensor contraction
        dC_dw_avg = np.tensordot(dC_dz, dz_dw, axes=(0,0)) / self.batch_size  # (num_out_n, num_in_n) = dim(W)
        # print('da_dw of size (num_out_n, num_in_n)', da_dw.shape)
        # print('dC_dz shape', dC_dz.shape, 'W shape', self.weight_matrix.shape)
        dC_da_0 = np.matmul(dC_dz, self.weight_matrix) # (batch_size, num_in_n)
        # print('dC_da_0 shape (batch_size, num_out_n)', dC_da_0.shape)
        # sum average gradient across all batches
        dC_da_0_avg = np.sum(dC_da_0, axis=0) / self.batch_size # (num_out_n)
        self.weight_matrix = self.weight_matrix - (eta * dC_dw_avg)
        print(np.max(error_vector))
        return dC_da_0_avg
    

class simple_neural_network:
    """
    Represents a neural network as an array of 'nn_layer' objects
    """
    def __init__(self, input_size):
        self.nn_array = []
        self.input_size = input_size


    def add_layer(self, num_neurons, activ_func, scale_inputs=False):
        """
        New layer must have input size corresponding to previous layer's output size
        num_neurons - is the number of neurons in the current layer
        activ_func - is the activation function that should be applied to the outputs of this layer
        """
        if(len(self.nn_array) == 0):
            self.nn_array.append(nn_layer(self.input_size,
                num_neurons, 
                activ_func,
                scale_inputs=scale_inputs))
        else:
            prev_output_size = self.nn_array[-1].weight_matrix.shape[0]
            self.nn_array.append(nn_layer(
                input_size = prev_output_size, 
                num_neurons = num_neurons, 
                activ_func=activ_func,
                scale_inputs=scale_inputs))


    def describe_network(self):
        # weight matrix shape is (num_neurons, input_size)
        for layer in self.nn_array:
            print(layer)

    def forward_pass(self, input_matrix):
        for i in range(len(self.nn_array)):
            layer = self.nn_array[i]
            input_matrix = layer.batch_input(input_matrix)    
        return input_matrix
    
    def backward_pass(self, error_vector):
        for i in range(len(self.nn_array), 0, -1):
            layer = self.nn_array[i-1]
            error_vector = layer.backward(error_vector)

In [12]:
from sklearn.preprocessing import StandardScaler
diabetes = load_diabetes()
scaler = StandardScaler()
scaler.fit(diabetes.data)
X_transformed = scaler.transform(diabetes.data)
scaler.fit(diabetes.target.reshape(-1,1))
y_transformed = scaler.transform(diabetes.target.reshape(-1,1))
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size=0.2, random_state=42, shuffle=True)
nn = simple_neural_network(10)
nn.add_layer(8, LeakyReLU(0.01))
nn.add_layer(32, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(64, LeakyReLU(0.01))
nn.add_layer(32, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(8, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(1, LeakyReLU(0.01))

# batches
mse_func = mean_square_error()
X_train_batches = [X_train[i: i+10] for i in range(0, len(X_train), 10)]
y_train_batches = [y_train[i:i+10] for i in range(0, len(y_train), 10)]

num_epochs = 100
for i in range(num_epochs):
    # Epoch
    # for i in range(len(X_train_batches)):
    for i in range(2):
        X_train_batch = X_train_batches[i]
        y_train_batch = y_train_batches[i]
        y_pred_batch = nn.forward_pass(X_train_batch)
        nn.backward_pass(mse_func.derivative(y_train_batch, y_pred_batch))
    y_true = y_train
    y_pred = nn.forward_pass(X_train)
    print('MSE Loss:', mse_func.compute(y_true=y_true, y_pred=y_pred))

# scaler.inverse_transform(y_pred.reshape(-1,1))
# print('MSE Loss:', mse_func.compute(y_true=y_true, y_pred=y_pred))
# print('dC/da', mse_func.derivative(y_true, y_pred))


1.7734318334591108
1.7877836385774866
2.1093512146024187
17.283734483003524
74.11187976027036
47.0206231110922
1.5385810461329288
0.9095014746361822
1.0510560616013864
9.172122426301447
44.00994173945523
42.25587293138026
MSE Loss: 3.521676116390247
1.8180347547796463
1.8727075880382285
2.1780451282596487
18.170254353658528
76.49732584145407
47.38182099663592
1.35398344011589
0.6795750616464596
0.8074425148683442
6.730634923929445
30.268339689890535
30.245440593393045
MSE Loss: 3.5173184580897505
1.898289098903507
2.026660140371141
2.3489962442091747
20.02005744616527
83.20512056928655
47.73707619866638
1.1059114903145404
0.42273785821074056
0.5056734910437033
4.001704525454802
18.367501817485913
22.523253730518125
MSE Loss: 3.5179372920082375
2.1924705990020668
2.647175021817054
3.1657786547353797
27.602998728680042
113.89150747224978
54.91982199558155
0.6882111589801786
0.12488079255712403
0.15020287827230444
1.1673129104260438
5.345202853398842
10.106647259583468
MSE Loss: 3.4626607

  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unno

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
MSE Loss: nan


  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_s

In [194]:
# Testing single instance
from sklearn.preprocessing import StandardScaler
diabetes = load_diabetes()
scaler = StandardScaler()
scaler.fit(diabetes.data)
X_transformed = scaler.transform(diabetes.data)
scaler.fit(diabetes.target.reshape(-1,1))
y_transformed = scaler.transform(diabetes.target.reshape(-1,1))
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, test_size=0.2, random_state=42, shuffle=True)
nn = simple_neural_network(10)
nn.add_layer(8, LeakyReLU(0.01))
nn.add_layer(32, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(64, LeakyReLU(0.01))
nn.add_layer(32, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(8, LeakyReLU(0.01), scale_inputs=True)
nn.add_layer(1, LeakyReLU(0.01))

# batches
mse_func = mean_square_error()
X_train_batches = [X_train[i: i+10] for i in range(0, len(X_train), 10)]
y_train_batches = [y_train[i:i+10] for i in range(0, len(y_train), 10)]



X_train_batch = X_train_batches[0]
y_train_batch = y_train_batches[0]
y_pred_batch = nn.forward_pass(X_train_batch)
print(y_pred_batch, y_train_batch)
nn.backward_pass(mse_func.derivative(y_train_batch, y_pred_batch))
print('MSE Loss:', mse_func.compute(y_true=y_train_batch, y_pred=y_pred_batch))
print('dC/da', mse_func.derivative(y_train_batch, y_pred_batch))

in shape (10, 10)
out shape (10, 8)
max 4.0308200787099135
in shape (10, 8)
out shape (10, 32)
max 10.234638768056447
in shape (10, 32)
out shape (10, 64)
max 29.820762473688468
in shape (10, 64)
out shape (10, 32)
max 23.61414052606527
in shape (10, 32)
out shape (10, 8)
max 12.381890547879182
in shape (10, 8)
out shape (10, 1)
max 4.9194369214155005
[[ 4.91943692]
 [-0.11702976]
 [-0.13332933]
 [-0.02297819]
 [-0.10346715]
 [-0.09525626]
 [-0.06595026]
 [-0.06131726]
 [-0.01257931]
 [ 2.93727237]] [[-0.10562178]
 [-0.02770552]
 [ 1.66048019]
 [-0.35235662]
 [-1.20943552]
 [-1.13151925]
 [ 1.67346624]
 [ 1.62152206]
 [ 0.19305723]
 [-0.80686816]]
MSE Loss: 5.0800836176597155
dC/da 1.145956581189623
