# Introduction
Here, we'll explore neural networks applied to the fashion-mnist problem.

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
import csv
import pandas as pd

In [8]:
# Functions

def formatArray(dataFrame, columnToExtract):
    """Separate one column of a DataFrame from the remaining columns.

    Parameters:
        dataFrame: pandas DataFrame; its ``.values`` array is what gets split.
        columnToExtract: positional index of the column to pull out.

    Returns:
        ``(params, target)`` where ``target`` is the extracted column and
        ``params`` is the value array with that column removed.
    """
    values = dataFrame.values
    remaining_columns = np.delete(values, columnToExtract, axis=1)
    extracted_column = values[:, columnToExtract]
    return remaining_columns, extracted_column

def loadFashionTrainData():
    """Read the Fashion-MNIST training CSV (path relative to the working directory) into a DataFrame."""
    train_csv_path = "fashion-mnist-dataset/fashion-mnist_train.csv"
    return pd.read_csv(train_csv_path)

def loadFashionTestData():
    """Read the Fashion-MNIST test CSV (path relative to the working directory) into a DataFrame."""
    test_csv_path = "fashion-mnist-dataset/fashion-mnist_test.csv"
    return pd.read_csv(test_csv_path)

def split_train_test(data, test_ratio):
    """Deterministically shuffle ``data`` and split it into train and test parts.

    Parameters:
        data: pandas DataFrame (anything supporting ``len`` and ``.iloc``).
        test_ratio: fraction of rows assigned to the test part; the test size
            is ``int(len(data) * test_ratio)``.

    Returns:
        ``(train, test)`` selections of ``data``.
    """
    # A private generator seeded with 42 yields the same permutation as
    # seeding the global generator, but leaves numpy's global random state
    # untouched for the rest of the notebook.
    rng = np.random.RandomState(42)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# The dataset
First and foremost, we'll open the train and test data. The training data is split to obtain a validation set, and the target values are also separated from the original data.

In [9]:
# Load both CSV files as DataFrames.
fashionTrainDataset = loadFashionTrainData()
fashionTestDataset = loadFashionTestData()
# Hold out 20% of the training rows as a validation DataFrame.
fashionTrain, fashionValidation = split_train_test(fashionTrainDataset, 0.2)
# Column 0 is split off as the target; from here on fashionTrain is a numpy
# array, not a DataFrame. fashionValidation is left as a DataFrame here.
fashionTrain, fashionTarget = formatArray(fashionTrain, 0)

# Quick sanity check of the first rows and of the container type.
print (fashionTrain[:5])
print (type(fashionTrain))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
<class 'numpy.ndarray'>


# Activation and Softmax Functions
First, we'll start by implementing some useful functions seen in class

In [11]:
def sigmoid(n):
    """Logistic sigmoid ``1 / (1 + e^-n)``, element-wise for array input.

    Parameters:
        n: scalar or numpy array.

    Returns:
        Value(s) in the interval (0, 1) with the same shape as ``n``.
    """
    # The original called a bare `exp`, which is not defined anywhere in the
    # notebook; numpy's exp also works element-wise on arrays.
    return 1 / (1 + np.exp(-n))

def derivative_sigmoid(n):
    """Derivative of the sigmoid evaluated at ``n``: ``s * (1 - s)`` with ``s = sigmoid(n)``."""
    activation = sigmoid(n)
    return activation * (1 - activation)

In [13]:
def relu(n):
    """Rectified linear unit: ``0`` for negative input, the input itself otherwise.

    Parameters:
        n: scalar or array-like.

    Returns:
        For a scalar, ``0`` or ``n``. For array-like input, a numpy array of
        the same shape with negative entries replaced by zero.
    """
    if np.ndim(n) == 0:
        return 0 if n < 0 else n
    # `if n < 0` is ambiguous for arrays and raises, so arrays are handled
    # element-wise instead.
    return np.maximum(n, 0)

def derivative_relu(n):
    """Derivative of ReLU: ``0`` where the input is negative, ``1`` elsewhere (including at 0).

    Parameters:
        n: scalar or array-like.

    Returns:
        ``0`` or ``1`` for a scalar; an array of zeros and ones with the
        input's shape for array-like input.
    """
    if np.ndim(n) == 0:
        return 0 if n < 0 else 1
    # Element-wise form, because `if n < 0` raises for arrays.
    return np.where(np.asarray(n) < 0, 0, 1)

In [14]:
def leaky_relu(n, negative_slope=0.01):
    """Leaky ReLU: the input itself when positive, ``negative_slope * input`` otherwise.

    Parameters:
        n: scalar or array-like.
        negative_slope: multiplier applied to non-positive inputs
            (defaults to the previously hard-coded 0.01).

    Returns:
        A scalar for scalar input; a numpy array of the input's shape for
        array-like input.
    """
    if np.ndim(n) == 0:
        return n if n > 0 else negative_slope * n
    # Element-wise form, because `if n > 0` raises for arrays.
    values = np.asarray(n)
    return np.where(values > 0, values, negative_slope * values)

def derivative_leaky_relu(n, negative_slope=0.01):
    """Derivative of leaky ReLU: ``negative_slope`` where the input is negative, ``1`` elsewhere.

    Parameters:
        n: scalar or array-like.
        negative_slope: slope used for negative inputs (defaults to the
            previously hard-coded 0.01).

    Returns:
        A scalar for scalar input; a numpy array of the input's shape for
        array-like input.
    """
    if np.ndim(n) == 0:
        return negative_slope if n < 0 else 1
    # Element-wise form, because `if n < 0` raises for arrays.
    return np.where(np.asarray(n) < 0, negative_slope, 1.0)

In [15]:
def softmax(n):
    """Softmax over the last axis of ``n``.

    Parameters:
        n: 1-D array of scores, or an N-D array whose last axis holds the
           scores of one sample (e.g. one row per sample).

    Returns:
        Array of the same shape; entries are non-negative and sum to 1 along
        the last axis.
    """
    scores = np.asarray(n, dtype=float)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large scores.
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    # The original divided by np.exp(x) with `x` undefined, and summed over
    # axis 0, which for a (samples, classes) matrix normalizes across samples.
    return exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

## Forward Propagation
In this section, we define forward propagation related functions.

In [2]:
def initialize_1hl(input_dimension,hidden_layer_1_neurons, output_dimension):
    """Create the parameter dictionary for a network with one hidden layer.

    Seeds numpy's global generator with 0, then draws standard-normal weights
    scaled by ``1 / sqrt(fan_in)``; biases start at zero.

    Returns:
        dict with keys ``'w1'``, ``'w2'``, ``'b1'``, ``'b2'``.
    """
    np.random.seed(0)
    # Draw order (w1 before w2) is kept so the seeded values stay the same.
    hidden_weights = np.random.randn(input_dimension, hidden_layer_1_neurons) / np.sqrt(input_dimension)
    output_weights = np.random.randn(hidden_layer_1_neurons, output_dimension) / np.sqrt(hidden_layer_1_neurons)
    return {
        'w1': hidden_weights,
        'w2': output_weights,
        'b1': np.zeros((1, hidden_layer_1_neurons)),
        'b2': np.zeros((1, output_dimension)),
    }

def initialize_2hl(input_dimension, hidden_layer_1_neurons, hidden_layer_2_neurons, output_dimension):
    """Create the parameter dictionary for a network with two hidden layers.

    Seeds numpy's global generator with 0, then draws standard-normal weights
    scaled by ``1 / sqrt(fan_in)`` for each layer; biases start at zero.

    Returns:
        dict with keys ``'w1'``, ``'w2'``, ``'w3'``, ``'b1'``, ``'b2'``, ``'b3'``.
    """
    np.random.seed(0)
    # Draw order (w1, w2, w3) is kept so the seeded values stay the same.
    first_weights = np.random.randn(input_dimension, hidden_layer_1_neurons) / np.sqrt(input_dimension)
    second_weights = np.random.randn(hidden_layer_1_neurons, hidden_layer_2_neurons) / np.sqrt(hidden_layer_1_neurons)
    output_weights = np.random.randn(hidden_layer_2_neurons, output_dimension) / np.sqrt(hidden_layer_2_neurons)
    return {
        'w1': first_weights,
        'w2': second_weights,
        'w3': output_weights,
        'b1': np.zeros((1, hidden_layer_1_neurons)),
        'b2': np.zeros((1, hidden_layer_2_neurons)),
        'b3': np.zeros((1, output_dimension)),
    }
    
def forward_prop_1hl(x, neural_data):
    """Forward pass through the one-hidden-layer network.

    Parameters:
        x: input array whose last dimension matches the rows of
           ``neural_data['w1']`` (one row per sample).
        neural_data: dict holding ``'w1'``, ``'w2'``, ``'b1'``, ``'b2'``.
           It is updated in place with the intermediate values ``'x1'``,
           ``'x2'``, ``'y1'`` and the output ``'o'``.

    Returns:
        ``neural_data['o']``, the result of ``softmax`` applied to the
        last-layer output ``x2``.
    """
    w1 , w2, b1, b2 = neural_data['w1'], neural_data['w2'], neural_data['b1'], neural_data['b2']
    x1 = np.dot(x, w1) + b1 #Output of hidden layer
    # ReLU applied element-wise to the whole array; iterating over rows and
    # calling a scalar relu on each row raises and leaves y1 as a plain list.
    y1 = np.maximum(x1, 0) #Output of hidden layer with activation function
    x2 = np.dot(y1, w2) + b2 #Output of last layer
    neural_data['x1'] = x1
    neural_data['x2'] = x2
    neural_data['y1'] = y1
    # The original passed an undefined `y2`; the last-layer output is x2.
    neural_data['o'] = softmax(x2)
    return neural_data['o']

def forward_prop_2hl(x, neural_data):
    """Forward pass through the two-hidden-layer network.

    Parameters:
        x: input array whose last dimension matches the rows of
           ``neural_data['w1']`` (one row per sample).
        neural_data: dict holding ``'w1'``..``'w3'`` and ``'b1'``..``'b3'``.
           It is updated in place with the intermediate values ``'x1'``,
           ``'x2'``, ``'x3'``, ``'y1'``, ``'y2'`` and the output ``'o'``.

    Returns:
        ``neural_data['o']``, the result of ``softmax`` applied to the
        last-layer output ``x3``.
    """
    w1 , w2, w3, b1, b2, b3 = neural_data['w1'], neural_data['w2'], neural_data['w3'], neural_data['b1'], neural_data['b2'], neural_data['b3']

    # ReLU is applied element-wise to whole arrays; the original iterated
    # over rows with a scalar relu and then read undefined names a1/a2
    # instead of the activations y1/y2.
    x1 = np.dot(x, w1) + b1
    y1 = np.maximum(x1, 0)
    x2 = np.dot(y1, w2) + b2
    y2 = np.maximum(x2, 0)
    x3 = np.dot(y2, w3) + b3

    neural_data['x1'] = x1
    neural_data['x2'] = x2
    neural_data['x3'] = x3

    neural_data['y1'] = y1
    neural_data['y2'] = y2

    neural_data['o'] = softmax(x3)
    return neural_data['o']

## Prediction Functions
Helper functions that return predictions, given our model.

In [11]:
def predict_1hl(x, neural_data):
    """Return the argmax of the one-hidden-layer network's output for ``x``.

    ``np.argmax`` is called without an axis, so the index refers to the
    flattened output; it is a class index only when ``x`` is a single sample.
    """
    network_output = forward_prop_1hl(x, neural_data)
    return np.argmax(network_output)

def predict_2hl(x, neural_data):
    """Return the argmax of the two-hidden-layer network's output for ``x``.

    ``np.argmax`` is called without an axis, so the index refers to the
    flattened output; it is a class index only when ``x`` is a single sample.
    """
    network_output = forward_prop_2hl(x, neural_data)
    return np.argmax(network_output)

## Cost function

In [12]:
def gradient_cost(fashionTrainOutput, fashionTarget, fashionTargetMinusOne, testCasesAmount):
    """Average the per-sample cost term over the first ``testCasesAmount`` outputs.

    For every sample ``j`` the term is
    ``dot(fashionTarget, log10(out_j)) + dot(fashionTargetMinusOne, 1 - log10(out_j))``.
    The terms are summed over all samples and divided by ``testCasesAmount``.

    Parameters:
        fashionTrainOutput: indexable collection of per-sample output vectors.
        fashionTarget: weights applied to ``log10(out_j)``.
        fashionTargetMinusOne: weights applied to ``1 - log10(out_j)``.
        testCasesAmount: positive number of samples to average over.

    Returns:
        The averaged cost.

    Raises:
        ValueError: if ``testCasesAmount`` is not positive.
    """
    if testCasesAmount <= 0:
        raise ValueError("testCasesAmount must be a positive integer")
    final_cost = 0
    # Iterate over sample indices (an int is not iterable) and accumulate
    # every sample's term; previously only the last one reached the average.
    for j in range(testCasesAmount):
        log_output = np.log10(fashionTrainOutput[j])
        # Scalar 1 broadcasts to any number of classes (was np.ones(10)).
        # NOTE(review): the formula is kept as written. Cross-entropy would
        # normally use -[y*ln(o) + (1-y)*ln(1-o)] with a per-sample target
        # row; confirm the intended cost before relying on this value.
        cost = np.add(np.dot(fashionTarget, log_output), np.dot(fashionTargetMinusOne, 1 - log_output))
        final_cost += np.sum(cost)
    return final_cost / testCasesAmount

def total_error(fashionTrainOutput, fashionTarget):
    """Placeholder for the total-error metric; not implemented yet.

    The original body stopped at an incomplete ``for i in len()`` statement,
    a syntax error that prevented the whole cell from being defined. The
    intended metric cannot be determined from the notebook, so the function
    fails loudly instead of returning a made-up value.

    Raises:
        NotImplementedError: always.
    """
    # TODO: decide on the metric (e.g. misclassification count or summed
    # squared error) and implement it.
    raise NotImplementedError("total_error is not implemented yet")
    

## Training and backward propagation

Some notes:

### For one hidden layer:
- x1 = output of the hidden layer, before the activation function
- x2 = output of the last layer, before softmax
- y1 = output of the hidden layer, after the activation function
- o = final output, after softmax

### For TWO hidden layers:
- x1 = output of the first hidden layer, before the activation function
- x2 = output of the second hidden layer, before the activation function
- x3 = output of the last layer, before softmax
- y1 = output of the first hidden layer, after the activation function
- y2 = output of the second hidden layer, after the activation function
- o = final output, after softmax


# One hidden layer

Here, we present our code and results achieved by a learning algorithm that uses a neural network with only one hidden layer.

In [2]:
def train_neural_network_1hl(input_dimension, hidden_layer_1_neurons, output_dimension, input_data, epochs,
                             target_data=None, batch_size=None):
    """Train the one-hidden-layer network with mini-batch gradient descent.

    The forward pass is delegated to ``forward_prop_1hl``, which applies ReLU
    to the hidden layer, so the backward pass uses the ReLU derivative of the
    hidden pre-activation ``x1``. The step size and the L2 penalty are read
    from the notebook-level globals ``learning_rate`` and
    ``regularization_rate``.

    Parameters:
        input_dimension: number of input features.
        hidden_layer_1_neurons: number of hidden units.
        output_dimension: number of output classes.
        input_data: array-like of training samples, one row per sample.
        epochs: number of passes over ``input_data``.
        target_data: targets for ``input_data``; either a 1-D array of integer
            class labels (one-hot encoded here) or a matrix of one-hot rows.
        batch_size: samples per update; ``None`` uses the whole data set as a
            single batch.

    Returns:
        The ``neural_data`` dict with the trained weights and biases (plus
        the values cached by the last forward pass).

    Raises:
        ValueError: if ``target_data`` is missing, its length does not match
            ``input_data``, or ``batch_size`` is not positive.
    """
    if target_data is None:
        raise ValueError("target_data is required to train the network")

    features = np.asarray(input_data, dtype=float)
    targets = np.asarray(target_data)
    if targets.ndim == 1:
        # Integer labels -> one-hot rows, so that (probs - targets) is the
        # gradient of softmax + cross-entropy w.r.t. the last-layer output.
        targets = np.eye(output_dimension)[targets.astype(int)]
    sample_count = features.shape[0]
    if targets.shape[0] != sample_count:
        raise ValueError("input_data and target_data must have the same number of rows")
    if batch_size is None:
        batch_size = sample_count
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

#     Initializes weights and biases for our neural network
    neural_data = initialize_1hl(input_dimension, hidden_layer_1_neurons, output_dimension)

    for epoch in range(epochs):
        # Reshuffle every epoch so mini-batches differ between epochs.
        shuffled = np.random.permutation(sample_count)
        for start in range(0, sample_count, batch_size):
            batch_indices = shuffled[start:start + batch_size]
            mini_batch_data = features[batch_indices]
            miniBatchTarget = targets[batch_indices]

#             Performs Forward propagation
            probs = forward_prop_1hl(mini_batch_data, neural_data)

#             Performs Backward propagation
            delta3 = probs - miniBatchTarget
            # asarray: the cached activation may be a list of rows.
            hidden_output = np.asarray(neural_data['y1'])
            dW2 = hidden_output.T.dot(delta3)
            db2 = np.sum(delta3, axis=0, keepdims=True)
            # ReLU derivative on the pre-activation: 0 where x1 < 0, else 1.
            relu_gradient = np.where(np.asarray(neural_data['x1']) < 0, 0.0, 1.0)
            delta2 = delta3.dot(neural_data['w2'].T) * relu_gradient
            dW1 = np.dot(mini_batch_data.T, delta2)
            db1 = np.sum(delta2, axis=0, keepdims=True)

            # Performs regularization
            dW2 += regularization_rate * neural_data['w2']
            dW1 += regularization_rate * neural_data['w1']

            # Gradient descent parameter update
            neural_data['w1'] += -learning_rate * dW1
            neural_data['b1'] += -learning_rate * db1
            neural_data['w2'] += -learning_rate * dW2
            neural_data['b2'] += -learning_rate * db2

        print("Ended iteration", epoch)

    return neural_data

# Two hidden layers

Same as before, but for 2 hidden layers.

In [3]:
def train_neural_network_2hl(input_dimension, hidden_layer_1_neurons, hidden_layer_2_neurons, output_dimension, input_data, epochs,
                             target_data=None, batch_size=None):
    """Train the two-hidden-layer network with mini-batch gradient descent.

    The forward pass is delegated to ``forward_prop_2hl``, which applies ReLU
    to both hidden layers, so the backward pass uses the ReLU derivative of
    the pre-activations ``x2`` and ``x1``. The step size and the L2 penalty
    are read from the notebook-level globals ``learning_rate`` and
    ``regularization_rate``.

    Parameters:
        input_dimension: number of input features.
        hidden_layer_1_neurons: number of units in the first hidden layer.
        hidden_layer_2_neurons: number of units in the second hidden layer.
        output_dimension: number of output classes.
        input_data: array-like of training samples, one row per sample.
        epochs: number of passes over ``input_data``.
        target_data: targets for ``input_data``; either a 1-D array of integer
            class labels (one-hot encoded here) or a matrix of one-hot rows.
        batch_size: samples per update; ``None`` uses the whole data set as a
            single batch.

    Returns:
        The ``neural_data`` dict with the trained weights and biases (plus
        the values cached by the last forward pass).

    Raises:
        ValueError: if ``target_data`` is missing, its length does not match
            ``input_data``, or ``batch_size`` is not positive.
    """
    if target_data is None:
        raise ValueError("target_data is required to train the network")

    features = np.asarray(input_data, dtype=float)
    targets = np.asarray(target_data)
    if targets.ndim == 1:
        # Integer labels -> one-hot rows, so that (probs - targets) is the
        # gradient of softmax + cross-entropy w.r.t. the last-layer output.
        targets = np.eye(output_dimension)[targets.astype(int)]
    sample_count = features.shape[0]
    if targets.shape[0] != sample_count:
        raise ValueError("input_data and target_data must have the same number of rows")
    if batch_size is None:
        batch_size = sample_count
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

#     Initializes weights and biases for our neural network
    neural_data = initialize_2hl(input_dimension, hidden_layer_1_neurons, hidden_layer_2_neurons, output_dimension)

    for epoch in range(epochs):
        # Reshuffle every epoch so mini-batches differ between epochs.
        shuffled = np.random.permutation(sample_count)
        for start in range(0, sample_count, batch_size):
            batch_indices = shuffled[start:start + batch_size]
            mini_batch_data = features[batch_indices]
            miniBatchTarget = targets[batch_indices]

#             Performs Forward propagation
            probs = forward_prop_2hl(mini_batch_data, neural_data)

#             Performs Backward propagation
            # Output layer.
            delta4 = probs - miniBatchTarget
            # asarray: the cached activations may be lists of rows.
            second_hidden_output = np.asarray(neural_data['y2'])
            dW3 = second_hidden_output.T.dot(delta4)
            db3 = np.sum(delta4, axis=0, keepdims=True)

            # Second hidden layer: ReLU derivative is 0 where x2 < 0, else 1.
            relu_gradient_2 = np.where(np.asarray(neural_data['x2']) < 0, 0.0, 1.0)
            delta3 = delta4.dot(neural_data['w3'].T) * relu_gradient_2
            # w2 maps y1 -> x2, so its gradient uses y1 (not the input batch).
            first_hidden_output = np.asarray(neural_data['y1'])
            dW2 = first_hidden_output.T.dot(delta3)
            db2 = np.sum(delta3, axis=0, keepdims=True)

            # First hidden layer.
            relu_gradient_1 = np.where(np.asarray(neural_data['x1']) < 0, 0.0, 1.0)
            delta2 = delta3.dot(neural_data['w2'].T) * relu_gradient_1
            dW1 = np.dot(mini_batch_data.T, delta2)
            db1 = np.sum(delta2, axis=0, keepdims=True)

            # Performs regularization
            dW3 += regularization_rate * neural_data['w3']
            dW2 += regularization_rate * neural_data['w2']
            dW1 += regularization_rate * neural_data['w1']

            # Gradient descent parameter update
            neural_data['w1'] += -learning_rate * dW1
            neural_data['b1'] += -learning_rate * db1
            neural_data['w2'] += -learning_rate * dW2
            neural_data['b2'] += -learning_rate * db2
            neural_data['w3'] += -learning_rate * dW3
            neural_data['b3'] += -learning_rate * db3

        print("Ended iteration", epoch)

    return neural_data

# Testing the neural networks

Now, we'll test our neural networks under multiple circumstances on the validation set, so we can get the best possible models.

## Problem details
- Input dimension: 28x28 = 784 neurons
- Output dimension: 10 classes = 10 neurons

In [6]:
# Network and training configuration.
# NOTE(review): presumably selects the 1- vs 2-hidden-layer model; it is not
# read by any code shown in this part of the notebook.
hidden_layers = 1
# 28x28 input pixels.
input_neurons = 784
hidden_layer_1_neurons = 15
hidden_layer_2_neurons = 15
# Read as a global by the parameter-update steps of train_neural_network_*.
learning_rate = 1
# Read as a global by train_neural_network_*; 0 makes the regularization
# terms added to the weight gradients vanish.
regularization_rate = 0