In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
d = pd.read_csv("/kaggle/input/iris/Iris.csv")
d.head()
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv
/kaggle/input/iris/Iris.csv
/kaggle/input/iris/database.sqlite


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


This is a simple 2 layer MLP of the MNIST dataset classification, where sigmoid are used as activation functions of all layers.

All 42000 images belong to 10 classes.

# 1. FeedForward Propagation

Output of the neuron follows :

$\sum = wx + b$

$o = ReLu(\sum)$

Suppose $w_1,b_1$ be the weights and biases of the 1st layer, and $w_2, b_2$ be the weights and biases of the 2nd layer, thus, the output of each neuron in the output layer would be :

$o_2 = softmax(w_2\sum + b_2)$

# 2. BackwardPropagation

This algorithm uses gradient descent method as optimization method.

Suppose $\delta_1$ and $\delta_2$ be the **error** of 1st and 2nd layer respectively, $\vec t_m = (t_1,t_2,....,t_m)$ and $\vec y_m = (y_1, y_2,....,y_m)$ be the target and output vector respectively, we have in the second layer :

$\delta_2 = \frac{\partial E}{\partial w_ij} = (y_i - t_i)$

Hence the correction of weight in each iteration should be :

$\vardelta_ij = 

# 3. Activation Functions

1. ReLu
- Relu is the most popular activation function in machine learning, it is defined as :

$ReLu(x) = x$ for x>0

$ReLu(x) = 0$ for x<0

Relu is famous of it's simplifcity of it's derivative function, the derivative of ReLu is :

$ReLU'(x) = 0$ for x<0

$ReLU'(x) = 1$ for x>0

2. Softmax
- Softmax is another popular activation function, it can helps transform the output into probability, the softmax function is defined as :

$softmax(z)_i = \frac{e^{z_i}}{\sum_{j} e^{z_j}}$


In [2]:
import numpy as np
import pandas as pd
import scipy as sci

path = "/kaggle/input/digit-recognizer/train.csv"
data = pd.read_csv(path, engine='c')
label_1 = data['label']
label_1 = label_1.T
data = np.array(data)
m,n = data.shape
train_data = data[0:m].T
X_train = train_data[1:n]
X_train = X_train/255.0


class Model(object):

    def __init__(self, input_dim=784, output_dim=10):
        self.num_neurons = 512

        self.w_1 = np.random.randn(self.num_neurons,784) * np.sqrt(2./input_dim)
        self.w_2 = np.random.randn(10,self.num_neurons) * np.sqrt(2./10)
        self.b_1 = np.zeros((self.num_neurons, 1))
        self.b_2 = np.zeros((output_dim, 1))

        self.learning_rate = 0.000156
        self.alpha = 0.1
        self.epochs = 600
        
#Kaiming initialization
    def sigmoid(self,x):
        return sci.special.expit(x)
    
    def sigmoid_derivative(self,x):
        return self.sigmoid(x)*(1-self.sigmoid(x))
    
    def ReLU(self, x):
        return np.maximum(0, x)

    def ReLU_derivative(self, x):
        return np.where(x > 0, 1.0, 0.0)

    def ELU(self, x):
        return np.where(x >= 0.0, x, self.alpha * (np.exp(x) - 1))

    def ELU_deriv(self,x):
        return np.where(x >= 0, 1, self.alpha * np.exp(x))

    def softmax(self, z):
        e_z = np.exp(z-np.max(z))
        return sci.special.softmax(e_z)
    
    def softmax_backward(self,z):
        do_dz = self.softmax(1-self.softmax)
        
    
#Defining a set of activation functions for the convenience of changing act. functions below


    def Forward(self, X_train):
        self.sum_1 = np.dot(self.w_1, X_train) + self.b_1
        self.output_1 = self.sigmoid(self.sum_1)
        #First layer, using ReLU as activation

        self.sum_2 = np.dot(self.w_2, self.output_1) + self.b_2
        self.output_2 = self.sigmoid(self.sum_2)
        #Second Layer, using softmax as activation

        self.predictions = np.argmax(self.output_2, axis=0)
        return self.predictions
        #Backward Propagation
        
    def Backward(self, label_1, X_train):
        
        one_hot_labels = np.eye(10)[label_1].T
        self.error = self.output_2 - one_hot_labels
        self.delta_2 = self.error*self.sigmoid_derivative(self.output_2)
        self.d_w_2 = np.dot(self.delta_2, self.output_1.T)
        self.d_b_2 = np.sum(self.delta_2, axis=1, keepdims=True)
        
        self.delta_1 = np.dot(self.w_2.T, self.delta_2) * self.sigmoid_derivative(self.output_1)
        self.d_w_1 = np.dot(self.delta_1, X_train.T)
        self.d_b_1 = np.sum(self.delta_1, axis=1, keepdims=True)

#First Layer

    def update_params(self):
        self.w_1 -= self.learning_rate * self.d_w_1
        self.w_2 -= self.learning_rate * self.d_w_2
        self.b_1 -= self.learning_rate * self.d_b_1
        self.b_2 -= self.learning_rate * self.d_b_2
#Updating parameters

        return self.w_1,self.w_2,self.b_1,self.b_2

    def compute_accuracy(self, label_1):
        correct_predictions = np.sum(self.predictions == label_1)
        total_predictions = self.predictions.shape[0]
        self.accuracy = correct_predictions / total_predictions

    def fit(self, X_train, label_1):

        for epoch in range(self.epochs):
            self.Forward(X_train)
            self.Backward(label_1, X_train)
            self.update_params()
            self.compute_accuracy(label_1)
            print(f"Epoch {epoch + 1}/{self.epochs} Accuracy: {self.accuracy * 100}%")

# Create an instance of the Model class
model = Model()

# Train the model
model.fit(X_train, label_1)


Epoch 1/600 Accuracy: 10.47857142857143%
Epoch 2/600 Accuracy: 33.87619047619047%
Epoch 3/600 Accuracy: 10.359523809523811%
Epoch 4/600 Accuracy: 10.359523809523811%
Epoch 5/600 Accuracy: 14.892857142857144%
Epoch 6/600 Accuracy: 21.419047619047618%
Epoch 7/600 Accuracy: 24.68095238095238%
Epoch 8/600 Accuracy: 38.93809523809524%
Epoch 9/600 Accuracy: 18.714285714285715%
Epoch 10/600 Accuracy: 22.03333333333333%
Epoch 11/600 Accuracy: 33.78809523809524%
Epoch 12/600 Accuracy: 13.347619047619046%
Epoch 13/600 Accuracy: 20.202380952380953%
Epoch 14/600 Accuracy: 18.96904761904762%
Epoch 15/600 Accuracy: 46.635714285714286%
Epoch 16/600 Accuracy: 67.58809523809524%
Epoch 17/600 Accuracy: 68.96904761904761%
Epoch 18/600 Accuracy: 69.24761904761905%
Epoch 19/600 Accuracy: 75.4095238095238%
Epoch 20/600 Accuracy: 76.44999999999999%
Epoch 21/600 Accuracy: 79.69047619047619%
Epoch 22/600 Accuracy: 78.99047619047619%
Epoch 23/600 Accuracy: 81.79761904761905%
Epoch 24/600 Accuracy: 81.4214285714