# Simple 2-layer Neural Network with Softmax Function
+ Simple 2-layer neural network with softmax function for multi-class classification.
+ Classifies a given 2D input point as a red ball, a green ball or a blue ball.

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
eta = 0.7  # learning rate: scale factor applied to every weight and bias gradient step
epoch = 5000  # number of training iterations (each one is a full forward + backward pass)

### Activation Function
+ For the softmax function, x - max(x) is used to avoid overflow of the exponential function.

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise.

    Works on scalars and NumPy arrays alike; the result has the same shape
    as ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1.0 / denominator

def sigmoid_deriv(x):
    """Return the sigmoid derivative expressed through the sigmoid's output.

    NOTE: ``x`` is the *activation* s = sigmoid(z), not the pre-activation z;
    the derivative with respect to z is then s * (1 - s).
    """
    complement = 1.0 - x
    return x * complement

def softmax(x):
    """Return the column-wise softmax of ``x``.

    Every column of ``x`` is treated as one sample's vector of scores and is
    normalised independently so that it sums to 1. A 1-D input is treated as
    a single sample.

    The maximum is subtracted per column (not globally): exp() then never
    sees a positive argument, so it cannot overflow, and every column keeps
    at least one term equal to exp(0) == 1, so its sum cannot underflow to
    zero and produce 0/0 = NaN when columns differ greatly in magnitude.
    """
    shifted = x - np.max(x, axis=0, keepdims=True)
    e = np.exp(shifted)
    # keepdims keeps the per-column sums broadcastable against ``e``.
    return e / np.sum(e, axis=0, keepdims=True)

### 2-layer Neural Network Model with Softmax Function
+ Softmax function is used at outputs of layer 2.
+ The total error is obtained by summing the cross-entropy errors of the individual samples.
+ For more details on the definition of delta function, refer to the class note.

In [None]:
class neuralnetwork:
    """Two-layer network: sigmoid hidden layer followed by a softmax output layer.

    Data is stored column-wise internally: every column of ``inputs``,
    ``target`` and ``output`` belongs to one sample.

    The methods rely on the module-level helpers ``sigmoid``,
    ``sigmoid_deriv`` and ``softmax`` and on the module-level learning
    rate ``eta``.
    """

    def __init__(self, x, w1, w2, y):
        """Store the training set and the initial parameters.

        Args:
            x: training inputs, one sample per row (transposed on storage).
            w1: hidden-layer weight matrix, shape (hidden, features).
            w2: output-layer weight matrix, shape (classes, hidden).
            y: one-hot targets, one sample per row (transposed on storage).

        The weight arrays are kept by reference, so the in-place updates
        done by ``backprop`` also modify the caller's ``w1`` and ``w2``.
        """
        self.inputs   = x.T
        self.weights1 = w1
        self.weights2 = w2
        # Bias sizes follow the weight matrices instead of being hard-coded,
        # so any hidden-layer width / number of classes works.
        self.b1 = np.zeros((np.shape(w1)[0], 1))  # bias at hidden layer
        self.b2 = np.zeros((np.shape(w2)[0], 1))  # bias at output layer
        self.target   = y.T                       # target
        self.output   = np.zeros(self.target.shape)

    def forwardprop(self):
        """Run the whole training set through the network.

        Caches the hidden activations in ``self.hiddenout`` and the class
        probabilities in ``self.output`` for use by ``backprop`` and
        ``calculate_error``.
        """
        self.hiddenout = sigmoid(np.dot(self.weights1, self.inputs) + self.b1)
        self.output = softmax(np.dot(self.weights2, self.hiddenout) + self.b2)

    def backprop(self):
        """Apply one batch gradient-descent step to all weights and biases.

        Uses the activations cached by the most recent ``forwardprop`` call,
        so ``forwardprop`` must be called first.
        """
        # For softmax combined with cross-entropy, the error at the output
        # pre-activation reduces to (prediction - target).
        delta2 = (self.output - self.target)
        # Propagate the error back through weights2 and the sigmoid;
        # sigmoid_deriv takes the activation itself, not the pre-activation.
        delta1 = np.dot(self.weights2.T, delta2) * sigmoid_deriv(self.hiddenout)
        dw2 = np.dot(delta2, self.hiddenout.T)
        dw1 = np.dot(delta1, self.inputs.T)

        # update the weights with the derivative of the loss function
        self.weights1 -= eta * dw1
        self.weights2 -= eta * dw2

        # update biases: sum the per-sample deltas over the sample axis,
        # keeping the column-vector shape of the bias
        self.b2 -= eta * np.sum(delta2, axis=1, keepdims=True)
        self.b1 -= eta * np.sum(delta1, axis=1, keepdims=True)

    def predict(self, x):
        """Return the class probabilities for new input ``x``.

        Args:
            x: input laid out column-wise, shape (features, n_samples).

        Returns:
            Array of shape (classes, n_samples) with softmax probabilities.

        This evaluates exactly the model that ``forwardprop`` trains: both
        biases are included and the output layer uses softmax. The result
        is computed in local variables so the activations cached for
        training (``self.hiddenout`` / ``self.output``) are not overwritten.
        """
        hidden = sigmoid(np.dot(self.weights1, x) + self.b1)
        return softmax(np.dot(self.weights2, hidden) + self.b2)

    def calculate_error(self):
        """Return the cross-entropy loss summed over all samples and classes.

        Uses ``self.output`` from the most recent ``forwardprop`` call.
        """
        # Clip away exact zeros: a saturated softmax can output 0.0, and
        # 0 * log(0) would otherwise turn the whole sum into NaN.
        safe_output = np.clip(self.output, np.finfo(float).tiny, None)
        error = np.sum(-self.target * np.log(safe_output))
        return error

### Data Set
+ For simplicity, it is assumed that the color of ball can be identified with two features, i.e., x_axis and y_axis values.
+ (x, y) values of red, green and blue balls are centered around (0, -3), (3, 3) and (-3, 3), respectively.
+ Target values for red, green and blue balls are [1, 0, 0], [0, 1, 0] and [0, 0, 1], respectively.
+ The formats of the input data and the target values are shown below; both are lists of row vectors.
+ In program, these data are converted to column vectors.
+
+  input data = 
+ [[0.1,0.3],  # input data item1
+ [0.5,0.8],  # input data item2
+ [0.7,0.6],  # input data item3
+ [0.9,0.2]] # input data item4
+   
+  target data = 
+ [[0,1,0],  # target1, one-hot vector representation
+  [1,0,0],  # target2, one-hot vector representation
+  [0,0,1],  # target3, one-hot vector representation
+  [1,0,0]]  # target4, one-hot vector representation

In [None]:
if __name__ == "__main__":

    # Size of the synthetic problem: points per class, features per point,
    # and number of classes.
    samples = 10
    attributes = 2
    classes = 3

    # Fixed seed so the generated clusters are the same on every run.
    np.random.seed(45)

    # Each class is standard-normal noise shifted to its own centre.
    red_centre = np.array([0, -3])
    green_centre = np.array([3, 3])
    blue_centre = np.array([-3, 3])
    data1_set = np.random.randn(samples, attributes) + red_centre    # red ball
    data2_set = np.random.randn(samples, attributes) + green_centre  # green ball
    data3_set = np.random.randn(samples, attributes) + blue_centre   # blue ball

    # Stack the three clusters row-wise; labels follow the same order
    # (0 = red, 1 = green, 2 = blue).
    feature_set = np.vstack((data1_set, data2_set, data3_set))
    labels = np.array([cls for cls in range(classes) for _ in range(samples)])

    # Colour map whose entry order matches the integer class labels.
    cmap = mpl.colors.ListedColormap(['red', 'green', 'blue'])

    # Show the training set (uncomment the next line for a larger figure).
    # plt.figure(figsize=(10, 7))
    plt.scatter(feature_set[:, 0], feature_set[:, 1],
                c=labels, cmap=cmap, s=200, alpha=0.5)
    plt.show()

    # One-hot encode the labels: row r gets a 1 in the column of its class.
    one_hot_labels = np.zeros((samples * classes, classes))
    for row, cls in enumerate(labels):
        one_hot_labels[row, cls] = 1

    inputdata = feature_set
    targetvalue = one_hot_labels

### Batch Gradient Descent Optimization
+ All the input data are processed as one batch in both forward and backward propagation.
+ In comparison with SGD, batch GD converges more smoothly since each weight update uses the whole batch.

In [None]:
    # Random initial weights drawn uniformly from [0, 1).
    # w2 is drawn before w1 so the random stream is consumed in a fixed order.
    w2 = np.random.rand(3, 4)                   # 3 output labels, 4 hidden nodes
    w1 = np.random.rand(4, inputdata.shape[1])  # 4 hidden nodes, one column per feature

    nn = neuralnetwork(inputdata, w1, w2, targetvalue)

    # Loss values sampled during training, used for the loss plot.
    training_loss = []

    # Batch training: every iteration runs the full data set forward and backward.
    for step in range(epoch):
        nn.forwardprop()
        nn.backprop()
        if step % 500 != 0:
            continue
        # Report and record the loss once every 500 iterations.
        loss = nn.calculate_error()
        print("Error: ", loss)
        training_loss.append(loss)

    print("target")
    print(targetvalue)
    print("output after training")
    print(nn.output.T)

    # Predict the class of an unseen point. [1, -1] lies closest to the red
    # cluster centred at (0, -3), so the first (red) component of the output
    # is expected to be the largest, i.e. close to the target [1, 0, 0].
    x_prediction = np.array([[1.0, -1.0]])
    predicted_output = nn.predict(x_prediction.T)  # predict expects column vectors
    print("Predicted data based on trained weights: ")
    print("Input: ", x_prediction)
    print("Output: ", predicted_output.T)

### Evaluation: Error display

In [None]:
#    print(training_loss)
    # x-axis positions 1..N, one per recorded loss value
    # (a value is recorded every 500 training iterations).
    epoch_count = range(1, 1 + len(training_loss))
    # plt.figure(figsize=(15, 5))  # uncomment for a wider figure
    plt.plot(epoch_count, training_loss, 'b-')
    plt.legend(['Training Loss'])
    plt.xlabel('Epoch(500)')
    plt.ylabel('Loss')
    plt.show()