In [61]:
import numpy as np

# define the sigmoid function 
# Backpropagation is actually a major motivating factor in the historical use of sigmoid activation functions due to its convenient derivative:
def sigmoid(x, derivative=False):

    if (derivative == True):
        return sigmoid(x,derivative=False) * (1 - sigmoid(x,derivative=False))
    else:
        return 1 / (1 + np.exp(-x))

# choose a random seed for reproducible results
#np.random.seed(1)

# learning rate
alpha = .1

# number of nodes in the hidden layer
num_hidden = 2

# inputs (grade individual assigment, grade of group assigment)
#Student 1 to student n (number of rows)
number_students = 15
X = np.random.randint(10, size=(number_students, 2))


# outputs
# x.T is the transpose of x, making this a column vector
y = np.random.randint(10, size=(number_students, 1)).T


# initialize weights randomly with mean 0 and range [-1, 1]
# the +1 in the 1st dimension of the weight matrices is for the bias weight
hidden_weights = 2*np.random.random((X.shape[1] + 1, num_hidden)) - 1
output_weights = 2*np.random.random((num_hidden + 1, y.shape[1])) - 1

# number of iterations of gradient descent
num_iterations = 10000

# for each iteration of gradient descent
for i in range(num_iterations):

    # forward phase
    # np.hstack((np.ones(...), X) adds a fixed input of 1 for the bias weight
    input_layer_outputs = np.hstack((np.ones((X.shape[0], 1)), X))
    hidden_layer_outputs = np.hstack((np.ones((X.shape[0], 1)), sigmoid(np.dot(input_layer_outputs, hidden_weights))))
    output_layer_outputs = np.dot(hidden_layer_outputs, output_weights)

    # backward phase
    # output layer error term
    output_error = output_layer_outputs - y
    # hidden layer error term
    # [:, 1:] removes the bias term from the backpropagation
    hidden_error = hidden_layer_outputs[:, 1:] * (1 - hidden_layer_outputs[:, 1:]) * np.dot(output_error, output_weights.T[:, 1:])

    # partial derivatives
    hidden_pd = input_layer_outputs[:, :, np.newaxis] * hidden_error[: , np.newaxis, :]
    output_pd = hidden_layer_outputs[:, :, np.newaxis] * output_error[:, np.newaxis, :]

    # average for total gradients
    total_hidden_gradient = np.average(hidden_pd, axis=0)
    total_output_gradient = np.average(output_pd, axis=0)

    # update weights
    hidden_weights += - alpha * total_hidden_gradient
    output_weights += - alpha * total_output_gradient

# print the final outputs of the neural network on the inputs X

#transform output to predict values
y_hat = np.mean(output_layer_outputs, axis=0)
y_hat = y_hat.reshape(1,number_students)

#print("Output After Training: \n{}".format(output_layer_outputs))
print("Matrix of grades-assigments:X1,X2 \n")
print("Matrix of grades-EXAMs: Exam or Y \n")
print("Number of students:", number_students)
print("\n X1 , X2, EXAM, Y_HAT")
print(np.hstack((X, y.T, y_hat.T)))
#Total error after 1000 iterations
print('Total error',np.sum(output_error))

Matrix of grades-assigments:X1,X2 

Matrix of grades-EXAMs: Exam or Y 

Number of students: 15

 X1 , X2, EXAM, Y_HAT
[[1.         6.         6.         5.9999944 ]
 [6.         5.         3.         2.99999778]
 [0.         3.         8.         7.99999248]
 [5.         7.         3.         2.99999802]
 [2.         8.         3.         2.99999722]
 [0.         1.         3.         2.99999769]
 [0.         7.         1.         0.99999816]
 [9.         6.         8.         7.9999926 ]
 [5.         2.         5.         4.99999593]
 [8.         6.         4.         3.99999516]
 [3.         3.         7.         6.99999366]
 [5.         9.         8.         7.99999265]
 [5.         6.         9.         8.99999177]
 [7.         2.         8.         7.99999364]
 [3.         1.         8.         7.99999433]]
Total error -0.0011174157124990147
