In [1]:
import numpy as np
import pandas

In [2]:
# read the patient records from the CSV file (path is relative to the
# current working directory) into a DataFrame, then display the table
data = pandas.read_csv('patientData.csv')
data

Unnamed: 0,Age,Weight,Height,Smoking,AtRisk
0,89,115,5.8,0,1
1,23,255,6.0,1,1
2,18,185,5.7,0,0
3,24,175,6.0,0,0
4,95,95,5.5,0,1
5,35,255,6.2,0,0
6,43,196,5.8,1,1
7,26,105,5.4,0,0


In [3]:
# split the table by position: every column except the last one is an
# input feature, and the last column holds the known class label
feature_frame, label_series = data.iloc[:, :-1], data.iloc[:, -1]

# pull out the underlying numpy arrays that the rest of the notebook uses
x_n = feature_frame.values
y_n = label_series.values
x_n

array([[ 89. , 115. ,   5.8,   0. ],
       [ 23. , 255. ,   6. ,   1. ],
       [ 18. , 185. ,   5.7,   0. ],
       [ 24. , 175. ,   6. ,   0. ],
       [ 95. ,  95. ,   5.5,   0. ],
       [ 35. , 255. ,   6.2,   0. ],
       [ 43. , 196. ,   5.8,   1. ],
       [ 26. , 105. ,   5.4,   0. ]])

In [4]:
# normalize each column in the x-data to [0, 1]
def normalize_min_max_data(x_data):
    """Rescale every column of ``x_data`` linearly onto the range [0, 1].

    Each column is shifted by its minimum and divided by its range
    (max - min), so the smallest entry of a column becomes 0 and the
    largest becomes 1.

    Parameters
    ----------
    x_data : array-like
        Numeric data with one row per data-point and one column per
        feature. The input is not modified.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``x_data``. A column whose
        values are all equal has zero range and is returned as all
        zeros (rather than the NaNs a 0/0 division would give).
    """
    x_data = np.asarray(x_data, dtype=float)
    column_min = x_data.min(axis=0)

    # np.ptp (the function) is used instead of the ndarray.ptp method,
    # because the method form was removed in NumPy 2.0
    column_range = np.ptp(x_data, axis=0)

    # a constant column has range 0; dividing by 1 instead leaves its
    # (already zero) shifted values at 0 and avoids producing NaN
    safe_range = np.where(column_range == 0, 1.0, column_range)

    return (x_data - column_min) / safe_range

In [5]:
# overwrite the raw feature matrix with its min-max rescaled version, so
# that later cells which read x_n see the rescaled values; then display it
x_n = normalize_min_max_data(x_n)
x_n

array([[0.92207792, 0.125     , 0.5       , 0.        ],
       [0.06493506, 1.        , 0.75      , 1.        ],
       [0.        , 0.5625    , 0.375     , 0.        ],
       [0.07792208, 0.5       , 0.75      , 0.        ],
       [1.        , 0.        , 0.125     , 0.        ],
       [0.22077922, 1.        , 1.        , 0.        ],
       [0.32467532, 0.63125   , 0.5       , 1.        ],
       [0.1038961 , 0.0625    , 0.        , 0.        ]])

In [12]:
# hyper-parameters for gradient descent: the step size and the number of
# training iterations (both are meant to be played with: try making
# learning_rate 100 instead and see what happens; what about
# increasing/decreasing epochs?)
learning_rate = 0.1
epochs = 10_000

# starting point for training: one random weight per input feature plus
# a random bias weight; the fixed seed makes the starting values the
# same every time this cell is run
features = x_n.shape[1]
np.random.seed(13)
input_weights = np.random.rand(features)
bias_weight = np.random.rand()



0.9726011139048933


In [7]:
# some simple helper functions for the various components of
# the network, and to allow us to do gradient descent
def linear_sum(x, w, b):
    """Return the weighted sum of the inputs ``x`` plus the bias ``b``.

    ``x`` is combined with the weights ``w`` through ``np.dot`` and the
    bias is then added to the result.
    """
    weighted_inputs = np.dot(x, w)
    return weighted_inputs + b

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) of ``x``.

    Works element-wise on numpy arrays as well as on single numbers;
    the result always lies between 0 and 1.
    """
    # For strongly negative x, exp(-x) overflows to +inf. The final
    # value 1 / (1 + inf) == 0.0 is still the correct limit, so only the
    # spurious overflow RuntimeWarning is suppressed here; the returned
    # values are unchanged.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    """Return the slope of the logistic function at ``x``.

    Computed as s * (1 - s), where s is the value of ``sigmoid(x)``.
    """
    s = sigmoid(x)
    return s * (1 - s)

In [8]:
# before any training, compute the output of the network and put it
# next to the known y-values: we want our outputs to be as close to the
# y-values as possible (initially, we are probably far from close to
# the negative-class (0) outputs, given random positive weights)

# NOTE: column_stack only lays the two vectors out side by side, one
# row per data-point, for readability; nothing fancy mathematically
initial_linear = linear_sum(x_n, input_weights, bias_weight)
initial_output = sigmoid(initial_linear)
np.column_stack((initial_output, y_n))

array([[0.89393067, 1.        ],
       [0.94503891, 1.        ],
       [0.80460408, 0.        ],
       [0.85448016, 0.        ],
       [0.86451859, 1.        ],
       [0.90079787, 0.        ],
       [0.94007908, 1.        ],
       [0.74426353, 0.        ]])

In [9]:
# here we train the network with gradient descent; the math that
# explains why this code works is given in the accompanying PDF showing
# the derivation
for epoch in range(epochs):
    # forward pass: the linear sums and then the logistic outputs, each
    # a vector with one entry per data-point
    linear = linear_sum(x_n, input_weights, bias_weight)
    output = sigmoid(linear)

    # per-point errors, and the overall MSE (the quantity to minimize)
    basic_error = y_n - output
    mse = np.mean(np.square(basic_error))

    # report progress on the very first epoch and on every 1000th one
    report_now = (epoch == 0) or ((epoch + 1) % 1000 == 0)
    if report_now:
        print("Epoch {0:5d}: MSE = {1:.6f}".format(epoch, mse))

    # backward pass: the logistic's derivative at each data-point,
    # multiplied by that point's error, gives the per-point delta
    prediction_derivative = sigmoid_derivative(linear)
    output_delta = basic_error * prediction_derivative

    # non-bias weights: each delta is weighted by the corresponding
    # feature-value and the contributions are accumulated over all
    # data-points; the transpose (.T) with np.dot expresses that sum
    # without writing an explicit loop over the data
    weight_updates = np.dot(x_n.T, output_delta)
    input_weights += learning_rate * weight_updates

    # bias weight: its "input" is always assumed == 1, so the update is
    # just the plain sum of the deltas (no dot-product needed)
    bias_weight += learning_rate * np.sum(output_delta)

Epoch     0: MSE = 0.347388
Epoch   999: MSE = 0.010076
Epoch  1999: MSE = 0.004786
Epoch  2999: MSE = 0.003102
Epoch  3999: MSE = 0.002284
Epoch  4999: MSE = 0.001804
Epoch  5999: MSE = 0.001489
Epoch  6999: MSE = 0.001266
Epoch  7999: MSE = 0.001101
Epoch  8999: MSE = 0.000974
Epoch  9999: MSE = 0.000872


In [10]:
# after training, the network output should be much closer to the known
# y-values than when we started, with values *close* to 0/1 as appropriate
final_linear = linear_sum(x_n, input_weights, bias_weight)
final_output = sigmoid(final_linear)
np.column_stack((final_output, y_n))

array([[0.96163494, 1.        ],
       [0.96417037, 1.        ],
       [0.01086803, 0.        ],
       [0.02304013, 0.        ],
       [0.98062401, 1.        ],
       [0.0348307 , 0.        ],
       [0.9968816 , 1.        ],
       [0.04444612, 0.        ]])