In [87]:
import numpy

# scipy.special for the sigmoid function expit()
import scipy.special

# library for plotting arrays
import matplotlib.pyplot
# ensure the plots are inside the notebook, not an external window
%matplotlib inline


# Neural Network Class Definition

In [88]:
# neural network class definition
class nerualNetwork:
    """Feed-forward network with one hidden layer and sigmoid activations.

    Attributes:
        inodes, hnodes, onodes: number of nodes in the input, hidden and
            output layers.
        wih: weight matrix of shape (hnodes, inodes), input -> hidden.
        who: weight matrix of shape (onodes, hnodes), hidden -> output.
        lr: learning rate applied to every weight update.
        activation_function: callable applied element-wise to layer inputs
            (the logistic sigmoid, ``scipy.special.expit``).
    """

    def __init__(self, inputnodes, hiddennodes, outputnodes, learningrate):
        """Store the layer sizes and learning rate and draw the initial weights.

        Args:
            inputnodes: number of input nodes.
            hiddennodes: number of hidden nodes.
            outputnodes: number of output nodes.
            learningrate: step size used by ``train``.
        """
        self.inodes, self.hnodes, self.onodes = inputnodes, hiddennodes, outputnodes

        # Row j of each matrix holds the weights of the links arriving at
        # node j of the next layer.  Weights are sampled from a normal
        # distribution centred on 0.0 whose standard deviation is
        # hnodes ** -0.5 for wih and onodes ** -0.5 for who (the original
        # note cites "p133" for this choice).  A simpler alternative is
        # uniform weights in [-0.5, 0.5) via numpy.random.rand(...) - 0.5.
        wih_shape = (self.hnodes, self.inodes)
        who_shape = (self.onodes, self.hnodes)
        self.wih = numpy.random.normal(0.0, self.hnodes ** -0.5, wih_shape)
        self.who = numpy.random.normal(0.0, self.onodes ** -0.5, who_shape)

        # learning rate
        self.lr = learningrate

        # activation function is the sigmoid function
        self.activation_function = scipy.special.expit

    def _forward(self, inputs):
        """Propagate a column of inputs through both layers.

        Args:
            inputs: 2-D array with one input value per row.

        Returns:
            Tuple ``(hidden_outputs, final_outputs)`` holding the activations
            of the hidden layer and of the output layer.
        """
        hidden_outputs = self.activation_function(numpy.dot(self.wih, inputs))
        final_outputs = self.activation_function(numpy.dot(self.who, hidden_outputs))
        return hidden_outputs, final_outputs

    # train the neural network
    def train(self, inputs_list, targets_list):
        """Run one forward pass and update both weight matrices in place.

        Args:
            inputs_list: sequence of input values.
            targets_list: sequence of desired output values.
        """
        # ndmin=2 promotes a flat list to a 1 x N row; .T turns that row into
        # the N x 1 column that the matrix products below expect.
        inputs = numpy.array(inputs_list, ndmin=2).T
        targets = numpy.array(targets_list, ndmin=2).T

        hidden_outputs, final_outputs = self._forward(inputs)

        # error is the (target - actual)
        output_errors = targets - final_outputs
        # Hidden-layer error: output errors split by the hidden->output
        # weights and recombined at the hidden nodes.  Computed before `who`
        # is modified so it uses the weights that produced the outputs.
        hidden_errors = numpy.dot(self.who.T, output_errors)

        # error scaled by the sigmoid slope, out * (1 - out), for each layer
        output_delta = output_errors * final_outputs * (1.0 - final_outputs)
        hidden_delta = hidden_errors * hidden_outputs * (1.0 - hidden_outputs)

        # update hidden->output weights, then input->hidden weights
        self.who += self.lr * numpy.dot(output_delta, hidden_outputs.T)
        self.wih += self.lr * numpy.dot(hidden_delta, inputs.T)

    # query the neural network
    def query(self, inputs_list):
        """Return the output-layer activations for one input record.

        Args:
            inputs_list: sequence of input values.

        Returns:
            2-D array with one row per output node.
        """
        # same list -> column conversion as in train()
        inputs = numpy.array(inputs_list, ndmin=2).T
        _, final_outputs = self._forward(inputs)
        return final_outputs

In [89]:
# Look at training data: read the CSV into a list with one string per line
# (the header row is the first element).
# The `with` block closes the file even if reading raises an exception.
with open("EM_SAVE_TRAIN_DT11.csv", 'r') as f:
    # readlines() holds the whole file in memory; for large files it is
    # better to iterate over the file object one line at a time.
    training_data_list = f.readlines()



In [90]:
# Read header row: print the column names found on the first line of the file.
# Only the first record is needed, so it is indexed directly instead of
# looping over every line; the guard avoids an IndexError on an empty file.
if training_data_list:
    all_values = training_data_list[0].split(',')
    print(all_values)
        


['target_date', 'household_number', 'day_00_', 'day_01_', 'day_02_', 'day_03_', 'day_07_', 'first28_ViewTV', '_dataobs_', '_WARN_\n']


In [106]:
# number of input, hidden, output nodes
input_nodes = 5
hidden_nodes = 40
output_nodes = 3

# learning rate
learning_rate = 0.3

# create instance of neural network
n = nerualNetwork(input_nodes, hidden_nodes, output_nodes, learning_rate)

# load the training data csv file into a list of lines
# (`with` guarantees the file is closed even if reading fails)
with open("EM_SAVE_TRAIN_DT11.csv", 'r') as f:
    training_data_list = f.readlines()  # for large files better to read a line at a time

# Class index of each viewing label; it is also the position of the label's
# node in the target vector.
label_index = {"No_Data": 0, "No_View": 1, "View_TV": 2}
# The same labels rescaled to 0.01, 0.505 and 1.0 for use as input values.
# Built once here because it does not change from record to record.
dic = {label: index / 2.0 * 0.99 + 0.01 for label, index in label_index.items()}

# train the neural network
# columns: target_date, household_number, day_00_, day_01_, day_02_, day_03_,
#          day_07_, first28_ViewTV, _dataobs_, _WARN_
# first record is the header, so start from the second line
for record in training_data_list[1:]:
    # tolerate blank lines (e.g. a trailing newline at the end of the file)
    if not record.strip():
        continue

    # split the record by the "," commas
    all_values = record.split(',')

    # inputs: the four day_01_..day_07_ labels rescaled through `dic`
    # (values that are not a known label are passed through unchanged),
    # followed by first28_ViewTV rescaled from 0..28 to 0.01..1.0
    scaled_inputs = [dic.get(value, value) for value in all_values[3:7]]
    scaled_inputs.append(float(all_values[7]) / 28.0 * 0.99 + 0.01)
    # numpy.asfarray was removed in NumPy 2.0; this is the equivalent call
    inputs = numpy.asarray(scaled_inputs, dtype=float)

    # create the target output values (all 0.01, except the desired label which is 0.99)
    targets = numpy.zeros(output_nodes) + 0.01
    # all_values[2] (day_00_) is the target label for this record; look its
    # class index up directly instead of recovering it from the scaled float
    targets[label_index[all_values[2]]] = 0.99

    # NOTE: the original author observed that "No_View" is only about 5% of
    # the data and experimented with training on those records repeatedly.
    n.train(inputs, targets)


In [107]:
# test the neural network

# scorecard for how well the network performs, initially empty.
# One row is appended per test record:
#   [1 if correct else 0, output 0, output 1, output 2,
#    winning output / sum of outputs, correct_label, label]
scorecard = []

# load the test data csv file into a list of lines
# (`with` guarantees the file is closed even if reading fails)
with open("EM_SAVE_TEST.csv", 'r') as test_data_file:
    test_data_list = test_data_file.readlines()

# Class index of each viewing label (also the index of its output node).
label_index = {"No_Data": 0, "No_View": 1, "View_TV": 2}
# The same labels rescaled to 0.01, 0.505 and 1.0 for use as input values.
# Built once here because it does not change from record to record.
dic = {label: index / 2.0 * 0.99 + 0.01 for label, index in label_index.items()}

# Read and format the test file
# columns: target_date, household_number, day_00_, day_01_, day_02_, day_03_,
#          day_07_, first28_ViewTV, _dataobs_, _WARN_
# first record is the header, so start from the second line
for record in test_data_list[1:]:
    # tolerate blank lines (e.g. a trailing newline at the end of the file)
    if not record.strip():
        continue

    # split the record by the "," commas
    all_values = record.split(',')

    # inputs: the four day_01_..day_07_ labels rescaled through `dic`
    # (values that are not a known label are passed through unchanged),
    # followed by first28_ViewTV rescaled from 0..28 to 0.01..1.0
    scaled_inputs = [dic.get(value, value) for value in all_values[3:7]]
    scaled_inputs.append(float(all_values[7]) / 28.0 * 0.99 + 0.01)
    # numpy.asfarray was removed in NumPy 2.0; this is the equivalent call
    inputs = numpy.asarray(scaled_inputs, dtype=float)

    # query() returns one row per output node; flatten it so each element is
    # a scalar (float() on a one-element array is deprecated in NumPy >= 1.25)
    outputs = n.query(inputs).ravel()

    # correct answer is the day_00_ label at index 2
    correct_label = label_index[all_values[2]]
    # the index of the highest value corresponds to the predicted label
    label = int(numpy.argmax(outputs))

    # first field is 1 when the network's answer matches the correct answer
    scorecard.append([
        int(label == correct_label),
        float(outputs[0]),
        float(outputs[1]),
        float(outputs[2]),
        float(outputs[label] / outputs.sum()),
        correct_label,
        label,
    ])
    

In [108]:
# calculate the performance score: the fraction of test records whose
# predicted label matched the correct label (column 0 of the scorecard)
scorecard_array = numpy.asarray(scorecard)
correct_flags = scorecard_array[:, 0]
print("Performance = ", correct_flags.sum() / len(scorecard_array))



Performance =  0.939510964613


In [109]:
# Confusion matrix: row = correct label (scorecard column 5),
# column = label chosen by the network (scorecard column 6).
score_summary = numpy.zeros((3, 3))

for scorecard_row in scorecard_array:
    target_index, model_index = int(scorecard_row[5]), int(scorecard_row[6])
    score_summary[target_index, model_index] += 1

# For each class: correctly labelled records / all records of that class.
for class_index, heading in enumerate(("No Data: ", "No View: ", "View TV: ")):
    class_counts = score_summary[class_index, :]
    print(heading, class_counts[class_index] / sum(class_counts))


No Data:  0.976452816776
No View:  0.632062146893
View TV:  0.945500690647


In [None]:
# Experiment log: per-class accuracy noted from earlier runs with different
# settings. Kept as comments so this cell is valid Python under "Run All".

# LR = 0.3
# No Data:  0.976648302826
# No View:  0.389712806026
# View TV:  0.96863454131

# LR = 0.1
# No Data:  0.975866358628
# No View:  0.173375706215
# View TV:  0.979347984772

# LR = 0.5
# No Data:  0.976346188022
# No View:  0.129472693032
# View TV:  0.982537311757

# LR = 0.4
# No Data:  0.97620401635
# No View:  0.31450094162
# View TV:  0.974002493065

# LR = 0.2
# No Data:  0.976097387596
# No View:  0.22233992467
# View TV:  0.976338338181

# Nodes = 20
# No Data:  0.975262129021
# No View:  0.47634180791
# View TV:  0.957179916224

# Nodes = 30
# No Data:  0.976275102186
# No View:  0.631002824859
# View TV:  0.945040259638

# Nodes = 40
# No Data:  0.976186244891
# No View:  0.630532015066
# View TV:  0.947039204016

# No_View x 7
# NOTE(review): these values are identical to the "Nodes = 40" entry above - confirm this run was actually repeated.
# No Data:  0.976186244891
# No View:  0.630532015066
# View TV:  0.947039204016

In [83]:
# Show the confusion-matrix counts, then each correct-label class's share of
# all scored test records (row total / grand total).
print(score_summary)

grand_total = score_summary.sum()
for class_counts in score_summary:
    print(sum(class_counts) / grand_total)

[[  5.49300000e+04   1.71000000e+02   1.16900000e+03]
 [  7.30000000e+01   5.35700000e+03   3.06600000e+03]
 [  1.25100000e+03   3.46500000e+03   8.43310000e+04]]
0.365833837192
0.0552359033372
0.578930259471
