In [6]:
# You can train neural networks to convert a given dataset of "what you know" to a dataset of "what you want to know"
# Basically, you can train the network to interpret observations.
# First, convert the observation dataset into matrices so the information is interpretable for the network.
    # Convention: use one row for one observation (each set of on/off lights on a 3-light streetlight) 
    # and one column per observed item (whether each light in the set is on or off). 
    # Ideally, you want a "lossless representation" - one where the data and the matrix can be converted back and forth
    # without losing any information.
    
import numpy as np
# Starting weights (one per light) and the learning rate that scales every weight update.
weights = np.array([0.5, 0.48, -0.7])
alpha = 0.1

# Input data pattern: one row per observation, one column per light.
# 0 = light is off, 1 = light is on in a 3-light horizontal stoplight at a crosswalk
streetlights = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
])

# Output data pattern: one row per observation, in the same order as streetlights.
# 0 = stop, 1 = walk
walk_vs_stop = np.array([
    [0],
    [1],
    [0],
    [1],
    [1],
    [0],
])

# First, we can turn streetlights into walk_vs_stop with a neural network, as before.
# NumPy arrays make elementwise addition/multiplication easy; otherwise this is the same as the previous neural networks.
print(streetlights[0] * [2, 2, 2], "elementwise multiplication")
print(streetlights[0] + [2, 2, 2], "elementwise addition")

# Train for 40 passes over the six observations, updating the weights after every single observation.
for iteration in range(40):
    # Restart the running total so it only ever holds the summed squared error of the current pass.
    error_for_all_lights = 0

    # Walk the observations and their targets in lockstep. The row is called light_pattern rather than
    # `input` so the built-in input() is not overwritten for the rest of the notebook session.
    # Each goal_prediction is a one-element array (a row of walk_vs_stop), which is why the summed
    # error prints inside square brackets.
    for light_pattern, goal_prediction in zip(streetlights, walk_vs_stop):
        # Dot product = weighted sum: multiply each light by its weight and add the results into a single number.
        # A light that is off (0) contributes nothing, because 0 * anything = 0, so only the lights that are
        # on influence the prediction.
        prediction = light_pattern.dot(weights)
        error = (goal_prediction - prediction) ** 2
        error_for_all_lights += error

        # delta is how far the prediction overshot (+) or undershot (-) the goal. Multiplying it by the
        # light pattern means only the weights of lights that were on get adjusted.
        delta = prediction - goal_prediction
        weights = weights - (alpha * (light_pattern * delta))

# Runs once, after training: reports the summed squared error of the final pass and the prediction
# made for the last observation of that pass.
print("Error:" + str(error_for_all_lights) + " Prediction:" + str(prediction))

# Stochastic Gradient Descent
# The network goes through the training examples one at a time, updating the weights after each example, and repeats this
# pass over the dataset several times. This lets it adjust the weights for all examples until the network is capable of
# predicting the correct answer for every training example.
# This was essentially what we did in ch5 to train the handwriting neural network. We did not have a separate error for the
# entire dataset, however. We just updated the error for each digit 0-9 and used that error for all instances of that digit
# in the dataset. This doesn't seem to make a difference for the network's learning because we don't actually use the error
# value to learn. We use delta (prediction - goal), which is proportional to the slope of the squared error with respect to
# the prediction: the derivative of (goal - prediction)^2 with respect to prediction is 2 * delta.

# (Average/Full) Gradient Descent
# The network goes through the entire set of training examples and calculates the average weight_delta for the whole dataset.
# Then, the network changes the weights once per pass over the dataset, instead of changing them after every data point.

# Batch Gradient Descent
# Updates the weights after n data points. Batch size is chosen by the user and is typically between 8 and 256. This will be
# discussed more later.

[2 0 2] elementwise multiplication
[3 2 3] elementwise addition
Error:[0.00053374] Prediction:-0.0026256193329783125


In [None]:
# Overfitting
# There is an edge case where the network will predict the right answer but not actually learn. For example, what if the left
# and right weights were 0.5 and -0.5 respectively and our data point was [1, 0, 1]? Then the weighted sum (prediction) would 
# be 0. The prediction was correct (stop), but the network did not learn anything.

# Error is shared among all weights. If some weight configuration accidentally creates perfect correlation between the 
# prediction and the expected output (error = 0), then weights will not be updated properly and the network will not learn from
# this data point.

# Overfitting is really only a problem if you train only on data points that the network cannot learn from. The other data
# points should bump the weights out of this configuration, so learning continues as long as the network sees other data points.

# Networks should be exposed to plenty of data in order to make sure they learn the rule. They need to learn to generalize 
# instead of memorizing some specific examples and reacting accordingly.

# Conflicting Pressure
# 