<a href="https://colab.research.google.com/github/invtrdan/Machine-Learning/blob/main/Neural_Network_Training_(XOR%2C_Spirals)%2C_Model_Diagrams%2C_Decision_Boundaries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import our standard libraries.
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns  # for nicer plots
sns.set(style='darkgrid')  # default style
import tensorflow as tf

---
## Helper function(s)

* `draw_neural_net(...)`: Draw a neural network diagram
* `plot_decision_boundary(...)`: Plot a model's learned decision boundary, along with the labeled data points


In [None]:
def draw_neural_net(ax, layer_sizes, coefs_, intercepts_,
                    left=0.1, right=.9, bottom=0.1, top=.9):
    '''
    Draw a neural network cartoon using matplotlib.
    Adapted: https://gist.github.com/craffel/2d727968c3aaebd10359

    All arrows, labels, nodes and edges are drawn on `ax` itself (not on
    whatever the current pyplot axes happens to be), and the x/y limits of
    `ax` are set to [0, 1] at the end.

    Args:
        - ax : matplotlib.axes.AxesSubplot
            The axes on which to plot the cartoon (get e.g. by plt.gca())
        - layer_sizes : list of int
            List of layer sizes, including input and output dimensionality
        - coefs_ : indexable by layer number n
            coefs_[n][m, o] is printed (rounded to 2 decimals) on the edge
            from node m of layer n to node o of layer n+1
        - intercepts_ : indexable by layer number n
            intercepts_[n][o] is printed (rounded to 2 decimals) on the edge
            from the bias node after layer n to node o of layer n+1
        - left, right, bottom, top : float
            Extent of the diagram in the data coordinates of `ax`
    '''
    n_layers = len(layer_sizes)
    v_spacing = (top - bottom)/float(max(layer_sizes))
    h_spacing = (right - left)/float(n_layers - 1)
    node_radius = v_spacing/8.

    def layer_top_y(layer_size):
        # y coordinate of the topmost node of a layer; layers are centred vertically
        return v_spacing*(layer_size - 1)/2. + (top + bottom)/2.

    # Input-Arrows
    layer_top_0 = layer_top_y(layer_sizes[0])
    for m in range(layer_sizes[0]):
        ax.arrow(left-0.18, layer_top_0 - m*v_spacing, 0.12, 0,
                 lw=1, head_width=0.01, head_length=0.02)

    # Nodes
    for n, layer_size in enumerate(layer_sizes):
        layer_top = layer_top_y(layer_size)
        x_node = n*h_spacing + left
        for m in range(layer_size):
            y_node = layer_top - m*v_spacing
            circle = plt.Circle((x_node, y_node), node_radius,
                                color='w', ec='k', zorder=4)
            if n == 0:
                ax.text(left-0.125, y_node, r'$X_{'+str(m+1)+'}$', fontsize=15)
            elif n_layers == 3 and n == 1:
                # Hidden nodes are only labelled when the network has exactly three layers
                ax.text(x_node+0.00, y_node + (node_radius+0.01*v_spacing),
                        r'$H_{'+str(m+1)+'}$', fontsize=15)
            elif n == n_layers - 1:
                ax.text(x_node+0.10, y_node, r'$y_{'+str(m+1)+'}$', fontsize=15)
            ax.add_artist(circle)

    # Bias-Nodes: one per gap between consecutive layers, placed just above `top`
    for n in range(n_layers - 1):
        x_bias = (n+0.5)*h_spacing + left
        y_bias = top + 0.005
        circle = plt.Circle((x_bias, y_bias), node_radius, color='w', ec='k', zorder=4)
        ax.text(x_bias-(node_radius+0.10*v_spacing-0.01), y_bias, r'$1$', fontsize=15)
        ax.add_artist(circle)

    # Edges
    # Edges between nodes
    for n, (layer_size_a, layer_size_b) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        layer_top_a = layer_top_y(layer_size_a)
        layer_top_b = layer_top_y(layer_size_b)
        xm = n*h_spacing + left
        xo = (n + 1)*h_spacing + left
        for m in range(layer_size_a):
            ym = layer_top_a - m*v_spacing
            for o in range(layer_size_b):
                yo = layer_top_b - o*v_spacing
                ax.add_artist(plt.Line2D([xm, xo], [ym, yo], c='k'))

                # Rotate the weight label so it runs along its edge
                rot_mo_rad = np.arctan((yo-ym)/(xo-xm))
                rot_mo_deg = rot_mo_rad*180./np.pi
                xm1 = xm + (node_radius+0.05)*np.cos(rot_mo_rad)
                if n == 0 and not yo > ym:
                    # Non-rising edges out of the input layer get a slightly larger offset
                    ym1 = ym + (node_radius+0.1)*np.sin(rot_mo_rad) + .01
                else:
                    ym1 = ym + (node_radius+0.08)*np.sin(rot_mo_rad)
                ax.text(xm1, ym1,
                        str(round(coefs_[n][m, o],2)),
                        rotation = rot_mo_deg,
                        fontsize = 10)

    # Edges between bias and nodes
    for n, layer_size_b in enumerate(layer_sizes[1:]):
        layer_top_b = layer_top_y(layer_size_b)
        x_bias = (n+0.5)*h_spacing + left
        y_bias = top + 0.005
        xo = (n + 1)*h_spacing + left
        for o in range(layer_size_b):
            yo = layer_top_b - o*v_spacing
            ax.add_artist(plt.Line2D([x_bias, xo], [y_bias, yo], c='k'))

            # Place the bias label near the target node, rotated along the edge
            rot_bo_rad = np.arctan((yo-y_bias)/(xo-x_bias))
            rot_bo_deg = rot_bo_rad*180./np.pi
            xo2 = xo - (node_radius+0.01)*np.cos(rot_bo_rad)
            yo2 = yo - (node_radius+0.01)*np.sin(rot_bo_rad)
            xo1 = xo2 -0.08 * np.cos(rot_bo_rad)
            yo1 = yo2 -0.05 * np.sin(rot_bo_rad)
            ax.text(xo1, yo1,
                    str(round(intercepts_[n][o],2)),
                    rotation = rot_bo_deg,
                    fontsize = 10)

    # Output-Arrows
    layer_top_0 = layer_top_y(layer_sizes[-1])
    for m in range(layer_sizes[-1]):
        ax.arrow(right+0.015, layer_top_0 - m*v_spacing, 0.16*h_spacing, 0,
                 lw=1, head_width=0.01, head_length=0.02)

    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])

In [None]:
def plot_decision_boundary(X, Y, model, steps=100, size=3, cmap='bwr'):
    """
    Function to plot the decision boundary and data points of a model.
    Data points are colored based on their actual label.

    Args:
        - X : 2-D array; columns 0 and 1 are used as the plot's x and y
        - Y : per-point values used to color the scatter markers
        - model : object whose predict() is called on a (steps*steps, 2) grid
            and must return one value per grid point
        - steps : number of grid points along each axis
        - size : scatter marker size
        - cmap : colormap name shared by the contour and the scatter

    Returns:
        (fig, ax) of the newly created figure.
    """
    cmap = plt.get_cmap(cmap)

    # Define region of interest by data limits (padded by 1 on every side)
    xmin, xmax = X[:,0].min() - 1, X[:,0].max() + 1
    ymin, ymax = X[:,1].min() - 1, X[:,1].max() + 1
    x_span = np.linspace(xmin, xmax, steps)
    y_span = np.linspace(ymin, ymax, steps)
    xx, yy = np.meshgrid(x_span, y_span)

    # Make predictions across region of interest
    labels = model.predict(np.c_[xx.ravel(), yy.ravel()])

    # Plot the model's output over the region of interest
    z = labels.reshape(xx.shape)

    fig, ax = plt.subplots()
    ax.contourf(xx, yy, z, cmap=cmap, alpha=0.3)

    # Overlay the data points, colored by their true labels. (The model's
    # predictions on X are not needed here, so they are no longer computed.)
    ax.scatter(X[:,0], X[:,1], c=Y, cmap=cmap, s=size)

    return fig, ax

---
## The XOR Problem

We'll start by generating data to replicate the XOR logical operator. As a reminder, XOR(x1, x2) = 1 if *exactly* one of the inputs x1 or x2 is 1.

In [None]:
# The four rows of the XOR truth table: inputs in X, expected outputs in Y.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([0, 1, 1, 0])

# Split the inputs by label, then draw positives as red '+' markers and
# negatives as black '_' markers.
pos_X, neg_X = X[Y == 1], X[Y == 0]
plt.scatter(pos_X[:, 0], pos_X[:, 1], marker='+', c='r', s=1000, linewidth=4)
plt.scatter(neg_X[:, 0], neg_X[:, 1], marker='_', c='k', s=1000, linewidth=4)
plt.xlim(-0.15, 1.15)
plt.ylim(-0.15, 1.15)
plt.show()

### Linear Models

The XOR dataset is *not* linearly separable. We cannot draw a single linear decision boundary to separate the {-} and {+} data points.

Here, we'll confirm that a linear model will perform poorly when trained to solve XOR.

In [None]:
def build_linear_xor_model():
  """Return a compiled model with a single sigmoid unit on 2-D inputs.

  The Keras session is cleared and the random seed is set to 0 before the
  model is built. The model is compiled with binary cross-entropy loss and
  SGD at learning rate 1.
  """
  tf.keras.backend.clear_session()
  tf.keras.utils.set_random_seed(0)

  linear_model = tf.keras.Sequential()

  # Declare the 2-feature input up front.
  linear_model.add(tf.keras.Input(shape=(2,), name='Input'))

  # No hidden layers: the lone Dense unit is the whole binary classifier.
  output_layer = tf.keras.layers.Dense(units=1, activation='sigmoid', name='Output')
  linear_model.add(output_layer)

  sgd = tf.keras.optimizers.SGD(learning_rate=1)
  linear_model.compile(loss='binary_crossentropy', optimizer=sgd)
  return linear_model

In [None]:
# Build the model and show a summary.
xor_model = build_linear_xor_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
xor_model.summary()

# Train a model and show loss
history = xor_model.fit(
    x=X,
    y=Y,
    epochs=100,
    batch_size=4,
    verbose=0)
losses = history.history['loss']
plt.plot(losses)
plt.xlabel('Training Step')
plt.ylabel('Cross-Entropy Loss')
plt.show()

# Show data and predictions
preds = xor_model.predict(X)
# The model has one output unit, so predict() yields one column; take column 0
# so each ŷ is a scalar rather than a 1-element array when formatted.
for (x1, x2), y, y_hat in zip(X, Y, preds[:, 0]):
  print("x [%d %d]  y [%d]  ŷ [%.4f]" %(x1, x2, y, y_hat))

# Show learned model
w, b = xor_model.layers[0].get_weights()
# w is a (2, 1) kernel for 2 inputs and 1 unit; index both axes to get scalars.
print('f(x) = sigmoid(%.4f + %.4f*x1 + %.4f*x2)' %(b[0], w[0, 0], w[1, 0]))

In [None]:
# Draw the single-layer model on a square, axis-free canvas spanning [0, 1].
fig = plt.figure(figsize=(5, 5))
ax = fig.gca()
ax.set_axis_off()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
draw_neural_net(ax, layer_sizes=[2, 1], coefs_={0: w}, intercepts_={0: b})

The cells above show that:
* The model weights in the diagram are all 0, so the *score* assigned to any input is always 0.
* $\hat{y}$ = z(0) = 0.5 for every datapoint. That means our model classifies every input as being equally likely to belong to either class. In other words, *our model is performing no better than guessing the label at random*.


### Neural Network Models

Now, we'll build a neural network with hidden, intermediate layers as discussed during lecture. In addition to adding more layers, we will also introduce "nonlinearities" called "activation functions" in between the layers. You've already seen some of these, such as the sigmoid function from binary classification.

We'll see that this network will be able to solve XOR and achieve better performance than the linear model.

In [None]:
def build_ffnn_xor_model(hidden_layers=()):
  """Return a compiled feed-forward binary classifier for 2-D inputs.

  The Keras session is cleared and the random seed is set to 1 before the
  model is built. The model is compiled with binary cross-entropy loss and
  SGD at learning rate 1.

  Args:
    hidden_layers: iterable of ints, the number of units in each ReLU hidden
      layer, in order. Example: [2, 4, 2] creates 3 intermediate layers with
      2, 4, and 2 units, respectively. Empty means no hidden layers.

  Returns:
    The compiled tf.keras.Sequential model.
  """
  tf.keras.backend.clear_session()
  tf.keras.utils.set_random_seed(1)

  hidden_layers = list(hidden_layers)
  model = tf.keras.Sequential()

  # Set input shape in advance
  model.add(tf.keras.Input(shape=(2,), name='Input'))

  # Add intermediate layers to our model
  for index, num_nodes in enumerate(hidden_layers, start=1):
    # Sequential models require unique layer names, so number the hidden
    # layers when there are several. A single hidden layer keeps the plain
    # name 'Hidden'.
    if len(hidden_layers) == 1:
      layer_name = 'Hidden'
    else:
      layer_name = 'Hidden_{}'.format(index)
    model.add(tf.keras.layers.Dense(units=num_nodes, # Number of nodes in the layer
                                    activation='relu', # Activation function -ve values to 0
                                    name=layer_name))

  # Set output to produce a single value using `sigmoid` as our final activation function
  model.add(tf.keras.layers.Dense(units=1, activation='sigmoid', name='Output'))

  model.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.SGD(learning_rate=1))
  return model

In [None]:
# Build the model and show a summary.
xor_model = build_ffnn_xor_model(hidden_layers=[2])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
xor_model.summary()

In [None]:
# Train a model and plot the cross-entropy loss.
history = xor_model.fit(
    x = X,
    y = Y,
    epochs=2000,
    batch_size=4,
    verbose=0)
losses = history.history['loss']
plt.plot(losses)
plt.xlabel('Training Step')
plt.ylabel('Cross-Entropy Loss')
plt.show()

# Make predictions for each datapoint.
preds = xor_model.predict(X)
# The model has one output unit, so predict() yields one column; take column 0
# so each ŷ is a scalar rather than a 1-element array when formatted.
for (x1, x2), y, y_hat in zip(X, Y, preds[:, 0]):
  print("x [%d %d]  y [%d]  ŷ [%.4f]" %(x1, x2, y, y_hat))

In [None]:
# Unpack (weights, biases) for the first and second layers of the model.
(w0, b0), (w1, b1) = (xor_model.layers[k].get_weights() for k in (0, 1))

# Draw the 2-2-1 network with its learned parameters on an axis-free canvas.
fig = plt.figure(figsize=(8, 8))
ax = fig.gca()
ax.set_axis_off()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
draw_neural_net(ax,
                layer_sizes=[2, 2, 1],
                coefs_={0: w0, 1: w1},
                intercepts_={0: b0, 1: b1})

### Forward Pass Prediction

Use the learned weights shown in the model diagram to fill in the intermediate values $h_1$ and $h_2$ and the final predictions $\hat{y}$. Compare the predictions to the true labels $y$.

Remember to use the right activation function for each node corresponding to how we designed our neural network!

Hint:
* h_1 = z(x_1 * w0[0, 0] + x_2 * w0[1, 0] + 1 * b0[0])
* h_2 = z(x_1 * w0[0, 1] + x_2 * w0[1, 1] + 1 * b0[1])
* y = z(h_1 * w1[0, 0] + h_2 * w1[1, 0] + 1 * b1[0])

$x_1$ | $x_2$ | $h_1$ | $h_2$ | $\hat{y}$ | $y$
-|-|-|-|-|-
0|0| TODO | TODO | TODO | TODO
0|1| TODO | TODO | TODO | TODO
1|0| TODO | TODO | TODO | TODO
1|1| TODO | TODO | TODO | TODO

In [None]:
def relu(x):
  """Return max(0, x); applied elementwise when `x` is a NumPy array."""
  # np.maximum broadcasts over arrays, whereas the builtin max() raises on
  # arrays with more than one element.
  return np.maximum(0, x)

def sigmoid(x):
  """Return the logistic function 1 / (1 + e^(-x)) of `x`."""
  denominator = 1 + np.exp(-x)
  return 1 / denominator

# Run the forward pass by hand for each XOR input, in truth-table order:
# ReLU on the two hidden units, then sigmoid on the output unit.
for x1, x2 in [(0, 0), (0, 1), (1, 0), (1, 1)]:
  print("x1={0}, x2={1}:".format(x1, x2))
  h1 = relu(x1 * w0[0, 0] + x2 * w0[1, 0] + b0[0])
  h2 = relu(x1 * w0[0, 1] + x2 * w0[1, 1] + b0[1])
  y_pred = sigmoid(h1 * w1[0, 0] + h2 * w1[1, 0] + b1[0])
  print("h1:", h1)
  print("h2:", h2)
  print("y_pred:", y_pred)

### Decision Boundary

Use the helper function to draw the decision boundary learned by the XOR model.

As a reminder, a decision boundary represents some line, curve, or "hyperplane" that a machine learning classifier learns to distinguish datapoints belonging to different classes.

In [None]:
# Shade the XOR model's predictions and overlay the four labeled points.
plot_decision_boundary(X=X, Y=Y, model=xor_model, size=30)
plt.show()

## The Spiral Problem

Here is another interesting dataset that is also not linearly separable, and as a result cannot be solved by a linear model. We'll talk about this more during the upcoming lectures, so this is just a sneak peek.

In [None]:
def generate_spiral_data(num=1000, seed=1):
  """Generate a shuffled dataset of two interleaved noisy spiral arms.

  Note: this reseeds NumPy's global random generator via np.random.seed.

  Args:
    num: number of points per class (the result has 2 * num rows).
    seed: value passed to np.random.seed; the default of 1 reproduces the
      original fixed-seed dataset.

  Returns:
    X: float array of shape (2 * num, 2) with the point coordinates.
    Y: int array of shape (2 * num,) with labels 0 (first arm) and 1
      (second arm), shuffled with the same permutation as X.
  """
  np.random.seed(seed)
  points = []

  # The delta values correspond to different offsets for the 2 classes.
  for delta in [0, np.pi]:
    for i in range(num):
      # Radius grows from 0 towards 5 and the angle sweeps 1.75 turns.
      r = 1.0 * i / num * 5
      t = 1.75 * i / num * 2 * np.pi + delta
      # Add uniform noise in [-0.1, 0.1) to each coordinate.
      x0 = r * np.sin(t) + (2 * np.random.rand() - 1) * 0.1
      x1 = r * np.cos(t) + (2 * np.random.rand() - 1) * 0.1
      points.append([x0, x1])

  # reshape keeps X two-dimensional even when num == 0.
  X = np.array(points).reshape(-1, 2)
  Y = np.concatenate([np.zeros(num), np.ones(num)]).astype(int)

  shuf_idx = np.random.permutation(len(Y))
  return X[shuf_idx], Y[shuf_idx]

# Generate 1000 points per class and plot them colored by label.
X, Y = generate_spiral_data(num=1000)
plt.scatter(X[:, 0], X[:, 1], s=3, c=Y, cmap='bwr')
plt.show()

In [None]:
def build_spiral_model(hidden_layer_sizes=(), seed=10):
  """Return a compiled feed-forward binary classifier for 2-D inputs.

  The Keras session is cleared and the random seed is set to `seed` before
  the model is built. The model is compiled with binary cross-entropy loss
  and SGD at learning rate 0.05.

  Args:
    hidden_layer_sizes: iterable of ints, the number of units in each ReLU
      hidden layer, in order. Empty means no hidden layers.
    seed: value passed to tf.keras.utils.set_random_seed.

  Returns:
    The compiled tf.keras.Sequential model.
  """
  tf.keras.backend.clear_session()
  tf.keras.utils.set_random_seed(seed)

  model = tf.keras.Sequential()
  model.add(tf.keras.Input(shape=(2,), name='Input'))
  for hidden_layer_size in hidden_layer_sizes:
    model.add(tf.keras.layers.Dense(units=hidden_layer_size,
                                    activation='relu'))
  # A single sigmoid unit produces the final output value.
  model.add(tf.keras.layers.Dense(units=1, activation='sigmoid', name='Output'))
  model.compile(loss='binary_crossentropy',
                optimizer=tf.keras.optimizers.SGD(learning_rate=.05))
  return model

### Neural Network Models

In [None]:
layer_sizes = [8, 8, 4, 4]
spiral_model = build_spiral_model(hidden_layer_sizes=layer_sizes)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
spiral_model.summary()

# Per-epoch losses collected across every training round. Each fit() call
# returns a fresh History covering only its own 50 epochs, so they are
# accumulated here to plot the whole run.
losses = []

# Predictions of the untrained model, for comparison.
plot_decision_boundary(X, Y, spiral_model, size=3)
plt.show()

# Train in 20 rounds of 50 epochs, plotting the model's predictions after each round.
for i in range(20):
  history = spiral_model.fit(
      x=X, y=Y, epochs=50, batch_size=128, verbose=0)
  losses.extend(history.history['loss'])

  plot_decision_boundary(X, Y, spiral_model, size=3)
  plt.show()


plt.plot(losses)
plt.xlabel('Training Step')
plt.ylabel('Cross-Entropy Loss')
# Annotate the end of the curve with the final loss value.
label = '{:.2f}'.format(losses[-1])
plt.annotate(label, (len(losses), losses[-1]))
plt.show()

In [None]:
# Collect each layer's weights and biases, keyed by layer index, in the
# form draw_neural_net expects.
weights_as_dict = {}
biases_as_dict = {}
for i, layer in enumerate(spiral_model.layers[:len(layer_sizes) + 1]):
  w, b = layer.get_weights()
  weights_as_dict[i], biases_as_dict[i] = w, b

# Draw the full network (2 inputs, the hidden layers, 1 output) without axes.
fig = plt.figure(figsize=(12, 12))
ax = fig.gca()
ax.set_axis_off()
draw_neural_net(ax,
                layer_sizes=[2] + layer_sizes + [1],
                coefs_=weights_as_dict,
                intercepts_=biases_as_dict)

### Decision Boundary

In [None]:
# Shade the trained spiral model's predictions and overlay the labeled points.
plot_decision_boundary(X=X, Y=Y, model=spiral_model, size=3)
plt.show()