In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
assert tf.__version__ >= "2.0"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ann"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

In [2]:
# Perceptrons
# Fit a Perceptron that separates class 0 of the Iris dataset from the other
# classes, using only the two petal measurements.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import Perceptron

iris = load_iris()

# Columns 2 and 3 of the data matrix: petal length, petal width
X = iris.data[:, [2, 3]]
# 1 where the target is class 0, 0 everywhere else
y = (iris.target == 0).astype(int)

# fit() returns the estimator itself, so construction and training are chained
per_clf = Perceptron(random_state=42, tol=1e-3, max_iter=1000).fit(X, y)

# Classify a single sample: petal length 2, petal width 0.5
y_pred = per_clf.predict(np.array([[2, 0.5]]))

In [3]:
# Display the prediction stored in y_pred (a bare last expression becomes the cell output).
y_pred

array([1])

In [4]:
# The Perceptron class is equivalent to SGDClassifier (Stochastic Gradient Descent) with the following hyperparameters:
# loss="perceptron", learning_rate="constant", eta0=1 (the learning rate), penalty=None (no regularization)

# Perceptrons do not output class probabilities; they make predictions based on a hard threshold (Logistic Regression is the better choice when probabilities are needed)

# A single perceptron is a linear classification model - downside: it cannot solve some trivial problems, such as XOR logic
# However, some of these limitations can be eliminated by stacking multiple perceptrons together -> Multi-Layer Perceptron (MLP)
# - an MLP can solve the XOR problem

In [5]:
# Multi-Layer Perceptron (MLP) and Backpropagation (TLU - threshold logic unit)

# Backpropagation training algorithm:
# - basically a Gradient Descent using an efficient technique for computing the gradients automatically
# - in just two passes (forward and backward) - can compute the gradient of the network's error w.r.t. every single model parameter
# -> how each connection weight and bias term should be tweaked to reduce the error
# - then just performs a regular Gradient Descent step to tweak the weights
# -> repeat until convergence

# Many methods exist to compute gradients automatically - the one used here is called reverse-mode autodiff
# - fast and precise - well suited when the function to differentiate has many variables (e.g. connection weights) and few outputs (e.g. one loss)
# - description on pages 287-288

# It is important to initialize the hidden layers' weights randomly
# - if all of them are the same, e.g. all weights/biases=0, then backpropagation will affect them all in exactly the same way, so they remain identical

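# For backpropagation to work, a new activation function was needed
# - the step function was replaced with the logistic function - the step function is flat everywhere, so it gives Gradient Descent no gradient to work with, whereas the logistic function has a nonzero gradient everywhere
# - the function needs to be continuous and differentiable
# - many options exist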

In [6]:
# Regression MLPs
# - generally do not want to use an activation function for an output neuron - should be free to output any range of values
# - but can put some restrictions, e.g. if you want an output to only be positive - ReLU function
# - logistic function/hyperbolic tangent - can be used to make predictions fall within a given range, after scaling the labels to the appropriate range (0 to 1 for logistic, -1 to 1 for tanh)

# Loss is usually mean squared error
# - Mean absolute error can also be used, especially if you have a lot of outliers in the training set
# - Huber loss - combination of both

In [8]:
# Classification MLPs
# - for binary classification - just need one output neuron with the logistic activation function - outputs a number between 0 and 1 (e.g. estimated probability of the positive class)
# - multilabel binary classification tasks - more output neurons are necessary - one per binary label, each estimating the probability of its positive class
#   - output probabilities do not necessarily add up to 1 - allows the model to output any combination of labels

# Multiclass classification
# in the case where each instance belongs to only a single class, out of 3 or more - e.g. digit classification (0 - 9)
# - need to have one output neuron per class, with the softmax activation function for the whole output layer
#  -> ensures that all the estimated probabilities are between 0 and 1 and that they add up to 1, which is required when the classes are exclusive

# When predicting probability distributions - the cross-entropy loss is generally a good choice