In [None]:
from IPython.display import Image

AI

Machine Learning

Neural Network

Deep Learning

# AI in general

https://www.tutorialspoint.com/artificial_intelligence/artificial_intelligence_overview.htm

A computer program without AI can answer the specific questions it is meant to solve.

A computer program with AI can answer the generic questions it is meant to solve.

In [None]:
Image(filename='ai_venn.jpg', width = 700) 

In [None]:
Image(filename='function.png', width = 500) 

## Why Python?

NumPy, Pandas, Scikit-Learn, Matplotlib, Tensorflow, PyTorch, Keras, OpenCV

# Machine Learning

Machine learning (“ML”) is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying instead on patterns and inference derived from data.

In [None]:
Image(filename='Types-of-Machine-Learning-algorithms.jpg', width = 900)

## Reinforcement Learning

An algorithm that learns to perform a task simply by trying to maximise rewards it receives for its actions.

This technique is most often used in game-like situations, e.g. playing games such as Go, self-driving cars, trading strategies, balancing electricity grid loads or optimising auction pricing in real time.

In [None]:
Image(filename='Reinforcement_Learning.png', width = 700)

## Supervised Learning

https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

An algorithm uses training data and feedback from humans to learn the relationship of given inputs to desired outputs.

The training data is labelled by humans, e.g. photo X = cat, photo Y = potato etc. The labeling, together with the human-influenced feedback loop to improve the machine-generated results, explains why we term this type of ML “supervised”.

In [None]:
Image(filename='ml_road_map.png', width = 900)

### Classification

Classification is used to predict discrete responses, that is things with fixed values, e.g. number of students in a class (you can’t have half a student). In a legal context, ML classification algorithms are used to classify whether clause X describes French governing law or English governing law.

Eventually, the performance of a classifier, computational power as well as
predictive power, depends heavily on the underlying data that are available for
learning. The five main steps that are involved in training a machine learning
algorithm can be summarized as follows:
1. Selection of features.
2. Choosing a performance metric.
3. Choosing a classifier and optimization algorithm.
4. Evaluating the performance of the model.
5. Tuning the algorithm.

#### Perceptron

In [None]:
Image(filename='perceptron_1.png', width = 900)

In [None]:
Image(filename='perceptron_2.png', width = 900)

In [None]:
Image(filename='perceptron_3.png', width = 900)

In [None]:
Image(filename='cost_func.png', width = 600)

In [None]:
Image(filename='grad_desc.png', width = 800)

In [None]:
Image(filename='grad_desc_math.png', width = 900)

https://medium.com/@tiago.tmleite/neural-networks-multilayer-perceptron-and-the-backpropagation-algorithm-a5cd5b904fde

In [None]:
Image(filename='backprop_math.png', width = 900)

In [None]:
import numpy as np
class Perceptron(object):
    """Rosenblatt perceptron for two-class problems with labels -1 / +1.

    Parameters
    ----------
    eta : float
        Learning rate (between 0.0 and 1.0).
    n_iter : int
        Number of passes (epochs) over the training dataset.

    Attributes
    ----------
    w_ : 1d-array
        Weights after fitting; ``w_[0]`` is the bias term and ``w_[1:]``
        holds one weight per feature.
    errors_ : list
        Number of weight updates (misclassifications) in every epoch.
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta, self.n_iter = eta, n_iter

    def fit(self, X, y):
        """Learn the weights from labelled training data.

        Parameters
        ----------
        X : {array-like}, shape = [n_samples, n_features]
            Training vectors.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object
        """
        n_features = X.shape[1]
        # One extra slot at index 0 for the bias; everything starts at zero.
        self.w_ = np.zeros(n_features + 1)
        self.errors_ = []

        epoch = 0
        while epoch < self.n_iter:
            n_updates = 0
            for sample, label in zip(X, y):
                # The step is zero whenever the current prediction is right.
                step = self.eta * (label - self.predict(sample))
                self.w_[0] += step
                self.w_[1:] += step * sample
                if step != 0.0:
                    n_updates += 1
            self.errors_.append(n_updates)
            epoch += 1
        return self

    def net_input(self, X):
        """Return the weighted sum of the inputs plus the bias."""
        bias = self.w_[0]
        feature_weights = self.w_[1:]
        return np.dot(X, feature_weights) + bias

    def predict(self, X):
        """Return the class label (+1 or -1) after the unit step."""
        activation = self.net_input(X)
        return np.where(activation >= 0.0, 1, -1)

In [None]:
import pandas as pd
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
df.tail()

We use only two features: the first feature column (sepal length) and the third feature column (petal length).

In [None]:
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams["figure.figsize"] = (15, 10)

# Take the first 100 rows of the Iris table and encode the class column
# (index 4) as -1 for 'Iris-setosa' and +1 for anything else.
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', -1, 1)
# Column 0 is sepal length and column 2 is petal length.
X = df.iloc[0:100, [0, 2]].values
plt.scatter(X[:50, 0], X[:50, 1], color='red', marker='o', label='setosa')
plt.scatter(X[50:100, 0], X[50:100, 1], color='blue', marker='x', label='versicolor')
# X[:, 0] (sepal length) is on the x axis and X[:, 1] (petal length) on the
# y axis; the labels were previously swapped.
plt.xlabel('sepal length')
plt.ylabel('petal length')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Train the hand-written Perceptron defined above on the two-feature Iris subset.
ppn = Perceptron(eta=0.1, n_iter=10)
ppn.fit(X, y)

# scikit-learn alternative, kept for reference only (not executed). It is
# written as comments rather than a bare string so it is not echoed as the
# cell's output. Note that scikit-learn's Perceptron takes `max_iter`
# (not `n_iter`).
# NOTE(review): the epochs plot in the next cell reads `ppn.errors_`, which
# is specific to the class above - confirm before swapping estimators.
#
# from sklearn.linear_model import Perceptron
# ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)
# ppn.fit(X, y)

In [None]:
plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Number of misclassifications')
plt.show()

In [None]:
from matplotlib.colors import ListedColormap
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over a two-feature dataset.

    Draws on the current matplotlib axes; the caller adds axis labels, the
    legend and calls ``plt.show()``.

    Parameters
    ----------
    X : ndarray, shape = [n_samples, 2]
        Feature matrix; column 0 goes on the x axis, column 1 on the y axis.
    y : ndarray, shape = [n_samples]
        Class label of every row of ``X`` (at most five distinct values).
    classifier : object
        Fitted model exposing ``predict`` for an array of shape [n_points, 2].
    test_idx : sequence of int, optional
        Row indices of ``X`` to highlight with hollow circles as the test set.
        ``None`` or an empty sequence disables the highlight.
    resolution : float
        Grid spacing used to sample the decision surface.

    Raises
    ------
    ValueError
        If ``y`` holds more classes than there are marker/colour styles.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    if len(classes) > len(markers):
        raise ValueError(
            "plot_decision_regions supports at most %d classes, got %d"
            % (len(markers), len(classes)))
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface: predict on a dense grid padded by one unit
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot class samples
    for idx, cl in enumerate(classes):
        # `color=` with a named colour avoids the ambiguity of passing a
        # single RGBA tuple through `c=`.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    color=colors[idx], marker=markers[idx], label=cl)

    # highlight test samples
    # `is not None` plus a length check, because truth-testing a NumPy index
    # array raises.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[np.asarray(test_idx), :]
        # Hollow markers: current matplotlib rejects c='', so the face
        # colour is set to 'none' and the edge drawn in black.
        plt.scatter(X_test[:, 0], X_test[:, 1], facecolors='none',
                    edgecolors='black', alpha=1.0, linewidth=1, marker='o',
                    s=55, label='test set')

In [None]:
plot_decision_regions(X, y, classifier=ppn)
plt.xlabel('sepal length [cm]')
plt.ylabel('petal length [cm]')
plt.legend(loc='upper left')
plt.show()

https://www.adrianstoll.com/ml/perceptron-learning-algorithm/

#### ADAptive LInear NEuron (Adaline)

In [None]:
Image(filename='adaline.png', width = 1000)

#### SVM (support vector machine)

Another powerful and widely used learning algorithm is the support vector
machine (SVM), which can be considered as an extension of the perceptron. Using
the perceptron algorithm, we minimized misclassification errors. However, in SVMs,
our optimization objective is to maximize the margin. The margin is defined as the
distance between the separating hyperplane (decision boundary) and the training
samples that are closest to this hyperplane, which are the so-called support vectors. 

In [None]:
Image(filename='svm.png', width = 1000)

In [None]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

In [None]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)

In [None]:
plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105,150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.show()

#### SVM with RBF kernel (Radial Basis Function kernel)

XOR problem

In [None]:
np.random.seed(0)
X_xor = np.random.randn(200, 2)
y_xor = np.logical_xor(X_xor[:, 0] > 0, X_xor[:, 1] > 0)
y_xor = np.where(y_xor, 1, -1)
plt.scatter(X_xor[y_xor==1, 0], X_xor[y_xor==1, 1],
c='b', marker='x', label='1')
plt.scatter(X_xor[y_xor==-1, 0], X_xor[y_xor==-1, 1],
c='r', marker='s', label='-1')
plt.ylim(-3.0)
plt.legend()
plt.show()

In [None]:
svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0)
svm.fit(X_xor, y_xor)

In [None]:
plot_decision_regions(X_xor, y_xor, classifier=svm)
plt.legend(loc='upper left')
plt.show()

In [None]:
Image(filename='svm_rbf.png', width = 1000)

### Regression

Regression is used for predicting continuous responses, that is a value within a range, e.g. the height of students in a class is not fixed but rather a sliding scale of all possible human heights. In a legal context, ML regression algorithms could be used to predict the ideal fee quote for a matter with X, Y and Z variables.

In [None]:
Image(filename='svm_reg.png', width = 1000)

## Unsupervised Learning

Unlike supervised learning, unsupervised learning does not require labelled data. This is because unsupervised learning techniques are designed to identify patterns inherent in the structure of the data.

For instance, in a legal context, you might use an unsupervised learning algorithm to identify logical groupings of contracts based on their shared syntax. Upon further human inspection, these groupings might reveal useful insights, e.g. documents with certain counterparties being more similar than not vs. documents without other counterparties.

### Dimensionality Reduction

#### PCA (Principal Component Analysis)

In [None]:
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# load dataset into Pandas DataFrame
df = pd.read_csv(url, names=['sepal length','sepal width','petal length','petal width','target'])
df

In [None]:
from sklearn.preprocessing import StandardScaler
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Separating out the features
x = df.loc[:, features].values
# Separating out the target
y = df.loc[:,['target']].values
# Standardizing the features
x = StandardScaler().fit_transform(x)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = ['principal component 1', 'principal component 2'])

In [None]:
finalDf = pd.concat([principalDf, df[['target']]], axis = 1)
finalDf

In [None]:
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['target'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1'],
               finalDf.loc[indicesToKeep, 'principal component 2'], 
               c = color,
               s = 50)
ax.legend(targets)
ax.grid()

t-SNE (t-distributed stochastic neighbor embedding)

### Clustering

https://scikit-learn.org/stable/modules/clustering.html

#### MNIST

http://yann.lecun.com/exdb/mnist/

In [None]:
Image(filename='mnist.png', width = 1000)

#### K-Means clustering on the handwritten digits data

In [None]:
import numpy as np
from sklearn.datasets import load_digits

data, labels = load_digits(return_X_y=True)
(n_samples, n_features), n_digits = data.shape, np.unique(labels).size

print(
    f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}"
)

# %%
# Define our evaluation benchmark
# -------------------------------
#
# We will first define our evaluation benchmark. During this benchmark, we intend to
# compare different initialization methods for KMeans. Our benchmark will:
#
# * create a pipeline which will scale the data using a
#   :class:`~sklearn.preprocessing.StandardScaler`;
# * train and time the pipeline fitting;
# * measure the performance of the clustering obtained via different metrics.
from time import time
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.cm as cm
from matplotlib.colors import Normalize

cmap = cm.autumn
norm = Normalize(vmin=0, vmax=9)
colors = [cmap(norm(label)) for label in labels]

def bench_k_means(kmeans, name, data, labels):
    """Fit a scaled KMeans pipeline and print one row of benchmark results.

    The row holds the strategy name, the fit time, the inertia, five
    label-based clustering scores and the silhouette score, tab-separated.

    Parameters
    ----------
    kmeans : KMeans instance
        A :class:`~sklearn.cluster.KMeans` instance with the initialization
        already set.
    name : str
        Name given to the strategy; printed in the first column.
    data : ndarray of shape (n_samples, n_features)
        The data to cluster.
    labels : ndarray of shape (n_samples,)
        Ground-truth labels used by the metrics that need supervision.
    """
    # Time pipeline construction and fitting together.
    started = time()
    pipeline = make_pipeline(StandardScaler(), kmeans)
    pipeline.fit(data)
    elapsed = time() - started

    fitted_kmeans = pipeline[-1]
    assigned = fitted_kmeans.labels_
    row = [name, elapsed, fitted_kmeans.inertia_]

    # Scores that compare the true labels with the cluster assignments.
    for score in (
        metrics.homogeneity_score,
        metrics.completeness_score,
        metrics.v_measure_score,
        metrics.adjusted_rand_score,
        metrics.adjusted_mutual_info_score,
    ):
        row.append(score(labels, assigned))

    # The silhouette score is computed from the data itself (on a sample
    # of 300 points).
    row.append(
        metrics.silhouette_score(data, assigned, metric="euclidean",
                                 sample_size=300)
    )

    # One tab-separated line: name, time, inertia, then six 3-decimal scores.
    columns = ["{:9s}", "{:.3f}s", "{:.0f}"] + ["{:.3f}"] * 6
    print("\t".join(columns).format(*row))


# %%
# Run the benchmark
# -----------------
#
# We will compare three approaches:
#
# * an initialization using `kmeans++`. This method is stochastic and we will
#   run the initialization 4 times;
# * a random initialization. This method is stochastic as well and we will run
#   the initialization 4 times;
# * an initialization based on a :class:`~sklearn.decomposition.PCA`
#   projection. Indeed, we will use the components of the
#   :class:`~sklearn.decomposition.PCA` to initialize KMeans. This method is
#   deterministic and a single initialization suffices.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4,
                random_state=0)
bench_k_means(kmeans=kmeans, name="k-means++", data=data, labels=labels)

kmeans = KMeans(init="random", n_clusters=n_digits, n_init=4, random_state=0)
bench_k_means(kmeans=kmeans, name="random", data=data, labels=labels)

pca = PCA(n_components=n_digits).fit(data)
kmeans = KMeans(init=pca.components_, n_clusters=n_digits, n_init=1)
bench_k_means(kmeans=kmeans, name="PCA-based", data=data, labels=labels)

print(82 * '_')

# %%
# Visualize the results on PCA-reduced data
# -----------------------------------------
#
# :class:`~sklearn.decomposition.PCA` allows to project the data from the
# original 64-dimensional space into a lower dimensional space. Subsequently,
# we can use :class:`~sklearn.decomposition.PCA` to project into a
# 2-dimensional space and plot the data and the clusters in this new space.
import matplotlib.pyplot as plt

# Project the digits onto two principal components and cluster the
# projected points.
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02     # spacing between neighbouring mesh points

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] x [y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation="nearest",
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired, aspect="auto", origin="lower")

# Overlay the projected samples as small black dots.
plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3,
            color="w", zorder=10)
plt.title("K-means clustering on the digits dataset (PCA-reduced data)\n"
          "Centroids are marked with white cross")
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
# Hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())
plt.show()

# Neural Networks

## MLP (multi-layer perceptron)

In [None]:
Image(filename='mlp.png', width = 800)

### MNIST

In [None]:
from keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
print(y_train[21])
plt.imshow(X_train[21], cmap='gray')

In [None]:
print(y_train[122])
plt.imshow(X_train[122], cmap='gray')

In [None]:
Image(filename='gray_rgb.png', width = 900)

In [None]:
print("max: ", X_train.max())
print("min: ", X_train.min())

#### Normalization

In [None]:
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

In [None]:
print("max: ", X_train.max())
print("min: ", X_train.min())

In [None]:
Image(filename='norm.jpg', width = 600)

#### 28 x 28 --> 784

In [None]:
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)

In [None]:
X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)

In [None]:
print("X_train: ", X_train.shape)
print("X_test: ", X_test.shape)

#### One-Hot vector

In [None]:
Image(filename='one_hot.jpg', width = 800)

In [None]:
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

In [None]:
# Import from the public `keras.utils` namespace; the private
# `keras.utils.np_utils` module is not available in newer Keras releases.
from keras.utils import to_categorical

# One-hot encode the integer digit labels into 10-column vectors.
# The ndim guard keeps this cell idempotent: running it a second time would
# otherwise re-encode labels that are already one-hot.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, 10)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, 10)

In [None]:
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

In [None]:
print(y_train[122])

### Model definition

In [None]:
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

In [None]:
ip = Input(shape=(784,), name="input")
x = Dense(units=512, name="hiddel_layer", activation="relu")(ip)
op = Dense(units=10, name="prediction", activation="softmax")(x)

model = Model(inputs=ip, outputs=op, name="full_model")

In [None]:
model.summary()

In [None]:
print(784 * 512)

In [None]:
print(784 * 512 + 512)

In [None]:
Image(filename='sigmoid.jpg', width = 800)

In [None]:
Image(filename='relu.png', width = 800)

In [None]:
Image(filename='softmax.png', width = 800)

In [None]:
learning_rate = 0.001
optimizer = Adam(learning_rate, amsgrad=True)

In [None]:
Image(filename='optimizers.gif', width = 800)

In [None]:
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

### Model training

In [None]:
history = model.fit(X_train,
                   y_train,
                   batch_size=100,
                   epochs=5,
                   verbose=1,
                   validation_data=(X_test, y_test))

In [None]:
history.history

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.gca().legend(('loss','val_loss'))
plt.grid()

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.gca().legend(('acc','val_accuracy'))
plt.grid()

In [None]:
Image(filename='mnist_heatmap.png', width = 1000)

In [None]:
origin = X_test[121].reshape(28, 28)
print(origin.shape)
plt.imshow(origin, cmap='gray')

In [None]:
prediction = model.predict(X_test[121].reshape(1, -1), batch_size=1)
print(prediction)

In [None]:
plt.bar(range(10), prediction[0])
plt.xticks(range(10), range(10))

In [None]:
prediction.argmax()

In [None]:
origin = X_test[21].reshape(28, 28)
print(origin.shape)
plt.imshow(origin, cmap='gray')

In [None]:
prediction = model.predict(X_test[21].reshape(1, -1), batch_size=1)
print(prediction)

In [None]:
plt.bar(range(10), prediction[0])
plt.xticks(range(10), range(10))

https://www.gwern.net/Tanks

In [None]:
Image(filename='overfitting.png', width = 1000)

In [None]:
Image(filename='model_comp.png', width = 800)

In [None]:
ip = Input(shape=(784,), name="input")
x = Dense(units=2048, name="hiddel_layer", activation="relu")(ip)
x = Dense(units=2048, name="hiddel_layer_2", activation="relu")(x)
x = Dense(units=512, name="hiddel_layer_3", activation="relu")(x)
op = Dense(units=10, name="prediction", activation="softmax")(x)

model = Model(inputs=ip, outputs=op, name="full_model")

learning_rate = 0.001
optimizer = Adam(learning_rate, amsgrad=True)
model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

model.summary()

In [None]:
history = model.fit(X_train[:1000],
                   y_train[:1000],
                   batch_size=100,
                   epochs=10,
                   verbose=1,
                   validation_data=(X_test, y_test))

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.gca().legend(('acc','val_accuracy'))
plt.grid()

Regularization: L1, L2, augmentation

In [None]:
Image(filename='augmentation.png', width = 1000)

https://playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle&regDataset=reg-plane&learningRate=0.03&regularizationRate=0&noise=5&networkShape=3,2&seed=0.25772&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false

# Deep Learning

Deep learning (“DL”) is a subtype of machine learning. DL can process a wider range of data resources, requires less data preprocessing by humans (e.g. feature labelling), and can sometimes produce more accurate results than traditional ML approaches (although it requires a larger amount of data to do so).

In [None]:
Image(filename='MLvsDL.png', width = 900)

In [None]:
Image(filename='sheep.jpeg', width = 900)

In [None]:
Image(filename='3d_detection.jpg', width = 900)

In [None]:
Image(filename='pose.png', width = 900)

## CNN (convolutional neural network)

In [None]:
Image(filename='convolution.gif', width = 600)

In [None]:
Image(filename='pooling.gif', width = 800)

In [None]:
Image(filename='cnn.jpeg', width = 900)

In [None]:
Image(filename='cnn_2.png', width = 900)

In [None]:
import numpy as np
# Everything in this cell comes from `tensorflow.keras`, so the dataset
# loader is imported from there too rather than from standalone `keras`.
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten
from tensorflow.keras.utils import to_categorical

# Reload MNIST under new names so the flattened, normalised MLP arrays from
# the earlier cells are left untouched.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Normalize the images: divide by 255, then shift by -0.5.
train_images = (train_images / 255) - 0.5
test_images = (test_images / 255) - 0.5

# Reshape the images: append a trailing channel axis to match the
# input_shape=(28, 28, 1) declared on the Conv2D layer below.
train_images = np.expand_dims(train_images, axis=3)
test_images = np.expand_dims(test_images, axis=3)

num_kernels = 8  # filters
kernel_size = 3
pool_size = 2

# Build the model: one convolution, one max-pooling step, then a softmax
# classifier over the flattened feature maps.
model = Sequential([Conv2D(num_kernels, kernel_size, input_shape=(28, 28, 1)),
                    MaxPooling2D(pool_size=pool_size),
                    Flatten(),
                    Dense(10, activation='softmax'),
                    ])

# Compile the model.
model.compile('adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the labels are one-hot encoded on the fly.
model.fit(train_images,
          to_categorical(train_labels),
          epochs=5,
          validation_data=(test_images, to_categorical(test_labels)),
          )

## RNN (Recurrent neural network)

In [None]:
Image(filename='rnn.gif', width = 900)

### LSTM (Long short-term memory)

In [None]:
Image(filename='lstm.gif', width = 900)

### NLP (Natural Language Processing)

In [None]:
Image(filename='nlp.jpg', width = 700)