# Machine learning

<br><br><br>

## Solutions to the first project (do not peek!)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the penguin dataset from a CSV file into a DataFrame.
penguins = pd.read_csv("data/penguins.csv")
# Keep only the two columns used for the 2D fit, drop rows in which either value is missing,
# and convert to a plain 2D NumPy array: column 0 is flipper length (mm), column 1 is body mass (g).
measurements = penguins[["flipper_length_mm", "body_mass_g"]].dropna().values

In [None]:
def body_mass(flipper_length, a, b):
    """Predict body mass from flipper length using the straight line a * flipper_length + b.

    Parameters:
        flipper_length: the input value (the notebook passes flipper lengths in mm).
        a: slope of the line.
        b: intercept of the line.

    Returns:
        The predicted body mass. Only multiplication and addition are used, so the
        arguments may be plain numbers or NumPy arrays that broadcast together.
    """
    scaled_length = a * flipper_length
    return scaled_length + b

In [None]:
def badness_of_fit(a, b, measurements):
    """Return the sum of squared differences between predicted and measured body mass.

    Parameters:
        a: slope passed to body_mass().
        b: intercept passed to body_mass().
        measurements: iterable of (flipper length, body mass) pairs.

    Returns:
        The total of (prediction - measurement)**2 over all pairs; 0 if there are none.
        A smaller value means the line describes the data better.
    """
    squared_errors = (
        (body_mass(measured_length, a, b) - measured_mass) ** 2
        for measured_length, measured_mass in measurements
    )
    # sum() starts from 0 and adds the terms in order, one pair at a time.
    return sum(squared_errors)

In [None]:
def better_fit(i, a, b, measurements):
    """Try a fixed set of steps in one parameter and keep the best improvement.

    Even values of i vary a (by ±0.1, ±1, ±10) while holding b fixed; odd values
    of i vary b (by ±10, ±100, ±1000) while holding a fixed.

    Parameters:
        i: iteration number; only its parity is used.
        a: current slope.
        b: current intercept.
        measurements: data passed through to badness_of_fit().

    Returns:
        A tuple (a, b). It is the candidate with the lowest badness if that is strictly
        lower than the badness of the incoming (a, b); otherwise the incoming values.
    """
    # Every candidate is built from the incoming values, before anything is updated.
    if i % 2 == 0:
        candidates = [(a + step, b) for step in (10, 1, 0.1, -0.1, -1, -10)]
    else:
        candidates = [(a, b + step) for step in (1000, 100, 10, -10, -100, -1000)]

    best = (a, b)
    best_badness = badness_of_fit(a, b, measurements)

    for candidate in candidates:
        candidate_badness = badness_of_fit(*candidate, measurements)
        # Strict comparison: on a tie, the earlier (or original) choice is kept.
        if candidate_badness < best_badness:
            best_badness = candidate_badness
            best = candidate

    return best

In [None]:
# Starting point for the hand-written search.
i = 0   # iteration number
a = 30   # initial guess for the slope of the line
b = -3000   # initial guess for the intercept of the line

In [None]:
# Take one search step (even i adjusts a, odd i adjusts b) and advance the iteration counter.
# Because a, b, and i are updated in place, each re-run of this cell performs one more step.
a, b = better_fit(i, a, b, measurements)
i += 1

fig, ax = plt.subplots()

# Column 0 of measurements is flipper length, column 1 is body mass.
ax.scatter(measurements[:, 0], measurements[:, 1], marker=".")

# Overlay the current line, evaluated at 10 evenly spaced flipper lengths from 165 to 240.
x = np.linspace(165, 240, 10)
y = body_mass(x, a, b)
ax.plot(x, y, color="orange")

badness = badness_of_fit(a, b, measurements)

# A legend with no entries is used as a text box that reports the current state of the fit.
ax.legend([], [], title=f"i = {i}\na = {a:.2f}\nb = {b:.0f}\nbadness = {badness:.2e}", loc="upper left")

# Ending the cell with None keeps the notebook from echoing the legend object.
None

In [None]:
# Save the hand-tuned result under separate names: a and b are reassigned by the
# Scikit-Learn fit below, and these saved values are drawn on the contour plot later.
my_best_a = a
my_best_b = b

<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>

## Linear fit in Scikit-Learn

<img src="img/sklearn-logo.svg" width="300">

In [None]:
import sklearn.linear_model

In [None]:
# Create the linear regression model object; it is not fitted until fit() is called.
linear = sklearn.linear_model.LinearRegression()

In [None]:
# Indexing with a list ([0], [1]) instead of a bare integer keeps each selection 2D,
# with one column: flipper length is the input and body mass is the target.
linear.fit(measurements[:, [0]], measurements[:, [1]])

In [None]:
# The target was passed as a 2D column, so coef_ is 2D and intercept_ is 1D;
# index out the single slope and the single intercept as scalars.
# Note that this overwrites the a and b from the hand-written search.
a = linear.coef_[0, 0]
b = linear.intercept_[0]

# Display the fitted slope and intercept as the cell's output.
a, b

In [None]:
# Plot the data with the Scikit-Learn line on top, in the same layout as the hand-fit plot.
fig, ax = plt.subplots()

ax.scatter(measurements[:, 0], measurements[:, 1], marker=".")

x = np.linspace(165, 240, 10)
# predict() is given a 2D input, so np.newaxis turns the 1D x into a single column.
y = linear.predict(x[:, np.newaxis])
ax.plot(x, y, color="orange")

# Score the Scikit-Learn solution with the same badness function used for the hand-written search.
badness = badness_of_fit(a, b, measurements)

# A legend with no entries is used as a text box that reports the fit parameters.
ax.legend([], [], title=f"a = {a:.2f}\nb = {b:.0f}\nbadness = {badness:.2e}", loc="upper left")

# Ending the cell with None keeps the notebook from echoing the legend object.
None

<br><br><br>

But we didn't do a _standard_ linear fit, in which badness is measured as `(prediction - measurement)**2`; we did an alternate fit, optimizing `abs(prediction - measurement)`.

Fortunately, this is also in Scikit-Learn. Scikit-Learn has _all_ the models!

<br><br><br>

What does the "badness versus `a` and `b`" function look like near the minimum?

In [None]:
# Map out the badness around the current (a, b) as a contour plot.
# NOTE: a, b, and badness are notebook globals; this assumes the cells above were run in order,
# so that they hold the Scikit-Learn solution and its badness.
fig, ax = plt.subplots()

# 401 x 401 grid of trial parameters: a within ±10 and b within ±3000 of the current values.
a_grid, b_grid = np.meshgrid(np.linspace(a - 10, a + 10, 401), np.linspace(b - 3000, b + 3000, 401))
# badness_of_fit only uses arithmetic on a and b, so passing the grids evaluates every grid point at once.
# Contours are drawn at 0.5e7, 1e7, and 1.5e7 above the current badness.
contour = ax.contour(a_grid, b_grid, badness_of_fit(a_grid, b_grid, measurements), levels=[badness + 0.5e7, badness + 1e7, badness + 1.5e7])
ax.clabel(contour)

# "+" marks the current (a, b); "*" marks the result saved from the hand-written search.
ax.scatter([a], [b], marker="+", s=800, color="red")
ax.scatter([my_best_a], [my_best_b], marker="*", s=800, color="red")

ax.set_xlabel("parameter a")
ax.set_ylabel("parameter b")

# Ending the cell with None keeps the notebook from echoing the last return value.
None

It's a long, flat valley in a direction that's diagonal in `a` and `b`.

It's a hard minimum to find if we only take steps in the `a` direction or the `b` direction!

<br><br><br>

This tool can do linear fits in arbitrary numbers of dimensions. How about 4D?

In [None]:
# Select four columns, drop rows in which any of them is missing, and convert to a 2D NumPy array.
# Column order: bill length, bill depth, flipper length (all mm), body mass (g).
measurements4D = penguins[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].dropna().values
# Display the array as the cell's output.
measurements4D

In [None]:
# A separate model object for the 4D fit, so the 2D model in `linear` is left untouched.
linear4D = sklearn.linear_model.LinearRegression()

<br><br><br>

This fits a model that uses bill length, bill depth, and flipper length to predict body mass.

* 2D fit: 1 dimension predicts 1 dimension, best fit is a _line_,
* 3D fit: 2 dimensions predict 1 dimension, best fit is a _plane_,
* 4D fit: 3 dimensions predict 1 dimension, best fit is a _hyperplane_...

In [None]:
# Columns 0-2 (bill length, bill depth, flipper length) are the inputs;
# column 3 (body mass) is the target, selected with [3] so that it stays a 2D column.
linear4D.fit(measurements4D[:, [0, 1, 2]], measurements4D[:, [3]])

Now there are three `a`'s:

In [None]:
# One coefficient per input column, in the order bill length, bill depth, flipper length.
linear4D.coef_

But still only one `b`:

In [None]:
# The constant term of the fitted hyperplane.
linear4D.intercept_

<br><br><br>

We can't easily visualize this, but we can see how well the model predicts some penguin masses.

In [None]:
# Print the model's prediction next to the measured body mass, one penguin per line.
#
# The measured mass is unpacked directly into `actual`. Naming the loop variable `body_mass`
# would overwrite the notebook-level body_mass() function, and any later re-run of a cell
# that calls body_mass() or badness_of_fit() would then fail because a float is not callable.
for bill_length, bill_depth, flipper_length, actual in measurements4D:

    # predict() takes a 2D input (one row per sample) and, because the model was fit with a
    # 2D target, returns a 2D array; [0, 0] extracts the single predicted value.
    prediction = linear4D.predict([[bill_length, bill_depth, flipper_length]])[0, 0]

    print(f"{bill_length = } {bill_depth = } {flipper_length = } | {prediction = :.1f} {actual = }")

<br><br><br>

In [None]:
# Scatter plot of predicted versus measured body mass, on a square figure.
fig, ax = plt.subplots(figsize=(6, 6))

# Predict all penguins in one call; [:, 0] flattens the single-column 2D result to 1D.
predictions = linear4D.predict(measurements4D[:, [0, 1, 2]])[:, 0]
# Column 3 holds the measured body mass.
actuals = measurements4D[:, 3]

ax.scatter(predictions, actuals)

ax.set_xlabel("predictions (g)")
ax.set_ylabel("actuals (g)")

# Ending the cell with None keeps the notebook from echoing the last return value.
None

<br><br><br>

There's a pretty good correlation between the predicted value and the actual value.

(Random guesses would be a correlation of 0 and exactly right would be a correlation of 1.)

In [None]:
# Wrap both arrays in Series to use pandas' corr(), which computes the Pearson correlation by default.
pd.Series(predictions).corr(pd.Series(actuals))

<br><br><br>

## The breadth of machine learning

Scikit-Learn is a toolbox full of machine learning models:

In [None]:
import sklearn.base
import sklearn.cluster as d1
import sklearn.compose as d2
import sklearn.covariance as d3
import sklearn.cross_decomposition as d4
import sklearn.decomposition as d5
import sklearn.ensemble as d6
import sklearn.feature_extraction as d7
import sklearn.feature_selection as d8
import sklearn.gaussian_process as d9
import sklearn.impute as d10
import sklearn.linear_model as d11
import sklearn.manifold as d12
import sklearn.mixture as d13
import sklearn.model_selection as d14
import sklearn.neighbors as d15
import sklearn.neural_network as d16
import sklearn.preprocessing as d17
import sklearn.semi_supervised as d18
import sklearn.svm as d19
import sklearn.tree as d20

# Notes to print beside the two estimators this tutorial covers, keyed by (module name, class name).
highlights = {
    ("sklearn.linear_model", "LinearRegression"): "  <---   This is what we've seen so far!",
    ("sklearn.neural_network", "MLPClassifier"): "  <---   We'll also look at this one: neural networks",
}

# Walk through each imported submodule and list every class in it that derives from BaseEstimator.
for d in [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20]:
    print("*", d.__name__)
    for name in dir(d):
        obj = getattr(d, name)
        is_estimator = isinstance(obj, type) and issubclass(obj, sklearn.base.BaseEstimator)
        if not is_estimator:
            continue
        # Classes without a note get an empty string, exactly as if no highlight applied.
        highlight = highlights.get((d.__name__, name), "")
        print("  -", name, highlight)

<br><br><br>

Artificial intelligence, machine learning, and neural networks have been around for a while.

The current renaissance is due to:

* theoretical improvements in how to implement neural networks _well_,
* very large datasets to train (fit) these neural networks—particularly, the existence of the world wide web,
* computational hardware capable of dealing with very large datasets—particularly, GPUs.

Rise and fall and rise again of words associated with machine learning: frequency of their appearance in books scanned by Google:

In [1]:
%%html
<div style="overflow: hidden;"><iframe src="https://books.google.com/ngrams/graph?content=artificial+intelligence%2Cmachine+learning%2Cdata+mining%2Cneural+network%2Cdeep+learning%2Cmachine+translation&year_start=1950&year_end=2019&case_insensitive=on&corpus=en-2019&smoothing=0" width="100%" height="800" scrolling="no" style="border: none;"></iframe></div>