# Chapter 11 - Machine Learning

### Modeling

#### Import Packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from typing import TypeVar, List, Tuple

### Overfitting & Underfitting

In [4]:
# Generic placeholder for "whatever type the data points are".
X = TypeVar("X")

In [5]:
def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly partition *data* into two lists.

    A shuffled shallow copy is cut at ``int(len(data) * prob)``: the first
    piece holds that many items, the second piece holds the rest. The
    caller's list is left untouched.
    """
    shuffled = data[:]  # work on a copy so the shuffle never reorders the input
    random.shuffle(shuffled)
    boundary = int(len(shuffled) * prob)
    head, tail = shuffled[:boundary], shuffled[boundary:]
    return head, tail

In [7]:
# Shuffle-split the integers 0..999, asking for a 0.75 training fraction.
data = list(range(1000))
train, test = split_data(data, 0.75)

In [8]:
# The 0.75 cut over 1000 items must yield 750 training and 250 test items.
assert (len(train), len(test)) == (750, 250)

In [9]:
# Together the two pieces must contain exactly the original items.
assert sorted([*train, *test]) == data

In [10]:
# Generic placeholder for "whatever type the output values are".
Y = TypeVar("Y")

In [14]:
def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y],
                                               List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    Positions (not values) are shuffled and split, so ``xs[i]`` and
    ``ys[i]`` always land in the same partition.

    Returns ``(x_train, x_test, y_train, y_test)`` -- note that both x
    lists come before both y lists.
    """
    positions = list(range(len(xs)))
    train_idxs, test_idxs = split_data(positions, 1 - test_pct)

    x_train = [xs[i] for i in train_idxs]
    x_test = [xs[i] for i in test_idxs]
    y_train = [ys[i] for i in train_idxs]
    y_test = [ys[i] for i in test_idxs]
    return x_train, x_test, y_train, y_test

In [15]:
xs = list(range(1000))
ys = [2 * x for x in xs]
# train_test_split returns (x_train, x_test, y_train, y_test); unpack in that
# exact order so every x list stays paired with its matching y list.
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)

In [17]:
# assert len(x_train) == len(y_train) == 750
# assert len(x_test) == len(y_test) == 250

In [19]:
# assert all(y == 2 * x for x, y in zip(x_train, y_train))
# assert all(y == 2 * x for x, y in zip(x_test, y_test))

In [21]:
# NOTE(review): illustrative workflow sketch, not runnable as written.
# Calling the string literal "SomeKindOfModel" raises TypeError ('str' object
# is not callable); presumably it stands in for a real model class exposing
# train() and test() -- TODO replace with an actual model.
model = "SomeKindOfModel"()
# Hold out roughly a third of the (x, y) pairs as test data. The names are
# unpacked in the order train_test_split returns them.
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)
performance = model.test(x_test, y_test)

### Correctness

In [22]:
def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of predictions that were correct.

    Correct predictions are the true positives plus the true negatives;
    the denominator is the sum of all four confusion-matrix counts.
    """
    return (tp + tn) / (tp + fp + fn + tn)

In [24]:
# The four counts must total 1,000,000 for the expected value to hold:
# (70 + 981070) / (70 + 4930 + 13930 + 981070) == 0.98114.
assert accuracy(70, 4930, 13930, 981070) == 0.98114

In [25]:
def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the share of positive predictions that were actually positive.

    Only ``tp`` and ``fp`` enter the calculation; ``fn`` and ``tn`` are
    accepted but not used.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive

In [26]:
# Counts describe one confusion matrix totalling 1,000,000:
# 70 / (70 + 4930) == 0.014.
assert precision(70, 4930, 13930, 981070) == 0.014

In [27]:
def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the share of actual positives that were predicted positive.

    Only ``tp`` and ``fn`` enter the calculation; ``fp`` and ``tn`` are
    accepted but not used.
    """
    actual_positive = tp + fn
    return tp / actual_positive

In [28]:
# Counts describe one confusion matrix totalling 1,000,000:
# 70 / (70 + 13930) == 0.005.
assert recall(70, 4930, 13930, 981070) == 0.005

In [29]:
def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    The four arguments are passed straight through to ``precision`` and
    ``recall``.

    Returns 0.0 when precision and recall are both zero (for example when
    there are no true positives) instead of dividing by zero. Any
    ZeroDivisionError raised by ``precision`` or ``recall`` themselves
    propagates unchanged.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    # With p == r == 0 the harmonic mean is 0/0; by convention F1 is 0 there.
    if p + r == 0:
        return 0.0

    return 2 * p * r / (p + r)