# 11. Machine Learning

## Imports

In [1]:
import random
from typing import TypeVar

## General Information

In [2]:
X = TypeVar("X")
Y = TypeVar("Y")

In [3]:
# Function splits data into fractions [prob, 1 - prob]


def split_data(data: list[X], prob: float) -> tuple[list[X], list[X]]:
    """Randomly split ``data`` into two lists of fractions [prob, 1 - prob].

    A shuffled copy of ``data`` is cut at ``int(len(data) * prob)``; the
    caller's list is left untouched.

    Args:
        data: Items to split.
        prob: Fraction of the items that goes into the first list; must lie
            in the closed interval [0, 1].

    Returns:
        A ``(first, second)`` tuple that together contains every item of
        ``data`` exactly once, in shuffled order.

    Raises:
        ValueError: If ``prob`` is outside [0, 1] (this also rejects NaN).
    """
    # A negative prob would yield a negative cut index, which slices from the
    # END of the list and silently returns a (1 + prob) split; a prob above 1
    # would silently put everything in the first part. Reject both up front.
    if not 0 <= prob <= 1:
        raise ValueError(f"prob must be between 0 and 1, got {prob!r}")

    shuffled = data[:]  # shallow copy so the shuffle does not reorder the input
    random.shuffle(shuffled)
    cut = int(len(shuffled) * prob)
    return shuffled[:cut], shuffled[cut:]


# Sanity check: a 0.75 split of 1000 items must give 750 / 250 items.
data = list(range(1000))
train, test = split_data(data, prob=0.75)

assert (len(train), len(test)) == (750, 250)

# Splitting only reorders: both parts together hold exactly the original items.
assert sorted([*train, *test]) == data

In [4]:
def train_test_split(
    xs: list[X], ys: list[Y], test_pct: float
) -> tuple[list[X], list[X], list[Y], list[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The positions ``0 .. len(xs) - 1`` are split with ``split_data`` (using
    ``1 - test_pct`` as the training fraction) and the same positions are
    then picked from both ``xs`` and ``ys``, so every x stays paired with
    its y.

    Args:
        xs: Input values.
        ys: Output values; ``ys[i]`` belongs to ``xs[i]``.
        test_pct: Fraction of the pairs that goes into the test set.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``xs`` and ``ys`` have different lengths.
    """
    # Without this check a longer ``ys`` would be silently truncated and a
    # shorter one would only surface as an IndexError while indexing below.
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
        )

    # Split the indices rather than the values so both lists are cut identically.
    train_idxs, test_idxs = split_data(list(range(len(xs))), 1 - test_pct)

    return (
        [xs[i] for i in train_idxs],
        [xs[i] for i in test_idxs],
        [ys[i] for i in train_idxs],
        [ys[i] for i in test_idxs],
    )


# Sanity check on 1000 (x, 2x) pairs with a quarter held out for testing.
xs = list(range(1000))
ys = [x * 2 for x in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_pct=0.25)

assert (len(x_train), len(y_train)) == (750, 750)
assert (len(x_test), len(y_test)) == (250, 250)

# Every x must still be paired with its own y after the split.
assert y_train == [2 * x for x in x_train]
assert y_test == [2 * x for x in x_test]

In [5]:
# Accuracy - proportion of correct predictions


def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the share of all predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        ``(tp + tn)`` divided by the sum of all four counts.
    """
    n_predictions = sum((tp, fp, fn, tn))
    return (tp + tn) / n_predictions


# Worked example: 70 + 981070 correct predictions out of 1,000,000 in total.
assert accuracy(tp=70, fp=4930, fn=13930, tn=981070) == 0.98114

In [6]:
# Precision - how accurate were affirmative predictions


def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the share of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives (accepted but not used).
        tn: Number of true negatives (accepted but not used).

    Returns:
        ``tp`` divided by ``tp + fp``.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive


# Worked example: 70 of the 5000 positive predictions were right.
assert precision(tp=70, fp=4930, fn=13930, tn=981070) == 0.014

In [7]:
# Recall - proportion of actual positives that were correctly identified


def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the share of actual positives that were identified.

    Args:
        tp: Number of true positives.
        fp: Number of false positives (accepted but not used).
        fn: Number of false negatives.
        tn: Number of true negatives (accepted but not used).

    Returns:
        ``tp`` divided by ``tp + fn``.
    """
    actual_positive = tp + fn
    return tp / actual_positive


# Worked example: 70 of the 14000 actual positives were found.
assert recall(tp=70, fp=4930, fn=13930, tn=981070) == 0.005

In [8]:
# F1 score - harmonic mean of precision and recall


def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives (only passed through to the helpers).

    Returns:
        ``2 * p * r / (p + r)`` where ``p`` is the precision and ``r`` the
        recall, or ``0.0`` when there are no true positives but at least one
        false positive or false negative.

    Raises:
        ZeroDivisionError: If ``tp``, ``fp`` and ``fn`` are all zero; the
            score is undefined then and the division by zero propagates.
    """
    # F1 equals 2*tp / (2*tp + fp + fn), which is exactly 0 when tp == 0 and
    # fp + fn > 0. The harmonic-mean form below cannot express that case: it
    # would divide by zero in precision, in recall, or in ``p + r``.
    if tp == 0 and fp + fn > 0:
        return 0.0

    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    return 2 * p * r / (p + r)