In [1]:
# Train/test data split

import random
from typing import TypeVar, List, Tuple
X = TypeVar('X') # generic type to represent a data point

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split data into two parts with fractions [prob, 1 - prob].

    The input is never modified: a shallow copy is shuffled with
    `random.shuffle` and then cut at `int(len(data) * prob)`, so the size of
    the first part is truncated toward zero.

    Args:
        data: The data points to split.
        prob: Fraction of the data to put in the first part; must be in [0, 1].

    Returns:
        A `(first, second)` pair of lists that together contain every element
        of `data` exactly once, in shuffled order.

    Raises:
        ValueError: If `prob` is outside [0, 1] (NaN is rejected as well).
    """
    # Without this check a negative prob silently turns into a negative slice
    # index (e.g. -0.25 produces a 75/25 split) and prob > 1 puts everything
    # into the first part.
    if not 0.0 <= prob <= 1.0:
        raise ValueError("prob must be between 0 and 1, got {!r}".format(prob))

    shuffled = list(data)                  # Shallow copy, because shuffle
    random.shuffle(shuffled)               # reorders its argument in place
    cut = int(len(shuffled) * prob)        # Use prob to find a cutoff
    return shuffled[:cut], shuffled[cut:]  # and split the shuffled list there

data = list(range(1000))
train, test = split_data(data, prob=0.75)

# A 75/25 split of 1000 points must produce exactly these sizes
assert (len(train), len(test)) == (750, 250)

# Nothing is lost or duplicated: sorting both parts together restores the input
assert sorted(train + test) == data

In [2]:
Y = TypeVar('Y') # generic type to represent output variables

def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The positions 0..len(xs)-1 are split with `split_data(idxs, 1 - test_pct)`
    and both `xs` and `ys` are then indexed with the same positions, so every
    x stays next to its y.

    Args:
        xs: Input data points.
        ys: Output values; `ys[i]` belongs to `xs[i]`.
        test_pct: Fraction of the pairs to hold out for testing, in [0, 1].

    Returns:
        The tuple `(x_train, x_test, y_train, y_test)`.

    Raises:
        ValueError: If `xs` and `ys` differ in length, or if `test_pct` is
            outside [0, 1].
    """
    # A longer ys would otherwise be silently truncated and a shorter one
    # would fail later with a bare IndexError.
    if len(xs) != len(ys):
        raise ValueError(
            "xs and ys must have the same length, got {} and {}".format(len(xs), len(ys)))
    if not 0.0 <= test_pct <= 1.0:
        raise ValueError("test_pct must be between 0 and 1, got {!r}".format(test_pct))

    # Generate the indices and split them (rather than the data itself)
    # so the same random assignment is applied to xs and ys
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    return ([xs[i] for i in train_idxs],  # x_train
            [xs[i] for i in test_idxs],   # x_test
            [ys[i] for i in train_idxs],  # y_train
            [ys[i] for i in test_idxs])   # y_test

xs = list(range(1000))
ys = [2 * x for x in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, test_pct=0.25)

# Holding out 25% of 1000 pairs leaves 750 for training and 250 for testing
assert len(x_train) == 750 and len(y_train) == 750
assert len(x_test) == 250 and len(y_test) == 250

# Each y must still be the partner (2 * x) of the x at the same position
assert [2 * x for x in x_train] == y_train
assert [2 * x for x in x_test] == y_test

In [3]:
# Accuracy: fraction of correct predictions

def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of all predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives.

    Returns:
        `(tp + tn) / (tp + fp + fn + tn)`.

    Raises:
        ZeroDivisionError: If all four counts are zero.
    """
    return (tp + tn) / (tp + fp + fn + tn)

# Compare with a small tolerance instead of exact float equality
assert abs(accuracy(70, 4930, 13930, 981070) - 0.98114) < 1e-9

In [4]:
# Precision: how accurate our positive predictions are

def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were actually positive.

    Only `tp` and `fp` enter the calculation; `fn` and `tn` are accepted but
    not used, which keeps the signature identical to the other metrics here.

    Returns:
        `tp / (tp + fp)`.

    Raises:
        ZeroDivisionError: If `tp + fp` is zero (no positive predictions).
    """
    predicted_positives = tp + fp
    return tp / predicted_positives

# Compare with a small tolerance instead of exact float equality
assert abs(precision(70, 4930, 13930, 981070) - 0.014) < 1e-9

In [5]:
# Recall: fraction of the positives our model identified

def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that the model identified.

    Only `tp` and `fn` enter the calculation; `fp` and `tn` are accepted but
    not used, which keeps the signature identical to the other metrics here.

    Returns:
        `tp / (tp + fn)`.

    Raises:
        ZeroDivisionError: If `tp + fn` is zero (no actual positives).
    """
    actual_positives = tp + fn
    return tp / actual_positives

# Compare with a small tolerance instead of exact float equality
assert abs(recall(70, 4930, 13930, 981070) - 0.005) < 1e-9

In [6]:
# F1 score: harmonic mean of precision and recall

def f1_score(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    With p = tp / (tp + fp) and r = tp / (tp + fn), the harmonic mean
    2 * p * r / (p + r) simplifies to 2 * tp / (2 * tp + fp + fn). Using the
    simplified form avoids dividing by zero when there are no true positives:
    the score is then 0.0 as long as there is at least one false positive or
    false negative. Results may differ from the two-step formula in the last
    floating-point digit.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tn: Number of true negatives (not used by F1).

    Returns:
        The F1 score, between 0.0 and 1.0 for non-negative counts.

    Raises:
        ZeroDivisionError: If `tp`, `fp` and `fn` are all zero, in which case
            the score is undefined.
    """
    return 2 * tp / (2 * tp + fp + fn)

In [None]:
"""
Precision-recall Tradeoff

A model that predicts "yes" when it's even a little bit confident
will probably have a high recall (more TP and fewer FN; e.g. taking "spam" as the positive class), but a low precision (more FP).

A model that predicts "yes" only when it's extremely confident
will probably have a low recall (fewer TP, more FN), but a high precision (fewer FP).

"""

In [None]:
"""
Bias-Variance Tradeoff

An underfitted model will make a lot of mistakes for pretty much any training set (drawn from the same population),
which means that it has a high "bias".

However, any two randomly chosen training sets should give pretty similar models,
which means that it has a low "variance".

So, underfitting typically yields high bias and low variance. 
-> We need more features that capture regularities in the data; without them, more data won't help with bias.

On the contrary, overfitting typically yields low bias and high variance.
-> We need to remove features (decreasing model complexity) or obtain more data to reduce variance.

"""