# 05. Statistics

## Imports

In [1]:
import math
from collections import Counter

In [2]:
# Load helper functions from the linear algebra notebook (sum_of_squares and dot are used below but not defined here)

%run -i 04_linear_algebra.ipynb

## Central Tendencies

In [3]:
# Arithmetic mean

def mean(xs: list[float]) -> float:
    """Return the arithmetic mean of ``xs``: the sum of the values divided by their count.

    An empty ``xs`` raises ``ZeroDivisionError`` because the count is zero.
    """
    total = sum(xs)
    count = len(xs)
    return total / count

In [4]:
# Median - the middle value of the sorted data (the average of the two middle values when the count is even)

def _median_odd(xs: list[float]) -> float:
    """Return the middle element of ``xs`` in sorted order (odd-length input)."""
    ordered = sorted(xs)
    middle = len(xs) // 2
    return ordered[middle]


def _median_even(xs: list[float]) -> float:
    """Return the average of the two middle elements of ``xs`` in sorted order (even-length input)."""
    ordered = sorted(xs)
    upper = len(xs) // 2
    lower = upper - 1
    return (ordered[lower] + ordered[upper]) / 2


def median(v: list[float]) -> float:
    """Return the median of ``v``.

    Odd-length input yields the single middle element of the sorted data;
    even-length input yields the average of the two middle elements.
    """
    if len(v) % 2 == 1:
        return _median_odd(v)
    return _median_even(v)


# Sanity checks: one odd-length and one even-length example.
assert median([1, 10, 2, 9, 5]) == 5
assert median([1, 9, 2, 10]) == (2 + 9) / 2

In [5]:
# Quantile - the value below which a given fraction p of the data lies (the median is the p = 0.5 case)

def quantile(xs: list[float], p: float) -> float:
    """Return the value at the ``p``-th quantile of ``xs``.

    The data is sorted and the element at index ``int(p * len(xs))`` is
    returned, so roughly a fraction ``p`` of the values lie below the result.

    Args:
        xs: Non-empty collection of values.
        p: Fraction in the closed interval [0, 1].

    Returns:
        The element of ``xs`` found at the computed position in sorted order.

    Raises:
        AssertionError: If ``xs`` is empty or ``p`` is outside [0, 1].
    """
    assert len(xs) >= 1, "[-] Quantile requires at least one element!"
    assert 0 <= p <= 1, "[-] P must be between 0 and 1!"

    # p == 1.0 would index one past the end, so clamp to the last element.
    p_index = min(int(p * len(xs)), len(xs) - 1)
    return sorted(xs)[p_index]

In [6]:
# Mode - most common values

def mode(xs: list[float]) -> list[float]:
    """Return every value of ``xs`` that occurs the maximum number of times.

    More than one value is returned when several are tied for most frequent.
    The values appear in the order they were first encountered in ``xs``.
    """
    tally = Counter(xs)
    highest = max(tally.values())

    most_common = []
    for value, occurrences in tally.items():
        if occurrences == highest:
            most_common.append(value)
    return most_common

## Variation

In [7]:
# Range - difference between maximum and minimum values

def data_range(xs: list[float]) -> float:
    """Return the spread of ``xs``: its largest value minus its smallest value."""
    largest = max(xs)
    smallest = min(xs)
    return largest - smallest

In [8]:
# De-mean - subtract the mean from every value, so the results are centred on zero

def de_mean(xs: list[float]) -> list[float]:
    """Return a new list holding each value of ``xs`` minus the mean of ``xs``."""
    center = mean(xs)
    shifted = []
    for value in xs:
        shifted.append(value - center)
    return shifted

In [9]:
# Variance - sum of squared deviations from the mean, divided by n - 1

def variance(xs: list[float]) -> float:
    """Return the sample variance of ``xs`` (n - 1 divisor).

    The deviations from the mean are passed to ``sum_of_squares`` and the
    result is divided by ``len(xs) - 1``. ``sum_of_squares`` is not defined in
    this notebook; presumably it comes from the linear algebra notebook that
    is loaded at the top - confirm.

    Raises:
        AssertionError: If ``xs`` has fewer than two elements.
    """
    count = len(xs)
    assert count >= 2, "[-] Variance requires at least two elements!"

    squared_total = sum_of_squares(de_mean(xs))
    return squared_total / (count - 1)

In [10]:
# Standard deviation - square root of the variance (same units as the data)

def standard_deviation(xs: list[float]) -> float:
    """Return the standard deviation of ``xs``: the square root of ``variance(xs)``."""
    spread = variance(xs)
    return math.sqrt(spread)

In [11]:
# Interquartile range - difference between the 75th and the 25th percentile values

def interquartile_range(xs: list[float]) -> float:
    """Return the 0.75 quantile of ``xs`` minus its 0.25 quantile."""
    upper_quartile = quantile(xs, 0.75)
    lower_quartile = quantile(xs, 0.25)
    return upper_quartile - lower_quartile

## Correlation

In [12]:
# Covariance - measures how two variables vary together around their respective means

def covariance(xs: list[float], ys: list[float]) -> float:
    """Return the sample covariance of ``xs`` and ``ys`` (n - 1 divisor).

    Both inputs are de-meaned, combined with ``dot`` and divided by
    ``len(xs) - 1``. ``dot`` is not defined in this notebook; presumably it
    comes from the linear algebra notebook that is loaded at the top - confirm.

    Args:
        xs: Observations of the first variable.
        ys: Observations of the second variable, paired with ``xs`` by position.

    Raises:
        AssertionError: If the inputs differ in length or hold fewer than
            two elements.
    """
    assert len(xs) == len(ys), "[-] XS and YS must have same number of elements!"
    # Same precondition as variance(): with fewer than two points the
    # n - 1 divisor is not positive.
    assert len(xs) >= 2, "[-] Covariance requires at least two elements!"
    return dot(de_mean(xs), de_mean(ys)) / (len(xs) - 1)

In [13]:
# Correlation - covariance divided by both standard deviations: a unitless measure of linear association

def correlation(xs: list[float], ys: list[float]) -> float:
    """Return the correlation of ``xs`` and ``ys``.

    The covariance is divided by the standard deviation of each input. When
    either input has zero standard deviation the quotient is undefined, and
    ``0.0`` is returned instead.

    Args:
        xs: Observations of the first variable.
        ys: Observations of the second variable, paired with ``xs`` by position.

    Raises:
        AssertionError: If the inputs differ in length, or (via
            ``standard_deviation``) if they hold fewer than two elements.
    """
    # Check up front: otherwise mismatched inputs are silently reported as
    # uncorrelated whenever one of them happens to be constant.
    assert len(xs) == len(ys), "[-] XS and YS must have same number of elements!"

    stdev_x = standard_deviation(xs)
    stdev_y = standard_deviation(ys)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(xs, ys) / stdev_x / stdev_y
    # No variation in at least one variable; return a float to honour the
    # declared return type.
    return 0.0