# Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Uniform Distribution
It assigns the same probability (constant density) to every value across the entire range of its domain.

In [None]:
# Draw 10000 values from NumPy's uniform distribution (default bounds).

u = np.random.uniform(size=10000)

# Histogram of the draws; the bars should be roughly level for uniform data.
# The trailing semicolon keeps the notebook from echoing the return value.
fig, ax = plt.subplots()
ax.hist(u, edgecolor="k")
ax.set_ylabel('Count');

Real-world examples:

* Model hyperparameters

---

## Normal distribution

Empirical rule: 68 - 95 - 99.7

Mean = 0; Stdev = 1

In [None]:
# Draw 10000 values from NumPy's normal distribution with its default
# parameters (the notebook text above states mean 0, standard deviation 1).
normal_dist = np.random.normal(size=10000)

# Histogram with 100 bins so the bell shape is visible.
fig, ax = plt.subplots()
ax.hist(normal_dist, edgecolor="k", bins = 100)
ax.set_ylabel('Count');

Real-world examples:

* Data imputation

---

## Central Limit Theorem

In [None]:
# Randomly draw 10 of the 10000 normally distributed values, without
# replacement (no value can be picked twice).

normal_sample = np.random.choice(normal_dist, size=10, replace=False)
# Bare expression as the last line of the cell: display the sampled values.
normal_sample

### The mean of a sample isn't always going to be close to zero with such a small sample:

In [None]:
# Mean of the 10-value sample drawn above.
np.mean(normal_sample)

### Let's define a function for generating **sampling distributions** of the mean of a given input distribution:

#### (distribution of sample means)

In [None]:
# Gets random samples from a distribution and returns the means of those samples

def sample_mean_calculator(input_dist, sample_size, n_samples):
    """Return the means of repeated random samples drawn from ``input_dist``.

    Each sample is drawn independently with ``np.random.choice`` *without*
    replacement, so ``sample_size`` must not exceed the number of values in
    ``input_dist`` (NumPy raises ``ValueError`` otherwise).

    Parameters
    ----------
    input_dist : 1-D array-like
        Population of values to sample from.
    sample_size : int
        Number of values drawn for each sample.
    n_samples : int
        Number of samples to draw, i.e. the number of means returned.

    Returns
    -------
    list
        One sample mean per sample, in the order the samples were drawn.
    """
    # The loop index is never needed; only the number of repetitions matters.
    return [
        np.random.choice(input_dist, size=sample_size, replace=False).mean()
        for _ in range(n_samples)
    ]

In [None]:
# Sampling distribution built from only 10 samples of 10 values each
# (arguments are: population, sample_size, n_samples).
fig, ax = plt.subplots()
ax.hist(sample_mean_calculator(normal_dist, 10, 10), color='gold', edgecolor="k")
# Fixed x-range so this plot can be compared with the ones that follow.
ax.set_xlim(-1.5, 1.5);

### The more samples we take, the more clearly the sampling distribution of the means takes on its normal (bell) shape:

In [None]:
# Same sample size (10) as before, but now 1000 samples instead of 10.
fig, ax = plt.subplots()
ax.hist(sample_mean_calculator(normal_dist, 10, 1000), color='gold', edgecolor="k")
# Fixed x-range so this plot can be compared with the neighbouring ones.
ax.set_xlim(-1.5, 1.5);

### The larger the sample, the tighter the sample means will tend to be around the population mean:

In [None]:
# 1000 samples again, but each sample now holds 100 values instead of 10.
fig, ax = plt.subplots()
ax.hist(sample_mean_calculator(normal_dist, 100, 1000), color='gold', edgecolor="k")
# Fixed x-range so the narrowing relative to the previous plot is visible.
ax.set_xlim(-1.5, 1.5);

## Sampling from a skewed distribution

In [None]:
import scipy.stats as st

# 10000 draws from SciPy's skew-normal distribution; the first positional
# argument (10) is skewnorm's shape parameter, which controls the skew.
s = st.skewnorm.rvs(10, size=10000)

# Histogram with a kernel density estimate overlaid.
sns.displot(s, kde=True);

### Sampling distribution

In [None]:
# Means of 1000 samples of 10 values each, drawn from the skewed values `s`.
sns.displot(sample_mean_calculator(s, 10, 1000), color='gold', kde=True);

## Sampling from a multimodal distribution

In [None]:
# Two-peaked population: 5000 draws from the default normal joined with
# 5000 draws from a normal whose centre is shifted to loc=4.0.
m = np.concatenate((np.random.normal(size=5000), np.random.normal(loc = 4.0, size=5000)))

# Histogram with a kernel density estimate overlaid.
sns.displot(m, kde=True);

### Sampling distribution

In [None]:
# Means of 1000 samples of 1000 values each, drawn from the two-peaked values `m`.
sns.displot(sample_mean_calculator(m, 1000, 1000), color='gold', kde=True);

## Sampling from uniform

In [None]:
# Re-plot the uniform draws `u` created earlier in the notebook.
sns.displot(u);

### Even when sampling from the highly non-normal uniform distribution, the sampling distribution of the mean comes out approximately normal:

In [None]:
# Means of 1000 samples of 1000 values each, drawn from the uniform values `u`.
sns.displot(sample_mean_calculator(u, 1000, 1000), color='gold', kde=True);

Therefore, with large enough sample sizes, we can assume that the sampling distribution of the means will be normally distributed, allowing us to apply statistical and ML models that are configured for normally distributed data, which is often the default assumption.

---

## Binomial Distribution
For discrete variables (PMF). 

Its parameters are: 

* *n*: number of trials
* *p*: probability of outcome of 1
* *size*: number of experiments with *n* trials each

In [None]:
# number of coin flips (trials) in each experiment
n = 5

# number of experiments, each consisting of n flips
n_experiments = 1000

#### Fair coin (p = 0.5)

In [None]:
# Simulate n_experiments experiments of n fair-coin flips each; every entry
# is the number of heads seen in one experiment.
heads_count = np.random.binomial(n, 0.5, n_experiments)

# Tally over the full support 0..n instead of only the values that happened
# to occur, so an outcome that was never observed still appears with count 0
# and position i of every array always means "i heads".
heads = np.arange(n + 1)
event_count = np.bincount(heads_count, minlength=n + 1)
# Relative frequency of each heads count across all experiments.
event_proba = event_count/n_experiments

In [None]:
# Display the raw simulation result: one heads count per experiment.
heads_count

### Frequency distribution

In [None]:
# Display the heads-count values alongside how many experiments produced each.
heads, event_count

### Probability distribution

In [None]:
# Display the fraction of experiments that produced each heads count.
event_proba

### Visualize distribution

In [None]:
# Bar chart of the empirical probability of each heads count.
plt.bar(heads, event_proba, color='#FA166C')
# Build the label from n so it stays correct if the number of flips changes.
plt.xlabel(f'Heads flips (out of {n} tosses)')
plt.ylabel('Event probability');

#### Weighted coin (p = 0.8)

In [None]:
# Simulate n_experiments experiments of n flips of a weighted coin
# (probability of heads 0.8); every entry is the heads count of one experiment.
heads_count = np.random.binomial(n, 0.8, n_experiments)

# Tally over the full support 0..n instead of only the values that happened
# to occur: with a weighted coin the low heads counts are rare and may never
# be drawn, and they should then show up with count 0 rather than vanish.
heads = np.arange(n + 1)
event_count = np.bincount(heads_count, minlength=n + 1)
# Relative frequency of each heads count across all experiments.
event_proba = event_count/n_experiments

### Frequency distribution

In [None]:
# Display the heads-count values alongside how many experiments produced each.
heads, event_count

### Probability distribution

In [None]:
# Display the fraction of experiments that produced each heads count.
event_proba

### Visualize distribution

In [None]:
# Bar chart of the empirical probability of each heads count.
plt.bar(heads, event_proba, color='#FA166C')
# Build the label from n so it stays correct if the number of flips changes.
plt.xlabel(f'Heads flips (out of {n} tosses)')
plt.ylabel('Event probability');

Real-world examples:

* Arriving at work on time
* Contracting a virus
* Purchasing a product

---

## Multinomial Distribution
Generalization of the binomial distribution to discrete random variables with more than two possible outcomes, e.g., the roll of a die: 

In [None]:
# number of rolls of a die
# NOTE: this rebinds `n`, which the binomial cells above use for the number of coin flips.
n = 1000

# One multinomial draw: how many of the n rolls landed on each of six equally likely faces.
rolls = np.random.multinomial(n, [1/6, 1/6, 1/6, 1/6, 1/6, 1/6]) # the probability list can also be written as [1/6]*6

### Frequency distribution

In [None]:
# Counts for rolling a [1, 2, 3, 4, 5, 6], in that order.

rolls

### Probability distribution

In [None]:
# Turn the per-face counts into relative frequencies by dividing by the number of rolls.
event_proba = rolls/n
# Display the resulting frequencies.
event_proba

### Visualize distribution

In [None]:
# Bar chart of the empirical probability of each face; range(1, 7) labels the faces 1..6.
plt.bar(range(1, 7), event_proba, color='#FA166C')
plt.xlabel('Die roll')
plt.ylabel('Event probability');

---