In [None]:
## import statements
# These lines load the tests. 
from gofer.ok import check
import numpy as np
from datascience import *
import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# Fix for datascience plots
import collections as collections
import collections.abc as abc
collections.Iterable = abc.Iterable

### Normal Distribution

$$
\phi(z) = {\frac{1}{\sqrt{2 \pi}}} e^{-\frac{1}{2}z^2}, ~~ -\infty < z < \infty
$$

In [None]:
# The standard normal curve (the density phi(z) defined above), drawn with no bounds passed

plot_normal_cdf()

In [None]:
# Standard normal curve with a right bound of z = 1 (the area to the left of 1)
plot_normal_cdf(1)

In [None]:
from scipy import stats

In [None]:
# P(Z <= 1) for a standard normal Z: the area to the left of z = 1
stats.norm.cdf(1)

In [None]:
# Standard normal curve with only a left bound of z = 1 (the area to the right of 1)
plot_normal_cdf(lbound=1)

In [None]:
# Right-tail area P(Z > 1), computed as the complement of the left-tail area
1 - stats.norm.cdf(1)

In [None]:
# Area under the standard normal curve, between -1 and 1
# (right bound passed first, left bound passed as lbound)

plot_normal_cdf(1, lbound=-1)

In [None]:
# P(-1 < Z < 1): left-tail area at 1 minus left-tail area at -1
stats.norm.cdf(1) - stats.norm.cdf(-1)

In [None]:
# Area under the standard normal curve, between -2 and 2
# (right bound passed first, left bound passed as lbound)

plot_normal_cdf(2, lbound=-2)

In [None]:
# P(-2 < Z < 2): left-tail area at 2 minus left-tail area at -2
stats.norm.cdf(2) - stats.norm.cdf(-2)

### An example of standardizing data

In [None]:
# Read the heart dataset and keep only the two columns used in this section.
heart = Table.read_table("data/heart.csv").select("age", "chol")
heart.show(3)  # preview the first three rows

In [None]:
# Histogram of the age and chol columns in their original units, using 20 bins
heart.hist("age", "chol", bins=20)

***
Standard Units
***
$\bar{x}$ = mean$(x)$<br>
<br>variance = mean$((x-\bar{x})^2)$<br>
<br>SD = $\sqrt{\text{mean}((x-\bar{x})^2)}$<br>
<br>Z = $\frac{x-\bar{x}}{\text{SD}}$

In [None]:
def standardize(tbl, col):
    """Add a standard-units version of column ``col`` to ``tbl``.

    The values of ``col`` are converted to z-scores, ``(x - mean) / SD``,
    using ``np.mean`` and ``np.std`` of the column. The z-scores are stored
    under the label ``col + "_z"``.

    Returns whatever ``tbl.with_column`` returns for the new column.
    """
    values = tbl.column(col)
    center, spread = np.mean(values), np.std(values)
    z_scores = (values - center) / spread
    return tbl.with_column(col + "_z", z_scores)

In [None]:
# Add standard-unit versions of both columns (age first, then chol) and preview.
heart = standardize(standardize(heart, "age"), "chol")
heart.show(3)

In [None]:
# Histogram of the two standardized columns, using 20 bins
heart.select("age_z", "chol_z").hist(bins=20)

In [None]:
# Same two columns with explicit half-unit bin edges from -5 up to (not including) 7,
# and overlay turned off
heart.select("age_z", "chol_z").hist(bins=np.arange(-5, 7, 0.5), overlay=False)

### Q-Q Plots -- Testing whether or not a distribution is Normal

In [None]:
import statsmodels.api as sm
import seaborn as sns
from scipy.stats import norm

In [None]:
# Standardized ages: density histogram with a fitted normal pdf drawn on top.
data = heart.column('age_z')
fig, ax = plt.subplots()
sns.histplot(data, ax=ax, bins=30, stat='density', kde=False)
# Evaluate the normal pdf (parameters from norm.fit) at 100 points across the data range.
xx = np.linspace(data.min(), data.max(), 100)
ax.plot(xx, norm.pdf(xx, *norm.fit(data)), color="k", linewidth=2.5)
ax.set_xlim(-3, 3)

In [None]:
# Q-Q plot of the standardized ages; line='45' adds the 45-degree reference line
fig = sm.qqplot(heart.column('age_z'), line='45')

In [None]:
# Standardized cholesterol: density histogram with a fitted normal pdf drawn on top.
data = heart.column('chol_z')
fig, ax = plt.subplots()
sns.histplot(data, ax=ax, stat='density', kde=False)
# Evaluate the normal pdf (parameters from norm.fit) at 100 points across the data range.
xx = np.linspace(data.min(), data.max(), 100)
ax.plot(xx, norm.pdf(xx, *norm.fit(data)), color="k", linewidth=2.5)

In [None]:
# Q-Q plot of the standardized cholesterol values; line='45' adds the 45-degree reference line
fig = sm.qqplot(heart.column('chol_z'), line='45')

### Central Limit Theorem in Action

In [None]:
population_size = 1000000
# Population of uniform draws: the population itself is far from bell-shaped.
population = np.random.rand(population_size)
number_of_samples = 10

# Watch what happens as we grow the sample size.
sample_size = 50

# One slot per sample; every slot is overwritten in the loop below.
sample_means = np.empty(number_of_samples)
for i in range(number_of_samples):
    # randint's upper bound is exclusive, so (0, population_size) can pick
    # every index of the population, including index 0.
    sample_indices = np.random.randint(0, population_size, sample_size)
    sample_means[i] = population[sample_indices].mean()

In [None]:
# Left panel: density histogram of the sample means.
# Right panel: Q-Q plot of the standardized sample means.
fig, (ax_hist, ax) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(sample_means, bins=36, kde=True, stat='density', ax=ax_hist)
ax_hist.set_title('Histogram of Sample mean', fontsize=20)
ax_hist.set_xlabel('Sample mean', fontsize=20)
# stat='density' puts density, not counts, on the y-axis.
ax_hist.set_ylabel('Density', fontsize=20)
ax_hist.tick_params(labelsize=14)

# Standardize so the points can be compared with the 45-degree reference line.
sample_means_z = (sample_means - np.mean(sample_means)) / np.std(sample_means)
sm.qqplot(sample_means_z, line='45', ax=ax)
ax.set_title('Q-Q Plot', fontsize=20)
# A Q-Q plot has theoretical quantiles on x and sample quantiles on y.
ax.set_xlabel('Theoretical quantiles', fontsize=20)
ax.set_ylabel('Sample quantiles', fontsize=20)
ax.tick_params(labelsize=14)

fig.tight_layout()

### Mt. Saint Helens Tips

In [None]:
# Relative path to the CSV; it resolves against the notebook's working directory
datafile = "../Mini-II/data/MSH_STRUCTURE_PLOT_YEAR.csv"
MSH_YEAR = Table.read_table(datafile)
# Display the loaded table
MSH_YEAR

In [None]:
# RICHNESS against YEAR for the rows whose PLOT_NAME is 'STRD'
MSH_YEAR.where('PLOT_NAME','STRD').scatter("YEAR","RICHNESS")

In [None]:
# Group the 'STRD' rows by YEAR, aggregating the other columns with np.mean
MSH_YEAR.where('PLOT_NAME','STRD').group("YEAR",np.mean)

In [None]:
# Keep the per-YEAR means of the 'STRD' rows; this table feeds the hypothesis test below
strd_year = MSH_YEAR.where('PLOT_NAME','STRD').group("YEAR",np.mean)
strd_year.show(3)

In [None]:
# Yearly mean richness; the grouped column carries the label "RICHNESS mean"
strd_year.scatter("YEAR", "RICHNESS mean")

In [None]:
# Just like the lab on climate change
def simulate_under_null(num_chances_to_change):
    """Simulate one value of the test statistic under the null hypothesis.

    Each of the ``num_chances_to_change`` chances is modelled as equally
    likely (0.5 / 0.5) to be an 'Increase' or a 'Decrease'.

    Returns the simulated number of increases minus the number of decreases.
    """
    null_model = Table().with_columns(
        "Change", make_array('Increase', 'Decrease'),
        "Chance", make_array(0.5, 0.5),
    )
    counts = null_model.sample_from_distribution(
        'Chance', num_chances_to_change
    ).column("Chance sample")
    # Row 0 is 'Increase' and row 1 is 'Decrease', in the order built above.
    return counts.item(0) - counts.item(1)

def empirical_distribution(tbl, iterations, years=1):
    """Simulate the test statistic under the null hypothesis and plot its histogram.

    Parameters
    ----------
    tbl : table exposing ``num_rows``, with one row per observation.
    iterations : number of simulated statistics to generate.
    years : lag between the observations being compared (default 1).
        ``tbl.num_rows`` observations yield ``tbl.num_rows - years`` changes,
        so that is the number of chances passed to ``simulate_under_null``.
        This matches the number of differences ``changes`` computes on the
        same data.

    Returns
    -------
    NumPy float array of length ``iterations`` with the simulated statistics.
    The histogram is skipped when there are no simulated values.
    """
    # N observations only allow N - years changes; never go below zero.
    num_changes = max(tbl.num_rows - years, 0)

    # Build the array in one pass instead of re-allocating it on every np.append.
    samples = np.array(
        [simulate_under_null(num_changes) for _ in np.arange(iterations)],
        dtype=float,
    )

    if samples.size > 0:
        # Every simulated value has the same parity, so values sit 2 apart.
        # Edges from min to max + 2 give each value its own width-2 bin.
        bins = np.arange(samples.min(), samples.max() + 4, 2)
        Table().with_column('Test statistic under null', samples).hist(bins=bins)
    return samples

In [None]:
# 10,000 simulated test statistics under the null, sized from strd_year; also draws their histogram
samples = empirical_distribution(strd_year,10000)

In [None]:
def diff_n(values, n):
    """Return the lag-``n`` differences ``values[i + n] - values[i]``.

    Parameters
    ----------
    values : sequence or array of numbers.
    n : non-negative integer lag.

    Returns
    -------
    NumPy array with ``max(len(values) - n, 0)`` entries. A lag of 0 gives an
    array of zeros, and a lag of at least ``len(values)`` gives an empty array.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    arr = np.array(values)
    # Use an explicit stop index: ``arr[:-n]`` would be empty for n == 0.
    stop = max(len(arr) - n, 0)
    return arr[n:] - arr[:stop]

def changes(rates, years = 1):
    """Return the number of increases minus the number of decreases in ``rates``.

    Each value is compared with the value ``years`` positions earlier
    (default 1, i.e. consecutive values). Differences equal to zero count as
    neither an increase nor a decrease.
    """
    deltas = diff_n(rates, years)
    n_up, n_down = (np.count_nonzero(mask) for mask in (deltas > 0, deltas < 0))
    return n_up - n_down

In [None]:
# Number of rows in strd_year (one row per YEAR after grouping)
num_obs = strd_year.num_rows
num_obs

In [None]:
# Observed test statistic: increases minus decreases between consecutive
# values of 'RICHNESS mean' (changes uses its default lag of 1)
num_pos_changes = changes(strd_year.column('RICHNESS mean'))
num_pos_changes

In [None]:
# Calculate p-value: the proportion of simulated statistics that are at least
# as large as the observed one (">=" so ties with the observation are counted).
np.count_nonzero(samples >= num_pos_changes) / len(samples)

In [None]:
# Restrict the yearly means to rows whose YEAR is below 2001, then plot them
before2001 = strd_year.where('YEAR', are.below(2001))
before2001.scatter("YEAR", "RICHNESS mean")

In [None]:
# Number of rows left in the before-2001 subset (rebinds num_obs)
num_obs = before2001.num_rows
num_obs

In [None]:
# Observed test statistic for the before-2001 subset: increases minus decreases
# between consecutive values of 'RICHNESS mean' (rebinds num_pos_changes)
num_pos_changes = changes(before2001.column('RICHNESS mean'))
num_pos_changes

In [None]:
# 10,000 simulated test statistics under the null, sized from before2001 (rebinds samples)
samples = empirical_distribution(before2001,10000)

In [None]:
# Calculate p-value: the proportion of simulated statistics that are at least
# as large as the observed one (">=" so ties with the observation are counted).
np.count_nonzero(samples >= num_pos_changes) / len(samples)