# Hypothesis testing
## Mini project II - Mount Saint Helens
Elements of Data Science

In [None]:
import numpy as np
from datascience import *

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

## Create a differences test statistic
We are looking for a trend, so we compare the number of increases with the number of decreases.

In [None]:
def diff_n(values, n):
    '''Return the differences between elements of values that are n positions apart.

    Entry i of the result is values[i + n] - values[i], so the result has
    len(values) - n entries (and is empty when n >= len(values)).

    values: a sequence or array of numbers.
    n: non-negative lag; a lag of 0 yields an array of zeros.
    Raises ValueError if n is negative.
    '''
    if n < 0:
        raise ValueError("n must be non-negative")
    series = np.asarray(values)
    # Use an explicit stop index: series[:-n] would be an empty slice when n == 0.
    stop = max(len(series) - n, 0)
    return series[n:] - series[:stop]

### Student Challenge
Explain in words what the `diff_n` function does.

In [None]:
# Test the function by making a small sample array of six values
sample_array = make_array(1.0, 1.5,1.4, 1.2, 2.0, 1.8)

In [None]:
# Count the differences between values two positions apart that are increases (> 0)
positive = np.count_nonzero(diff_n(sample_array, 2) > 0)
positive

In [None]:
# Count the differences between values two positions apart that are decreases (< 0)
negative = np.count_nonzero(diff_n(sample_array, 2) < 0)
negative

In [None]:
# Net change: number of increases minus number of decreases
positive - negative

In [None]:
def changes(rates, years = 1):
    '''Return the net count of increases over decreases in rates.

    Each value is compared with the one `years` positions earlier (via diff_n);
    the result is the number of positive differences minus the number of
    negative ones. Differences equal to zero count toward neither.
    '''
    deltas = diff_n(rates, years)
    rises, falls = (np.count_nonzero(mask) for mask in (deltas > 0, deltas < 0))
    return rises - falls

In [None]:
def sim_null(num=200):
    '''Draw num random "Increase"/"Decrease" outcomes and return increases minus decreases.'''
    outcomes = np.random.choice(["Increase", "Decrease"], num)
    # Tally how often each label was drawn.
    tally = {label: np.sum(outcomes == label) for label in ("Increase", "Decrease")}
    return tally["Increase"] - tally["Decrease"]

In [None]:
# Run one simulation with the default of 200 random changes
sim_null()

### Inference and biodiversity recovery

In [None]:
# Read the data from the CSV file into a Table and preview the first 2 rows
datafile = "../../../Mini Project II/data/MSH_STRUCTURE_PLOT_YEAR.csv"
MSH_YEAR = Table.read_table(datafile)
MSH_YEAR.show(2)

In [None]:
# Find the distinct plot names in the PLOT_NAME column
np.unique(MSH_YEAR['PLOT_NAME'])

In [None]:
# Scatter plot of COVER_% against YEAR for the Abysmal Plain (ABPL) rows
# NOTE(review): confirm the full name of the ABPL plot against the dataset documentation
MSH_YEAR.where('PLOT_NAME', 'ABPL').scatter("YEAR", "COVER_%")

In [None]:
# Average over all of the ABPL subplots by year, then preview the first 3 rows
MSH_group = MSH_YEAR.where('PLOT_NAME', 'ABPL').group("YEAR", np.mean)
MSH_group.show(3)

In [None]:
# Make a scatter plot of the yearly averages, restricting the x-axis to 1995-2010
MSH_group.scatter("YEAR", "COVER_% mean",  label='ABPL data Cover %', color='green')
plt.xlim(1995,2010);

In [None]:
# Net increases in percent cover: year-over-year increases minus decreases
COVER_APBL = MSH_group.column("COVER_% mean")
changes(COVER_APBL,1)

In [None]:
# Number of chances to change: one fewer than the number of yearly averages
len(COVER_APBL) - 1

In [None]:
# Create the table to sample from: 'Increase' and 'Decrease' are equally likely
uniform = Table().with_columns(
        "Change", make_array('Increase', 'Decrease'),
        "Chance", make_array(0.5,        0.5))
# Draw 13 outcomes from that distribution
# NOTE(review): 13 presumably equals len(COVER_APBL) - 1 — confirm against the data
uniform.sample_from_distribution('Chance', 13)  

In [None]:
def simulate_under_null(num_chances_to_change):
    '''Simulate one test statistic when increases and decreases are equally likely.

    Samples num_chances_to_change outcomes from a 50/50 Increase/Decrease
    table and returns the sampled "Increase" entry minus the "Decrease" entry.
    '''
    fair_model = Table().with_columns(
        "Change", make_array('Increase', 'Decrease'),
        "Chance", make_array(0.5,        0.5))
    drawn = fair_model.sample_from_distribution('Chance', num_chances_to_change)
    # Row 0 is 'Increase' and row 1 is 'Decrease', matching the table above.
    counts = drawn.column("Chance sample")
    return counts.item(0) - counts.item(1)

In [None]:
def empirical_distribution(tbl, iterations):
    '''Simulate the test statistic under the null hypothesis and plot its histogram.

    tbl: a table with one row per observation; it offers tbl.num_rows - 1
        chances to change.
    iterations: how many simulated statistics to draw.
    Returns a float array with one simulated statistic per iteration. With
    zero iterations the empty array is returned and nothing is plotted.
    '''
    num_changes = tbl.num_rows - 1
    # Build the array in one step; appending inside the loop re-copies it every pass.
    samples = np.array(
        [simulate_under_null(num_changes) for _ in np.arange(iterations)],
        dtype=float)
    if samples.size == 0:
        return samples
    lowest, highest = samples.min(), samples.max()
    # Width-2 bins starting at the smallest value. The stop lies just past
    # highest + 2 so the largest value always falls inside the last bin,
    # even when every sample is zero or negative.
    bin_edges = np.arange(lowest, highest + 3, 2)
    Table().with_column('Test statistic under null', samples).hist(bins=bin_edges)
    return samples

In [None]:
# Average the ABPL rows by year and preview the first 3 rows
APBL_group = MSH_YEAR.where('PLOT_NAME','ABPL').group("YEAR",np.mean)
APBL_group.show(3)

In [None]:
# Net number of year-over-year increases in mean percent cover
changes(APBL_group.column('COVER_% mean'))

In [None]:
# Scatter plot of the yearly mean percent cover
APBL_group.scatter('YEAR', 'COVER_% mean')

In [None]:
# Simulate 10,000 test statistics under the null; this also draws their histogram
samples = empirical_distribution(APBL_group,10000)
# Mark x = 5 on the horizontal axis
# NOTE(review): 5 is presumably the observed test statistic — confirm against the next cell
plt.scatter(5,0);

In [None]:
# Observed test statistic: net number of year-over-year increases in mean percent cover
tstatistic = changes(APBL_group.column('COVER_% mean'))
tstatistic

In [None]:
# Empirical one-sided p-value: the fraction of simulated statistics
# that are at least as large as the observed test statistic
pvalue = np.count_nonzero(samples >= tstatistic) / len(samples)
pvalue

## More detailed plot

In [None]:
# Histogram of the simulated null distribution, in width-2 bins with edges from -20 to 18
plt.hist(
    samples,
    bins=np.arange(-20, 20, 2),
    color='yellow',
    edgecolor='black',
    linewidth=1.2,
    alpha=0.4,
    label='null distribution',
)
# Observed test statistic, drawn as a large red dot on the x-axis
plt.scatter(
    tstatistic, 0,
    marker='o', s=300,
    c='red', edgecolors='blue', alpha=0.8,
    label='test statistic',
)
plt.title('Cover_% Changes')
# The legend is anchored to the right of the axes
plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5), labelspacing=3)
# The p-value text is placed at data coordinates (25, 500)
plt.text(25, 500, 'p-value = ' + str(pvalue), color="black")
plt.show()

In [None]:
# Create bins: width-2 bins with edges from -20 to 18
bins = np.arange(-20, 20, 2)

# Split the simulated statistics at the observed test statistic.
# NOTE(review): ties (samples == tstatistic) go to the left group here, while
# pvalue counts samples >= tstatistic — confirm which side ties should be on.
left_data = samples[samples <= tstatistic]
right_data = samples[samples > tstatistic]

# Plot the left part of the histogram (values <= test statistic)
plt.hist(left_data, bins=bins, color='yellow', alpha=0.4, edgecolor='black', linewidth=1.2,
         label='Null Hypothesis Distribution (<= test statistic)')

# Plot the right part of the histogram (values > test statistic)
plt.hist(right_data, bins=bins, color='lightblue', alpha=0.4, edgecolor='black', linewidth=1.2,
         label='Null Hypothesis Distribution (> test statistic)')

# Add the scatter point for the test statistic
plt.scatter(tstatistic, 0, s=300, label='test statistic', marker='o', 
            c='red', alpha=0.8, edgecolors='blue')

# Add legend, title, and p-value text.
# Each artist carries its own label, so the legend pairs them correctly.
# Passing a bare list of labels matches them to artists in creation order,
# which put 'test statistic' on a histogram bar instead of the marker.
plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5), labelspacing=3)
plt.title('Cover_% Changes')
plt.text(25, 500, 'p-value = '+str(pvalue), color="black")

plt.show()

## Student Challenge

Is the result statistically significant?

Does this surprise you?

What is your explanation?

