In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

## Random Permutation (Shuffling)

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Small one-column table ('Letter') used to demonstrate sampling and shuffling.
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))

In [None]:
# Sample rows with the default settings. NOTE(review): per the datascience
# docs the default is with replacement, so letters may repeat -- confirm.
letters.sample()

In [None]:
# Sample the rows without replacement, i.e. a random permutation (shuffle)
# of the table's rows.
letters.sample(with_replacement = False)

In [None]:
# Show the original letters side by side with a shuffled copy of the same column.
letters.with_column('Shuffled', letters.sample(with_replacement = False).column(0))

## Simulation Under Null Hypothesis

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Load the data set from 'baby.csv'; the path is relative, so the file must be
# reachable from the notebook's working directory.
births = Table.read_table('baby.csv')

In [None]:
# Keep only the two columns compared in this section: the group label
# ('Maternal Smoker') and the numerical variable ('Birth Weight').
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')
# Bare last expression so the notebook displays the reduced table.
smoking_and_birthweight

In [None]:
# Shuffle the rows (sampling without replacement) and keep only the
# 'Maternal Smoker' column: the same labels in a random order.
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False).column('Maternal Smoker')

In [None]:
# Attach the shuffled labels as an extra column next to the original labels
# and birth weights.
original_and_shuffled = smoking_and_birthweight.with_column('Shuffled Label', shuffled_labels)
# Bare last expression so the notebook displays the combined table.
original_and_shuffled

In [None]:
def difference_of_means(table, label, group_label):
    """Takes: table, column label of numerical variable,
    column label of group-label variable
    Returns: mean of the numerical variable in the first group minus its mean
    in the second group, with groups taken in the order `group` lists them
    (the datascience docs describe this as sorted by group value, e.g. False
    before True)
    Raises: ValueError if the group-label column does not split the rows
    into exactly two groups"""

    # Drop every other column so that np.average is only applied to `label`.
    reduced = table.select(label, group_label)

    # One row per group: column 0 holds the group value, column 1 the group mean.
    means_table = reduced.group(group_label, np.average)
    means = means_table.column(1)

    # The statistic is only defined for two groups; fail loudly instead of
    # silently ignoring extra groups or raising an opaque IndexError.
    if len(means) != 2:
        raise ValueError(
            "difference_of_means needs exactly two groups in column %r, found %d"
            % (group_label, len(means))
        )

    return means.item(0) - means.item(1)

In [None]:
# Difference of mean birth weights when the rows are grouped by the shuffled labels.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

In [None]:
# The same statistic, this time grouped by the original 'Maternal Smoker' labels.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

## Permutation Test

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
def one_simulated_difference(table, label, group_label):
    """Takes: table, column label of numerical variable,
    column label of group-label variable
    Returns: Difference of means of the two groups after the group labels
    have been randomly shuffled across the rows"""

    # Permute the rows (sampling without replacement) and keep only the
    # group-label column: the original labels in a random order.
    permuted_rows = table.sample(with_replacement=False)
    permuted_groups = permuted_rows.column(group_label)

    # Pair the unshuffled numerical values with the permuted labels.
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', permuted_groups)

    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
# One simulated value of the statistic: shuffle the labels once and recompute
# the difference of group means.
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Number of label shuffles to simulate.
repetitions = 2500

# Collect all simulated statistics first and convert to an array once;
# np.append inside the loop would copy the whole array on every iteration.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in np.arange(repetitions)
])

In [None]:
# Test statistic on the real (unshuffled) data. Computed from `births`, the
# same table the simulations use, so this section does not rely on the demo
# table built in the previous section; the value is the same either way
# because only the 'Birth Weight' and 'Maternal Smoker' columns are read.
observed_difference = difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Report the statistic measured on the real data for comparison with the plot.
print('Observed Difference:', observed_difference)

# Histogram of the simulated differences: the empirical distribution of the
# statistic when labels are shuffled at random.
Table().with_column('Difference Between Group Means', differences).hist()
plots.title('Prediction Under the Null Hypothesis');