In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')

## Gender Discrimination ##

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Read the promotion data from a CSV file (path is relative to the notebook's
# working directory) and display it. Later cells use its 'Gender' and
# 'Promoted' columns.
GenderPromotion = Table.read_table("GenderPromotion.csv")
GenderPromotion

In [None]:
# Count the rows for each distinct value of 'Promoted'.
GenderPromotion.group('Promoted')

In [None]:
# Count the rows for each distinct value of 'Gender' (the size of each group).
GenderPromotion.group('Gender')

In [None]:
# Cross-tabulate the data: one row per 'Gender' value and one count column per
# 'Promoted' value.
pivot_table = GenderPromotion.pivot('Promoted', 'Gender')

# The size of each gender group is the sum of that row's count columns.
# Deriving it from the table (rather than hard-coding 24) keeps the
# proportions correct if the data file ever changes.
group_sizes = np.sum(
    [pivot_table.column(label) for label in pivot_table.labels if label != 'Gender'],
    axis=0,
)

# 'Percent' holds the fraction (between 0 and 1) of each gender group that was
# promoted.
summary = pivot_table.with_column('Percent', pivot_table.column('Yes') / group_sizes)
summary

In [None]:
# Observed test statistic: the 'Percent' value in the second row of `summary`
# minus the 'Percent' value in the first row.
# NOTE(review): presumably this is male minus female, if the pivot rows are
# ordered alphabetically by 'Gender' -- confirm against the displayed table.
observed_difference = summary.column('Percent').item(1) - summary.column('Percent').item(0)
observed_difference

In [None]:
# Draw every row once, in random order (sampling without replacement), and keep
# only the 'Promoted' column: the original labels, shuffled.
shuffled_labels = GenderPromotion.sample(with_replacement = False).column('Promoted')
shuffled_labels

In [None]:
# Keep the original 'Gender' column and attach the shuffled 'Promoted' labels,
# so each gender is paired with a randomly assigned promotion outcome.
shuffled_table = GenderPromotion.select('Gender').with_column('Promoted', shuffled_labels)
shuffled_table

In [None]:
# Cross-tabulate the shuffled data: one row per 'Gender' value and one count
# column per 'Promoted' value.
shuffled_pivot_table = shuffled_table.pivot('Promoted', 'Gender')

# The size of each gender group is the sum of that row's count columns.
# Deriving it from the table (rather than hard-coding 24) keeps the
# proportions correct if the data file ever changes.
shuffled_group_sizes = np.sum(
    [
        shuffled_pivot_table.column(label)
        for label in shuffled_pivot_table.labels
        if label != 'Gender'
    ],
    axis=0,
)

# 'Percent' holds the fraction (between 0 and 1) of each gender group that was
# "promoted" under the shuffled labels.
shuffled_summary = shuffled_pivot_table.with_column(
    'Percent', shuffled_pivot_table.column('Yes') / shuffled_group_sizes
)
shuffled_summary

In [None]:
# Test statistic for this single shuffle: second row of 'Percent' minus the
# first row, the same row order used for the observed difference.
shuffled_summary.column('Percent').item(1) - shuffled_summary.column('Percent').item(0)

In [None]:
# Simulate one value of the test statistic 
# under the hypothesis that promotion is randomly determined

def random_sample_percent_difference():
    """Simulate one value of the test statistic under random promotion.

    Shuffles the 'Promoted' labels of ``GenderPromotion`` across its rows,
    re-tabulates the promotion counts by 'Gender', and computes the proportion
    promoted within each gender group.

    Returns:
        The promoted proportion in the second row of the pivot table minus the
        promoted proportion in the first row, as a float.
    """
    # Sampling every row without replacement yields the labels in random order.
    shuffled_labels = GenderPromotion.sample(with_replacement = False).column('Promoted')
    shuffled_table = GenderPromotion.select('Gender').with_column('Promoted', shuffled_labels)
    shuffled_pivot_table = shuffled_table.pivot('Promoted', 'Gender')

    # Each row's count columns add up to that gender group's size. Using the
    # real size instead of a hard-coded 24 keeps the statistic valid for any
    # group sizes in the data.
    count_labels = [label for label in shuffled_pivot_table.labels if label != 'Gender']
    group_sizes = np.sum(
        [shuffled_pivot_table.column(label) for label in count_labels], axis=0
    )

    promoted_proportions = shuffled_pivot_table.column('Yes') / group_sizes
    return promoted_proportions.item(1) - promoted_proportions.item(0)

In [None]:
# Simulate 10,000 copies of the test statistic
# this code might take a few seconds to run

# Preallocate the result array and fill it in place. Growing it with np.append
# would copy the entire array on every iteration.
num_simulations = 10000
sample_difference = np.empty(num_simulations)

for i in np.arange(num_simulations):
    sample_difference[i] = random_sample_percent_difference()

In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic

# Put the simulated differences in a one-column table so they can be drawn as
# a histogram.
differences_tbl = Table().with_column('Random Sample Difference', sample_difference)
differences_tbl.hist(bins = 10)
# Mark the observed difference with a red dot on the horizontal axis (y = 0).
plots.scatter(observed_difference, 0, color = 'red', s=40);

In [None]:
# Empirical p-value: the fraction of simulated differences that are at least as
# large as the observed one. Taking the mean of the boolean array divides by
# the actual number of simulations instead of a hard-coded 10000.
np.mean(sample_difference >= observed_difference)

In [None]:
# 5% of 10,000 = 500, so index 9500 of the ascending-sorted differences is the
# cutoff with the 500 largest simulated values (the top 5%) at or beyond it.
# This index assumes the table holds exactly 10,000 simulated differences.

five_percent_point = differences_tbl.sort(0).column(0).item(9500)
five_percent_point

In [None]:
# Redraw the histogram with a gold vertical line at the 5% cutoff.
differences_tbl.hist(bins = 10)
plots.plot([five_percent_point, five_percent_point], [0, 3], color='gold', lw=2)
# The cutoff is taken from index 9500 of the ascending sort, so the 5% tail
# lies to the RIGHT of the line; roughly 95% of the area is to its left.
plots.title('Area to the right of the gold line: 5%');

### Conventions about inconsistency ###

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# 1% of 10,000 = 100, so index 9900 of the ascending-sorted differences is the
# cutoff with the 100 largest simulated values (the top 1%) at or beyond it.
# This index assumes the table holds exactly 10,000 simulated differences.

one_percent_point = differences_tbl.sort(0).column(0).item(9900)
one_percent_point

In [None]:
# Redraw the histogram with a gold vertical line at the 1% cutoff.
differences_tbl.hist(bins = 10)
plots.plot([one_percent_point, one_percent_point], [0, 5], color='gold', lw=2)
# The cutoff is taken from index 9900 of the ascending sort, so the 1% tail
# lies to the RIGHT of the line; roughly 99% of the area is to its left.
plots.title('Area to the right of the gold line: 1%');