In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Gender Discrimination ##

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# Read GenderPromotion.csv into a Table; the bare name on the last line
# displays the table as the cell's output.
GenderPromotion = Table.read_table("GenderPromotion.csv")
GenderPromotion

In [None]:
# Group the rows by the distinct values in the 'Promoted' column.
GenderPromotion.group('Promoted')

In [None]:
# Group the rows by the distinct values in the 'Gender' column.
GenderPromotion.group('Gender')

In [None]:
# Cross-tabulate: one row per 'Gender' value, one column per 'Promoted' value.
pivot_table = GenderPromotion.pivot('Promoted', 'Gender')

# 'Percent' is the 'Yes' count divided by 24, i.e. a proportion rather than a
# value out of 100.
# NOTE(review): 24 is presumably the number of rows per gender -- confirm
# against the group('Gender') output.
summary = pivot_table.with_column(
    'Percent',
    pivot_table.column('Yes')/24,
)
summary

In [None]:
# Observed test statistic: the 'Percent' value in the second row of `summary`
# minus the value in the first row.
observed_percents = summary.column('Percent')
observed_difference = observed_percents.item(1) - observed_percents.item(0)
observed_difference

In [None]:
# Draw 35 'Gender' labels without replacement and mark every drawn row as
# promoted.
random_sample = (
    GenderPromotion
    .select('Gender')
    .sample(35, with_replacement=False)
    .with_column('Promoted', 'Yes')
)
random_sample

In [None]:
# Count the drawn rows per 'Gender' value, then divide each 'Yes' count by 24
# (the same divisor used for the observed summary).
pivot_table_sample = random_sample.pivot('Promoted', 'Gender')
summary_sample = pivot_table_sample.with_column(
    'Percent',
    pivot_table_sample.column('Yes')/24,
)
summary_sample

In [None]:
# Simulate one value of the test statistic 
# under the hypothesis that promotion is randomly determined

def random_sample_percent_difference(num_promoted=35, group_size=24):
    """Simulate one value of the test statistic under random promotion.

    Draws `num_promoted` rows without replacement from the 'Gender' column of
    the global `GenderPromotion` table, labels every drawn row 'Yes' in a new
    'Promoted' column, and counts the drawn rows per 'Gender' value.

    Parameters
    ----------
    num_promoted : int, optional
        Number of rows to draw. Defaults to 35, the value that was previously
        hard-coded.
    group_size : int, optional
        Divisor applied to each per-gender count. Defaults to 24, the value
        that was previously hard-coded (presumably the number of rows per
        gender in the data -- confirm).

    Returns
    -------
    float
        The proportion in the second row of the per-gender pivot minus the
        proportion in the first row.

    Notes
    -----
    The drawn sample must contain two distinct 'Gender' values; otherwise the
    pivot has a single row and `.item(1)` fails.
    """
    drawn_genders = GenderPromotion.select('Gender').sample(
        num_promoted, with_replacement=False)
    # Every drawn row counts as a promotion in the simulated world.
    promoted = drawn_genders.with_column('Promoted', 'Yes')
    counts_by_gender = promoted.pivot('Promoted', 'Gender')
    proportions = counts_by_gender.column('Yes') / group_size
    return proportions.item(1) - proportions.item(0)

In [None]:
# Simulate 10,000 copies of the test statistic

# Collect the simulated statistics in a list and convert to an array once.
# np.append copies the whole array on every call, so growing the array inside
# the loop does quadratic work for no benefit.
sample_difference = np.array(
    [random_sample_percent_difference() for _ in range(10000)],
    dtype=float,
)

In [None]:
# Compare the simulated distribution of the statistic
# and the actual observed statistic

# Put the simulated differences in a one-column table and draw a 10-bin histogram.
differences_tbl = Table().with_column('Random Sample Difference', sample_difference)
differences_tbl.hist(bins = 10)
# Mark the observed difference with a red dot at height 0 on the same axes;
# the trailing semicolon suppresses the returned object's repr.
plots.scatter(observed_difference, 0, color = 'red', s=40);

In [None]:
# Empirical p-value: the fraction of simulated differences at least as large
# as the observed one. Dividing by the array's own length keeps this correct
# if the number of simulations is changed.
np.count_nonzero(sample_difference >= observed_difference) / len(sample_difference)

In [None]:
# 5% of 10,000 = 500: in the ascending sort, positions 9500..9999 hold the
# 500 largest simulated differences, so position 9500 is the cutoff.

sorted_differences = differences_tbl.sort(0).column(0)
five_percent_point = sorted_differences.item(9500)
five_percent_point

In [None]:
differences_tbl.hist(bins = 10)
# Vertical gold line at the cutoff, drawn from height 0 to 5.
plots.plot([five_percent_point, five_percent_point], [0, 5], color='gold', lw=2)
# five_percent_point is read from position 9500 of the ascending sort of
# 10,000 values, so the 5% tail lies to the RIGHT of the line.
plots.title('Area to the right of the gold line: 5%');

### Conventions about inconsistency ###

**Please run all cells before this cell, including the import cell at the top of the notebook.**

In [None]:
# 1% of 10,000 = 100: in the ascending sort, positions 9900..9999 hold the
# 100 largest simulated differences, so position 9900 is the cutoff.

sorted_differences = differences_tbl.sort(0).column(0)
one_percent_point = sorted_differences.item(9900)
one_percent_point

In [None]:
differences_tbl.hist(bins = 10)
# Vertical gold line at the cutoff, drawn from height 0 to 5.
plots.plot([one_percent_point, one_percent_point], [0, 5], color='gold', lw=2)
# one_percent_point is read from position 9900 of the ascending sort of
# 10,000 values, so the 1% tail lies to the RIGHT of the line.
plots.title('Area to the right of the gold line: 1%');