# Case study: Gender discrimination

## Setup

In [None]:
import pandas as pd
import altair as alt

## Import data

In [None]:
# Read the case-study data set from its raw GitHub URL into a DataFrame.
DATA_URL = (
    "https://raw.githubusercontent.com/kirenz/datasets/master/"
    "gender_discrimination.csv"
)
df = pd.read_csv(DATA_URL)

In [None]:
# Take a look at the data

## Observed data

### Crosstable

In [None]:
pd.___(___, ___,  margins=True)

### Calculate difference

In [None]:
df_male = df[(df["gender"] == "___")]
df_female = df[(df["___"] == "female")]

In [None]:
df_female

In [None]:
# Calculate length of dataframe (number of people)
male_total = ___(df_male)
female_total = len(___)


In [None]:

male_promoted = df_male['___'].value_counts().promoted
female_promoted = df_female['decision'].___().promoted


In [None]:

male_p = round(male_promoted/___, 3)
female_p = round(female_promoted/___, 3)


In [None]:

p_diff_ob = round(male_p - ___, 3)


In [None]:

df_p_diff_ob = pd.DataFrame({'p_diff': [___] })
df_p_diff_ob

## Randomization

### Crosstable

Example with only one randomization:

In [None]:
# Shuffle the gender labels with a fixed seed, then renumber the shuffled
# values from 0 so they are assigned to the rows by position.
shuffled_gender = df['gender'].sample(frac=1, random_state=123)
df['gender_random'] = shuffled_gender.reset_index(drop=True)

Since the randomization of files in this simulation is independent of the promotion decisions, any difference in promotion rates is due to chance.

In [None]:
pd.crosstab(df.gender_random, ___,  margins=True)

### Calculate difference

#### One randomization

In [None]:
# Split the records by the shuffled gender label.
is_male = df["gender_random"] == "male"
is_female = df["gender_random"] == "female"
df_male = df[is_male]
df_female = df[is_female]

# Group sizes (number of rows per group).
male_total = df_male.shape[0]
female_total = df_female.shape[0]

# Number of promoted candidates in each group.
male_promoted = df_male[df_male['decision'] == "promoted"].shape[0]
female_promoted = df_female[df_female['decision'] == "promoted"].shape[0]

# Promotion rate per group, rounded to three decimals.
male_p = round(male_promoted / male_total, 3)
female_p = round(female_promoted / female_total, 3)

# Difference in promotion rates (male - female) for this shuffle.
p_diff = round(male_p - female_p, 3)
p_diff

#### Multiple randomizations

In [None]:
# Collects one difference in promotion rates per shuffle.
random_difference = []

# Run 100 randomizations; the loop counter doubles as the random seed.
for seed in range(100):

    # Shuffle the gender labels and assign them back by position.
    shuffled_gender = df['gender'].sample(frac=1, random_state=seed)
    df['gender_random'] = shuffled_gender.reset_index(drop=True)

    # Split the records by the shuffled label.
    df_male = df[df["gender_random"] == "male"]
    df_female = df[df["gender_random"] == "female"]

    male_total, female_total = len(df_male), len(df_female)

    # Count the promoted candidates in each group.
    male_counts = df_male['decision'].value_counts()
    female_counts = df_female['decision'].value_counts()
    male_promoted = male_counts.promoted
    female_promoted = female_counts.promoted

    # Promotion rate per group, rounded to three decimals.
    male_p = round(male_promoted / male_total, 3)
    female_p = round(female_promoted / female_total, 3)

    # Difference in promotion rates (male - female) for this shuffle.
    p_diff = round(male_p - female_p, 3)
    random_difference.append(p_diff)

In [None]:
# Put the simulated differences into a one-column DataFrame.
df_random = pd.DataFrame(random_difference, columns=['p_diff'])


In [None]:
# Descriptive statistics of the simulated differences, shown transposed.
summary = df_random.describe()
summary.T

In [None]:
# Both layers share the x axis, so they must carry the same axis title:
# when layered charts specify different titles, the titles are joined into
# one combined label (here it would end in ", p_diff").
x_title = 'Differences in promotion rates (male - female) across 100 shuffles'

# Simulated differences. The windowed rank within each p_diff value is used
# as the (hidden) y position so that equal differences do not overlap.
chart1 = alt.Chart(df_random).mark_circle(size=100).transform_window(
    id='rank()',
    groupby=['p_diff']
).encode(
    alt.X('p_diff:O', title=x_title),
    alt.Y('id:O',
          axis=None,
          sort='descending')
).properties(height=300, width=400)


# Observed difference, drawn in orange with the same encoding.
chart2 = alt.Chart(df_p_diff_ob).mark_circle(size=100).transform_window(
    id='rank()',
    groupby=['p_diff']
).encode(
    alt.X('p_diff:O', title=x_title),
    alt.Y('id:O',
          axis=None,
          sort='descending'),
    color=alt.value('orange')
)

# Layer the observed point on top of the simulated distribution.
chart1 + chart2


## Result

In [None]:
# Flag the shuffles whose difference is at least as large as the observed one.
at_least_observed = df_random['p_diff'] >= p_diff_ob
count_diff = at_least_observed.sum()

# Share of shuffles that are at least as extreme as the observed difference.
p_value = count_diff / len(df_random)
p_value


For our simulation, we get a ___% probability of obtaining a sample in which the promotion rate of male candidates exceeds that of female candidates by at least 29.2 percentage points under the null hypothesis.
We conclude that the data provide strong evidence of sex discrimination against female candidates.
We reject the null hypothesis in favor of the alternative.

 ## p-value and statistical significance

- H0: Sex has no effect on promotion decisions.

- HA: Female candidates are discriminated against in promotion decisions.

The 1-in-100 chance is what we call a p-value, which is a probability quantifying the strength of the evidence against the null hypothesis, given the observed data.

In [None]:
p_value

When the p-value is smaller than a previously set threshold (the significance level, usually 0.05), we say the results are statistically significant. This means the data provide such strong evidence against H0 that we reject the null hypothesis in favor of the alternative hypothesis.

In [None]:
def significance(p_value, alpha=0.05):
    """Return the test decision for a p-value at significance level ``alpha``.

    Parameters
    ----------
    p_value : float
        Probability, under the null hypothesis, of a result at least as
        extreme as the observed one.
    alpha : float, optional
        Significance level. The null hypothesis is rejected when
        ``p_value <= alpha``. Defaults to 0.05.

    Returns
    -------
    str
        "Reject Null Hypothesis" when ``p_value <= alpha``, otherwise
        "Fail to Reject Null Hypothesis".
    """
    if p_value <= alpha:
        return "Reject Null Hypothesis"
    # A large p-value is only a lack of evidence against H0; it does not
    # show that H0 is true, so the null hypothesis is never "accepted".
    return "Fail to Reject Null Hypothesis"

In [None]:
significance(p_value)