# Case study: Gender discrimination

## Setup

In [9]:
import pandas as pd
import altair as alt

## Import data

In [10]:
# Location of the hosted CSV with one row per candidate
DATA_URL = "https://raw.githubusercontent.com/kirenz/datasets/master/gender_discrimination.csv"

df = pd.read_csv(filepath_or_buffer=DATA_URL)
# preview the first five rows
df.head(5)

Unnamed: 0,gender,decision
0,male,promoted
1,male,promoted
2,male,promoted
3,male,promoted
4,male,promoted


In [11]:
# Summarise the columns: names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   gender    48 non-null     object
 1   decision  48 non-null     object
dtypes: object(2)
memory usage: 896.0+ bytes


## Observed data

## Crosstable

In [12]:
# Contingency table of observed gender vs. decision, including row/column totals
pd.crosstab(index=df["gender"], columns=df["decision"], margins=True)

decision,not promoted,promoted,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,10,14,24
male,3,21,24
All,13,35,48


## Calculate difference

In [13]:
# Split the observed data by gender
df_male = df[df["gender"] == "male"]
df_female = df[df["gender"] == "female"]

male_total = len(df_male)
female_total = len(df_female)

# Count promotions with a boolean sum rather than `value_counts().promoted`:
# attribute-style label access raises AttributeError when a group contains no
# "promoted" row, whereas the sum simply yields 0.
male_promoted = (df_male["decision"] == "promoted").sum()
female_promoted = (df_female["decision"] == "promoted").sum()

# Promotion rate per group, rounded to three decimals
male_p = round(male_promoted / male_total, 3)
female_p = round(female_promoted / female_total, 3)

# Observed difference in promotion rates (male - female)
p_diff_ob = round(male_p - female_p, 3)

# One-row frame holding the observed difference
df_p_diff_ob = pd.DataFrame({'p_diff': [p_diff_ob]})
df_p_diff_ob

Unnamed: 0,p_diff
0,0.292


## Randomization

### Crosstable

Example with only one randomization:

In [14]:
# Shuffle the gender labels reproducibly (fixed seed), then drop the shuffled
# index so the labels are assigned to rows by position.
shuffled_gender = df['gender'].sample(frac=1, random_state=123)
df['gender_random'] = shuffled_gender.reset_index(drop=True)

Since the randomization of the gender labels in this simulation is independent of the promotion decisions, any difference in promotion rates is due to chance.

In [15]:
# Contingency table of shuffled gender vs. decision, including row/column totals
pd.crosstab(index=df["gender_random"], columns=df["decision"], margins=True)

decision,not promoted,promoted,All
gender_random,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,7,17,24
male,6,18,24
All,13,35,48


### Calculate difference

#### One randomization

In [16]:
# Rows grouped by their shuffled gender label
is_male_random = df["gender_random"] == "male"
is_female_random = df["gender_random"] == "female"
df_male, df_female = df.loc[is_male_random], df.loc[is_female_random]

# Group sizes
male_total, female_total = len(df_male), len(df_female)

# Number of promoted candidates in each shuffled group
male_promoted = len(df_male.loc[df_male["decision"] == "promoted"])
female_promoted = len(df_female.loc[df_female["decision"] == "promoted"])

# Promotion rates, rounded to three decimals
male_p = round(male_promoted / male_total, 3)
female_p = round(female_promoted / female_total, 3)

# Difference in promotion rates (male - female) for this single shuffle
p_diff = round(male_p - female_p, 3)
p_diff

0.042

#### Multiple randomizations

In [17]:
# Number of label shuffles used to approximate the null distribution
N_RANDOMIZATIONS = 100


def promotion_rate_diff(gender, decision):
    """Return the male minus female promotion rate, rounded to 3 decimals.

    Parameters
    ----------
    gender : pandas.Series
        Gender label per candidate ("male" / "female"), sharing its index
        with ``decision``.
    decision : pandas.Series
        Promotion decision per candidate; "promoted" counts as a promotion.

    Returns
    -------
    float
        ``round(male_rate - female_rate, 3)`` where each rate is itself
        rounded to three decimals.
    """
    is_promoted = decision == "promoted"
    is_male = gender == "male"
    is_female = gender == "female"

    # Boolean sums instead of `value_counts().promoted`: a group without any
    # promoted candidate then counts as 0 rather than raising AttributeError.
    male_p = round(is_promoted[is_male].sum() / is_male.sum(), 3)
    female_p = round(is_promoted[is_female].sum() / is_female.sum(), 3)

    return round(male_p - female_p, 3)


# One shuffle per seed. The shuffled labels stay in a temporary Series, so
# `df` is not modified inside the loop.
random_difference = [
    promotion_rate_diff(
        df['gender'].sample(frac=1, random_state=seed).reset_index(drop=True),
        df['decision'],
    )
    for seed in range(N_RANDOMIZATIONS)
]

In [18]:
# Collect the simulated differences in a single-column DataFrame
df_random = pd.DataFrame(random_difference, columns=['p_diff'])


In [19]:
# Summary statistics of the simulated differences, shown as a single row
p_diff_summary = df_random.describe()
p_diff_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p_diff,100.0,-0.01501,0.124152,-0.292,-0.125,-0.042,0.042,0.292


In [20]:
# Dot plot of the simulated differences. The window transform adds a field
# `id` computed as rank() within each group of equal `p_diff`; it is encoded
# on the (hidden) y axis.
# NOTE(review): presumably this stacks dots with the same difference
# vertically - confirm against the rendered chart.
chart1 = alt.Chart(df_random).mark_circle(size=100).transform_window(
    id='rank()',
    groupby=['p_diff']
).encode(
    alt.X('p_diff:O', title='Differences in promotion rates (male - female) across 100 shuffles'),
    alt.Y('id:O',
          axis=None,
          sort='descending')
).properties(height=300, width=400)


# Same transform and encoding for the single observed difference, drawn in orange
chart2 = alt.Chart(df_p_diff_ob).mark_circle(size=100).transform_window(
    id='rank()',
    groupby=['p_diff']
).encode(
    alt.X('p_diff:O'),
    alt.Y('id:O',
          axis=None,
          sort='descending'),
    color=alt.value('orange')
)

# Layer the observed difference over the simulated ones
chart1 + chart2 


## Result

In [42]:
# Flag the shuffles whose difference is at least as large as the observed one
is_at_least_observed = df_random['p_diff'] >= p_diff_ob
count_diff = is_at_least_observed.sum()

# Share of such shuffles among all simulated differences
p_value = count_diff / len(df_random)
p_value

0.01


For our simulation, we get a 1% probability of obtaining a sample in which the promotion rate of male candidates exceeds that of female candidates by at least 29.2 percentage points under the null hypothesis.
We conclude that the data provide strong evidence of sex discrimination against female candidates.
We reject the null hypothesis in favor of the alternative.

## p-value and statistical significance

- H0: Sex has no effect on promotion decisions.

- HA: Female candidates are discriminated against in promotion decisions.

The 1-in-100 chance is what we call a p-value, which is a probability quantifying the strength of the evidence against the null hypothesis, given the observed data.

In [43]:
# Display the p-value computed in the Result section
p_value

0.01

When the p-value is smaller than a previously set threshold (the significance level, usually 0.05), we say the results are statistically significant. This means the data provide such strong evidence against H0 that we reject the null hypothesis in favor of the alternative hypothesis.

In [44]:
def significance(p_value, alpha=0.05):
    """Translate a p-value into a test decision at significance level ``alpha``.

    Parameters
    ----------
    p_value : float
        p-value of the test.
    alpha : float, default 0.05
        Significance level; the null hypothesis is rejected when
        ``p_value <= alpha``.

    Returns
    -------
    str
        "Reject Null Hypothesis" when ``p_value <= alpha``, otherwise
        "Fail to Reject Null Hypothesis".
    """
    if p_value <= alpha:
        return "Reject Null Hypothesis"
    # A large p-value is a lack of evidence against H0, not evidence for it,
    # so the null hypothesis is never "accepted".
    return "Fail to Reject Null Hypothesis"

In [45]:
# Apply the decision rule to the simulated p-value
significance(p_value)

'Reject Null Hypothesis'