# Case study: Gender discrimination

## Setup

In [None]:
import pandas as pd

## Import data

In [1]:
# Load the promotion-decision records. Read as-is, the CSV yields an extra
# "Unnamed: 0" column (it appears to be a saved row index), so only the two
# data columns used by the analysis are loaded.
df = pd.read_csv(
    "https://raw.githubusercontent.com/kirenz/datasets/master/gender_discrimination.csv",
    usecols=["gender", "decision"],
)
df.head()

Unnamed: 0,gender,decision
0,male,promoted
1,male,promoted
2,male,promoted
3,male,promoted
4,male,promoted


In [79]:
# Print a structural summary of the frame: index range, column names,
# non-null counts per column, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   gender         48 non-null     object
 1   decision       48 non-null     object
 2   gender_random  48 non-null     object
dtypes: object(3)
memory usage: 1.2+ KB


## Create a crosstable

In [12]:
# Contingency table of gender (rows) against promotion decision (columns),
# including the row/column totals in an "All" margin.
pd.crosstab(
    index=df["gender"],
    columns=df["decision"],
    margins=True,
)

decision,not promoted,promoted,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,10,14,24
male,3,21,24
All,13,35,48


## Data analysis

In [80]:
# Observed difference in promotion rates (male minus female).

# Split the records by recorded gender.
df_male = df[df["gender"] == "male"]
df_female = df[df["gender"] == "female"]

male_total = len(df_male)
female_total = len(df_female)

# Count promotions with a boolean sum rather than `value_counts().promoted`:
# the attribute lookup raises AttributeError when a group contains no
# "promoted" rows, whereas the sum simply yields 0.
male_promoted = (df_male["decision"] == "promoted").sum()
female_promoted = (df_female["decision"] == "promoted").sum()

# Per-group promotion rates, rounded for display.
male_p = round(male_promoted / male_total, 3)
female_p = round(female_promoted / female_total, 3)

# Take the difference of the *unrounded* rates and round once at the end, so
# the rounding error of the two rates is not carried into the difference.
p_diff = round(male_promoted / male_total - female_promoted / female_total, 3)
p_diff

0.292

## Randomization

One randomization:

In [83]:
# Shuffle the gender labels across the records. A fixed `random_state` makes
# the shuffle (and every number derived from it) reproducible on re-run; change
# the seed to draw a different randomization.
# `.to_numpy()` assigns the shuffled labels by position, so the result does not
# depend on `df` having a default 0..n-1 index.
df['gender_random'] = df['gender'].sample(frac=1, random_state=42).to_numpy()

Since the randomization of files in this simulation is independent of the promotion decisions, any difference in promotion rates is due to chance.

In [84]:
# Contingency table of the shuffled gender labels (rows) against promotion
# decision (columns), including the "All" totals margin.
pd.crosstab(
    index=df["gender_random"],
    columns=df["decision"],
    margins=True,
)

decision,not promoted,promoted,All
gender_random,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,6,18,24
male,7,17,24
All,13,35,48


In [85]:
# Difference in promotion rates (male minus female) under the shuffled labels.
# NOTE: this reuses, and therefore overwrites, the variable names of the
# observed-data analysis (df_male, male_p, p_diff, ...).

# Split the records by the randomly reassigned gender label.
df_male = df[df["gender_random"] == "male"]
df_female = df[df["gender_random"] == "female"]

male_total = len(df_male)
female_total = len(df_female)

# Count promotions with a boolean sum rather than `value_counts().promoted`:
# the attribute lookup raises AttributeError when a group contains no
# "promoted" rows, whereas the sum simply yields 0.
male_promoted = (df_male["decision"] == "promoted").sum()
female_promoted = (df_female["decision"] == "promoted").sum()

# Per-group promotion rates, rounded for display.
male_p = round(male_promoted / male_total, 3)
female_p = round(female_promoted / female_total, 3)

# Take the difference of the *unrounded* rates and round once at the end, so
# the rounding error of the two rates is not carried into the difference.
p_diff = round(male_promoted / male_total - female_promoted / female_total, 3)
p_diff

-0.042

Multiple randomizations: