In [57]:
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import arviz as az
import pandas as pd
from scipy.special import expit as logistic

import stan_jupyter as stan


## Question 1

__The data in data(NWOGrants) are outcomes for scientific funding applications for the Netherlands Organization for Scientific Research (NWO) from 2010–2012. 
These data have avery similar structure to the UCBAdmit data discussed in Chapter 11.__

- Draw a DAG for this sample and then use one or more binomial GLMs to estimate the TOTAL
causal effect of gender on grant awards.

In [73]:
df = pd.read_csv("../data/NWOGrants.csv", delimiter=";")

In [74]:
df["gender_num"] = df.groupby("gender").ngroup() + 1
df["discipline_num"] = df.groupby("discipline").ngroup() + 1

In [75]:
with open('models/w5_1.stan') as f:
    model_code = f.read()
    
model_data = {"N": len(df),
              "N_GENDERS": df.gender_num.nunique(),
              "gender": df.gender_num.tolist(),
              "awards": df.awards.tolist(),
              "applications": df.applications.tolist()}

In [76]:
%%capture
posterior = stan.build(model_code, model_data)

In [77]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=10000)

In [78]:
np.mean(fit['a_gender'][0] - fit['a_gender'][1])

-0.20439080735317974

In [79]:
print("P(award | male_applicant) = ", round(np.mean(logistic(fit['a_gender'][0])), 3))
print("P(award | female_applicant) = ", round(np.mean(logistic(fit['a_gender'][1])), 3))

print("On average, P(award | male_candidate) is ", abs(round(np.mean(logistic(fit['a_gender'][0]) - logistic(fit['a_gender'][1])), 3)), "lower than P(award | female candidate)")

P(award | male_applicant) =  0.15
P(award | female_applicant) =  0.178
On average, P(award | male_candidate) is  0.028 lower than P(award | female candidate)


## Question 2

__Now estimate the DIRECT causal effect of gender on grant awards.__

- Compute the average direct causal effect of gender, weighting each discipline in proportion to the number of applications in the sample. 
- Refer to the marginal effect example in Lecture 9 for help.

In [80]:
with open('models/w5_2.stan') as f:
    model_code = f.read()
    
model_data = {"N": len(df),
              "N_GENDERS": df.gender_num.nunique(),
              "N_DISCIPLINES": df.discipline_num.nunique(),
              "discipline": df.discipline_num.tolist(),
              "gender": df.gender_num.tolist(),
              "awards": df.awards.tolist(),
              "applications": df.applications.tolist()}

In [81]:
%%capture
posterior = stan.build(model_code, model_data)

In [82]:
%%capture
fit = posterior.sample(num_chains=4, num_samples=10000)

In [83]:
print("P(award | male_applicant) = ", round(np.mean(logistic(fit['a_gender'][0])), 3))
print("P(award | female_applicant) = ", round(np.mean(logistic(fit['a_gender'][1])), 3))

print("On average, P(award | male_candidate) is ", abs(round(np.mean(logistic(fit['a_gender'][0]) - logistic(fit['a_gender'][1])), 3)), "lower than P(award | female candidate)")

P(award | male_applicant) =  0.219
P(award | female_applicant) =  0.243
On average, P(award | male_candidate) is  0.024 lower than P(award | female candidate)


In [84]:
np.mean(logistic(fit['a_gender'][1]))

0.24258866695725356

In [85]:
np.mean(logistic(fit['a_gender'][0]) - logistic(fit['a_gender'][1]))

-0.023741536346116754

## Question 3

Considering the total effect (problem 1) and direct effect (problem 2) of gender, what causes contribute to the average difference between women and men in award rate in this sample? It is not necessary to say whether or not there is evidence of discrimination. 

- Simply explain how the direct effects you have estimated make sense (or not) of the total effect.

## Question 4 (optional). 

The data in data(UFClefties) are the outcomes of 205 Ultimate Fighting Championship (UFC) matches (see ?UFClefties for details). It is widely believed that left-handed fighters (aka “Southpaws”) have an advantage against right-handed fighters, and left-handed men are indeed over-represented among fighters (and fencers and tennis players) compared to the general population. 

- Estimate the average advantage, if any, that a left-handed fighter has against right-handed fighters. Based upon your estimate, why do you think lefthanders are over-represented among UFC fighters?