In [1]:
import pandas as pd

![Probabilities](ethicalmldiagram.png)

# P(race) (this would be given)

In [2]:
ProbRace = pd.read_csv('ProbRace.csv')[['Demographic','Percentage']]
ProbRace.head()

Unnamed: 0,Demographic,Percentage
0,white,0.765109
1,black,0.104995
2,hispanic,0.084471
3,asian,0.045425


# P(good | race)  (this would be given, it is the pi values)

In [3]:
ProbGoodGivenRace = pd.read_csv("ProbGoodGivenRace.csv")
ProbGoodGivenRace

Unnamed: 0,white,black,hispanic,asian
0,0.759185,0.315164,0.550595,0.80066


# P(score>=x | good, race) (this they would calculate)

In [4]:
ProbScoreGreaterThanXGivenGoodAndRace = pd.read_csv('ProbScoreGreaterThanXGivenGoodAndRace.csv').set_index('TransRisk Score')
ProbScoreGreaterThanXGivenGoodAndRace.head()

Unnamed: 0_level_0,asian,black,hispanic,white
TransRisk Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,0.0,0.0,0.0
0.5,0.0,0.000161,0.000172,0.0
1.0,0.000568,0.003032,0.000527,0.000304
1.5,0.000568,0.003701,0.000527,0.000401
2.0,0.000895,0.004625,0.000874,0.000606


<hr/>

# How they will calculate P(score>=x | good, race) for tutorial

<h2 align='center'>First: find P(score = x | good, race)</h2>
### P(score=x | good, race) = P(score=x & good & race) / P(good and race)
<hr/>
### Step 1:
### P(score=x & good & race ) = P(race & score=x) * P(good | race, score=x)
###  P(score=x & good & race ) = P(race) * P(score=x | race) * P(good | race, score=x)
<hr/>
### Step 2:
### P(good and race) = P(race) * P(good | race)
<hr/>
### Step 3:
### Step 1 / Step2
### P(score=x | good, race) = P(score=x & good & race) / P(good and race)
<hr/>

## What they will be given

** ProbGoodGivenRaceAndScoreEqualsX : P(good | race, score=x) **
- This was from figure 7A

** ProbRace : P(race) **
- this was given above
- it is from the percentage of people in the demographic sample size

** ProbScoreEqualsXGivenRace : P(score=x | race) **
- This was from figure 3A

** ProbGoodGivenRace : P(good | race) **
- this was given above
- it represents the calculated pi values

In [5]:
ProbGoodGivenRaceAndScoreEqualsX = pd.read_csv('ProbGoodGivenRaceAndScoreEqualsX.csv').set_index('TransRisk Score')
ProbScoreEqualsXGivenRace = pd.read_csv('ProbScoreEqualsXGivenRace.csv').set_index('TransRisk Score')

### Step 1:

In [6]:
ProbRace.set_index('Demographic', inplace=True)

In [7]:
ProbRaceAndScoreEqualsX = pd.DataFrame({
    'white': ProbRace.Percentage['white'] * ProbScoreEqualsXGivenRace['white'],
    'asian': ProbRace.Percentage['asian'] * ProbScoreEqualsXGivenRace['asian'],
    'black': ProbRace.Percentage['black'] * ProbScoreEqualsXGivenRace['black'],
    'hispanic': ProbRace.Percentage['hispanic'] * ProbScoreEqualsXGivenRace['hispanic'],
})

ProbScoreEqualsXAndGoodAndRace = (
    ProbRaceAndScoreEqualsX * ProbGoodGivenRaceAndScoreEqualsX)

### Step 2:

In [8]:
ProbGoodAndRace = pd.DataFrame({
    'white': ProbRace.Percentage['white'] * ProbGoodGivenRace['white'],
    'asian': ProbRace.Percentage['asian'] * ProbGoodGivenRace['asian'],
    'black': ProbRace.Percentage['black'] * ProbGoodGivenRace['black'],
    'hispanic': ProbRace.Percentage['hispanic'] * ProbGoodGivenRace['hispanic'],
})

### Step 3:

In [9]:
ProbScoreEqualsXGivenGoodAndRace = pd.DataFrame({
    'white': ProbScoreEqualsXAndGoodAndRace['white'] / ProbGoodAndRace['white'].values[0],
    'asian': ProbScoreEqualsXAndGoodAndRace['asian'] / ProbGoodAndRace['asian'].values[0],
    'black': ProbScoreEqualsXAndGoodAndRace['black'] / ProbGoodAndRace['black'].values[0],
    'hispanic': ProbScoreEqualsXAndGoodAndRace ['hispanic'] / ProbGoodAndRace['hispanic'].values[0],
})


<h2 align='center'>Second: Use P(score=x | good, race) to find the P(score>=x | good, race)</h2>

##  P(score>=x | good, race) = P(score = x | good, race).cumsum()

In [10]:
ProbScoreEqualsXGivenGoodAndRace.cumsum().head()

Unnamed: 0_level_0,asian,black,hispanic,white
TransRisk Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,0.0,0.0,0.0
0.5,0.0,0.000161,0.000172,0.0
1.0,0.000568,0.003032,0.000527,0.000304
1.5,0.000568,0.003701,0.000527,0.000401
2.0,0.000895,0.004625,0.000874,0.000606


In [11]:
# Might actually need to do this to obtain the correct dataframe
# (so 1.0 is at score = 0.0, not at score = 100.0)
#ProbScoreEqualsXGivenGoodAndRace.iloc[::-1].cumsum()[::-1]

# Calulating Precision (not sure if this is the way we want to do it)

<h3>What I'm working with:</h3>

** ProbScoreGreaterThanXGivenRace : P(score>=x | race) **
- obtained from cumsumming figure 3A

** ProbGoodGivenRaceAndScoreGreaterX : P(good | race, score>=x) **
- calculated in the scraping the data, this is what we used to think sensitivity was a few weeks ago

<h3 align='center'>Step 1: Find P(race and score>=x and good) - will use this to find the numerator</h3>

### P(race and score>=x and good) =  P(race and score>=x) * P(good | race & score>=x)
### P(race and score>=x) = P(race) * P(score>=x | race)

In [12]:
ProbScoreGreaterThanXGivenRace = (
    pd.read_csv('ProbScoreGreaterThanXGivenRace.csv')
    .set_index('TransRisk Score'))

ProbGoodGivenRaceAndScoreGreaterX = (
    pd.read_csv('ProbGoodGivenRaceAndScoreGreaterX.csv'))

In [13]:
ProbRaceAndScoreGreaterX = pd.DataFrame({
    'white': ProbRace.Percentage['white'] * ProbScoreGreaterThanXGivenRace['white'],
    'asian': ProbRace.Percentage['asian'] * ProbScoreGreaterThanXGivenRace['asian'],
    'black': ProbRace.Percentage['black'] * ProbScoreGreaterThanXGivenRace['black'],
    'hispanic': ProbRace.Percentage['hispanic'] * ProbScoreGreaterThanXGivenRace['hispanic'],
})
ProbRaceAndScoreGreaterXAndGood = (
    ProbGoodGivenRaceAndScoreGreaterX.set_index('TransRisk Score') * 
    ProbRaceAndScoreGreaterX).fillna(value=0)

In [14]:
ProbRaceAndScoreGreaterXAndGood.head()

Unnamed: 0_level_0,asian,black,hispanic,white
TransRisk Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.03637,0.03309,0.04651,0.58086
0.5,0.03637,0.03309,0.04651,0.58086
1.0,0.03637,0.033085,0.046502,0.58086
1.5,0.036349,0.03299,0.046485,0.580683
2.0,0.036349,0.032968,0.046485,0.580627


<h3 align='center'>Step 2: Sum up all the demographics from P(score>=x and race) to get P(score>=x)</h3>
- this will be our denominator

In [15]:
ProbScoreGreaterX = pd.DataFrame(ProbRaceAndScoreGreaterX.sum(axis=1))
ProbScoreGreaterX.columns=['P(score>=x)']
ProbScoreGreaterX.head()

Unnamed: 0_level_0,P(score>=x)
TransRisk Score,Unnamed: 1_level_1
0.0,0.999842
0.5,0.999842
1.0,0.996305
1.5,0.983254
2.0,0.979465


<h3 align='center'>Step 3: Sum up all the demographics from P(score>=x and race and good) to get P(score>=x and good)</h3>
- this will be our numerator

In [16]:
ProbScoreGreaterXAndGood = pd.DataFrame(ProbRaceAndScoreGreaterXAndGood.sum(axis=1))
ProbScoreGreaterXAndGood.columns=['P(score>=x & good)']
ProbScoreGreaterXAndGood.head()

Unnamed: 0_level_0,P(score>=x & good)
TransRisk Score,Unnamed: 1_level_1
0.0,0.696829
0.5,0.696829
1.0,0.696816
1.5,0.696507
2.0,0.696429


<h3 align='center'>Step 4: Divide numerator / denominator to get precision!</h3>
### Precision is P(good | score>=x)

In [17]:
Precision = pd.DataFrame(
    ProbScoreGreaterXAndGood['P(score>=x & good)'] / 
    ProbScoreGreaterX['P(score>=x)'])
Precision.columns=['Precision']
Precision.head()

Unnamed: 0_level_0,Precision
TransRisk Score,Unnamed: 1_level_1
0.0,0.69694
0.5,0.69694
1.0,0.6994
1.5,0.70837
2.0,0.71103


In [19]:
pd.read_csv('figure7A.csv').cumsum()

Unnamed: 0,Score,White (Good),White (Bad),Black (Good),Black (Bad),Hispanic (Good),Hispanic (Bad),Asian (Good),Asian (Bad)
0,0.0,0.00,0.04,0.00,0.12,0.00,0.03,0.00,0.00
1,0.5,0.00,1.25,0.02,2.12,0.02,1.20,0.00,0.75
2,1.5,0.03,6.40,0.39,10.80,0.08,6.62,0.06,5.37
3,3.0,0.07,12.78,0.84,21.29,0.14,13.37,0.12,10.95
4,5.0,0.13,20.73,1.40,34.02,0.24,21.96,0.21,17.60
5,7.5,0.21,29.90,2.07,49.01,0.40,31.89,0.30,25.19
6,10.5,0.32,40.97,3.00,66.99,0.62,44.24,0.40,34.51
7,14.0,0.46,53.69,4.07,87.49,1.00,58.20,0.51,45.59
8,18.0,0.63,68.16,5.22,110.64,1.43,74.06,0.65,57.97
9,22.5,0.81,83.23,6.41,134.67,1.91,90.61,0.80,70.88
