In [3]:
import pandas as pd
import numpy as np
from scipy import stats

In [4]:
# Load the job-market discrimination dataset from its Stata (.dta) file.
# pd.read_stata is the documented top-level entry point for the same reader
# that lives at pandas.io.stata.read_stata.
data = pd.read_stata('data/us_job_market_discrimination.dta')

In [5]:
# Total of the `call` column over rows whose race is 'b'
# (callbacks received by resumes with black-sounding names)
sum(data.loc[data.race == 'b', 'call'])

157.0

In [6]:
# Preview the first five rows to inspect the available columns
data.head(5)

Unnamed: 0,id,ad,education,ofjobs,yearsexp,honors,volunteer,military,empholes,occupspecific,...,compreq,orgreq,manuf,transcom,bankreal,trade,busservice,othservice,missind,ownership
0,b,1,4,2,6,0,0,0,1,17,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,b,1,3,3,6,0,1,1,0,316,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,b,1,4,1,6,0,0,0,0,19,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,b,1,3,4,6,0,1,0,1,313,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,b,1,3,3,22,0,0,0,0,313,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Nonprofit


In [7]:
# Group sizes: number of rows with race 'b' (black-sounding names)
# and race 'w' (white-sounding names)
Nb = data[data.race == 'b'].shape[0]
Nw = data[data.race == 'w'].shape[0]

In [8]:
# Show both group sizes side by side
(Nb, Nw)

(2435, 2435)

In [9]:
# The Central Limit Theorem can be applied to a binomial proportion provided that
# both n*p and n*(1-p) are at least 5.
# In this problem each group has n = 2435 resumes and the smallest of these counts
# (157 callbacks for black-sounding names) is far above 5, so the CLT can be applied.


In [10]:
# Callback counts per group: sum of the `call` column within each race group
Nb_call = sum(data.loc[data.race == 'b', 'call'])
Nw_call = sum(data.loc[data.race == 'w', 'call'])

In [11]:
# Callback rate per group = callbacks received / group size.
# Note: despite the names, these are proportions in [0, 1], not percentages.
percentage_b = Nb_call / Nb
print("the percentage of black name got call is:", percentage_b)

percentage_w = Nw_call / Nw
print ("the percentage of white name got call is:", percentage_w)

the percentage of black name got call is: 0.064476386037
the percentage of white name got call is: 0.0965092402464


In [12]:
# The null hypothesis is that there is no difference in callback rates between black- and white-sounding names.
# H0: percentage_b = percentage_w   (H1: percentage_b != percentage_w)
# The standard deviation (standard error) of a sample proportion is estimated by
# std = sqrt(sample proportion*(1 - sample proportion)/sample size)
# Margin of error for the difference of two proportions = Z*sqrt(p1*(1-p1)/n + p2*(1-p2)/m), where n, m are the sample sizes

In [13]:
# Estimated variance of the sample proportion for the 'b' group: p*(1-p)/n
var_b = percentage_b * (1 - percentage_b) / Nb
var_b

2.4771737856498466e-05

In [14]:
# Estimated variance of the sample proportion for the 'w' group: p*(1-p)/n
var_w = percentage_w * (1 - percentage_w) / Nw
var_w

3.5809119833046381e-05

In [15]:
# For a 95% confidence interval, alpha = 0.05 (alpha/2 = 0.025 in each tail), so Z = 1.96

In [16]:
# Standard error of the difference between the two callback rates:
# square root of the summed per-group variances (groups treated as independent).
std_diff = np.sqrt(var_w + var_b)
print ("The std diff is:", std_diff)

# Half-width of the interval: 1.96 is the hard-coded normal critical value
Margin_of_Error = std_diff * 1.96
print("The margin of error for the difference is:", Margin_of_Error)

The std diff is: 0.00778337058668
The margin of error for the difference is: 0.0152554063499


In [17]:
# Observed gap in callback rates (white minus black), and the interval
# obtained by stepping one margin of error to either side of it.
Diff = percentage_w - percentage_b
CI = [Diff + sign * Margin_of_Error for sign in (-1, 1)]
print("The 95% confidence interval is:",CI)

The 95% confidence interval is: [0.016777447859559147, 0.047288260559332024]


In [18]:
# Two-sided z-test of H0: the difference in callback rates is 0.
# The statistic is the observed difference divided by its standard error (std_diff).
Z = (Diff - 0) / std_diff
print("The z value for the hypothesis test is:", Z)

# Upper-tail probability via the survival function rather than 1 - cdf:
# subtracting a cdf value close to 1 from 1 loses precision through cancellation
# and collapses to exactly 0 for large |Z|. Doubled for the two-sided test.
pval = 2 * stats.norm.sf(abs(Z))
print ("The p value for the hypothesis test is:", pval)

The z value for the hypothesis test is: 4.11555043573
The p value for the hypothesis test is: 3.86256520752e-05


In [19]:
# The two-sided p-value is much smaller than the significance level 0.05, meaning a difference
# this large would be very unlikely if the null hypothesis were true. Thus, REJECT the null hypothesis
# that there is no difference between black-sounding and white-sounding names in callbacks.

In [20]:
# From the hypothesis test, there is a statistically significant difference in callbacks between
# black- and white-sounding names, so race does have an impact on callbacks from resumes.
# However, it may not be the MOST important factor. There are other factors that may affect the callback rate.