In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy import stats
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)


In [9]:
df = pd.read_csv("ovb.csv")
df['const'] = 1

In [10]:
#Collecting women in the data:
dfwomen = df[df.female == 1]

In [7]:
#Correlation Coefficient:
corr_sq = np.corrcoef(dfwomen.educ, dfwomen.logwage)[1][0]**2

print("The correlation coefficient squared between education and logwage of women is: {}".format(corr_sq))

The correlation coefficient squared between education and logwage of women is: 0.22388700299289635


In [16]:
reg = sm.OLS(dfwomen.logwage, dfwomen[['educ', 'const']]).fit()


In [21]:
sumtable = np.vstack([[reg.rsquared], [corr_sq]])
sumtable = np.round(sumtable, 6)
col = [' R**2 vs Corr Coef' ]
row = ['Rsquared','corrsquared' ]
tb1 = SimpleTable(sumtable,col, row, txt_fmt=default_txt_fmt)

In [22]:
print(tb1)

             R**2 vs Corr Coef
------------------------------
Rsquared         0.223887     
corrsquared      0.223887     
------------------------------


### 3

#### a) 
The unpooled t-test shows that the mean difference between female immigrants and female non-immigrant logwages are significant. 
        t-test = -1111.506

In [81]:
#Mean and standard error imm = 0: X
Mx = dfwomen.loc[dfwomen['imm'] == 0].logwage.mean()
sex = stats.sem(df.loc[df['imm'] == 0].logwage)
nx = dfwomen.loc[dfwomen['imm'] == 1].logwage.count()

print("The mean of female non-immigrants: {}".format(Mx))
print("The standard error of female non-immigrants: {}".format(sex))


The mean of female non-immigrants: 2.886378280700209
The standard error of female non-immigrants: 0.0051014224397050185


In [82]:
#Mean and standard error imm = 1: Y
My = dfwomen.loc[dfwomen['imm'] == 1].logwage.mean()
sey = stats.sem(df.loc[df['imm'] == 1].logwage)
ny = dfwomen.loc[dfwomen['imm'] == 1].logwage.count()

print("The mean of female immigrants: {}".format(My))
print("The standard error of female immigrants: {}".format(sey))


The mean of female immigrants: 2.706392526468514
The standard error of female immigrants: 0.011012508469630942


##### Test statistic:

In [83]:
ttest = (My-Mx) / np.sqrt((sex**2 / nx) + (sex**2 / ny))
print("The unpooled t-test for the difference: {}".format(ttest))
print('Difference in mean: {}'.format(My-Mx))

The unpooled t-test for the difference: -1111.5067433677216
Difference in mean: -0.17998575423169516


#### b)

In [79]:
reg1 = sm.OLS(dfwomen.logwage, dfwomen[['const','imm']]).fit()
reg1.summary()

0,1,2,3
Dep. Variable:,logwage,R-squared:,0.011
Model:,OLS,Adj. R-squared:,0.011
Method:,Least Squares,F-statistic:,118.5
Date:,"Wed, 27 Feb 2019",Prob (F-statistic):,1.85e-27
Time:,17:25:06,Log-Likelihood:,-10701.0
No. Observations:,10601,AIC:,21410.0
Df Residuals:,10599,BIC:,21420.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.8864,0.007,403.480,0.000,2.872,2.900
imm,-0.1800,0.017,-10.887,0.000,-0.212,-0.148

0,1,2,3
Omnibus:,353.044,Durbin-Watson:,1.837
Prob(Omnibus):,0.0,Jarque-Bera (JB):,538.131
Skew:,0.324,Prob(JB):,1.4000000000000001e-117
Kurtosis:,3.893,Cond. No.,2.67


In [84]:
beta_imm = reg1.params[1]
print("The mean difference for the mean and the immigration coefficient from OLS imm is : {}".format((My-Mx) - beta_imm))




The mean difference for the mean and the immigration coefficient from OLS imm is : 9.71445146547012e-16


In [87]:
#Test statistic for beta:
sem1 = reg1.bse[1]
t_beta = beta_imm / sem1
t_beta

-10.887143729424317

In [89]:
se = np.vstack([[reg1.bse], [reg1.HC1_se]])
se = np.round(se,5)
col = ['constant', 'Immigrant']
row = ['OLS', "Heteroskedasticity-robust"]
tb1 = SimpleTable(se,col, row, txt_fmt=default_txt_fmt)
tb1

0,1,2
,constant,Immigrant
OLS,0.00715,0.01653
Heteroskedasticity-robust,0.00702,0.01753
