In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS

# Restore the variable `perform_dist_high_compare_b3` from IPython's %store database
# (presumably saved by another notebook or an earlier session).
# NOTE(review): no cell visible in this notebook references this name — the data is read
# from 'perform_dist_high_compare_b2.csv' instead. Confirm whether this restore is still needed.
%store -r perform_dist_high_compare_b3

In [2]:
# Load the performance-distance data (first CSV row is the header) and keep only
# complete rows: any row with a missing value in any column is discarded.
df = pd.read_csv('perform_dist_high_compare_b2.csv', header=0).dropna()
df

Unnamed: 0.1,Unnamed: 0,Student,group,abs_perform_diff_best,phase,Q7_Q7_1,Q7_Q7_2,Q8_Q8_1,Q10,similarity
1,1,bdvegat,2,260.00,1,1.0,2.0,4.0,1.0,0.018072
2,2,Phoenixest,2,185.83,1,0.0,1.0,3.0,1.0,0.017964
3,3,HashNick,2,260.00,1,1.0,3.0,5.0,3.0,0.028302
4,4,ccvacad,2,260.00,1,1.0,0.0,4.0,2.0,0.000000
5,5,joaortizro,2,152.50,1,0.0,2.0,2.0,1.0,0.041420
...,...,...,...,...,...,...,...,...,...,...
162,162,lsfinite,2,358.33,4,1.0,1.0,6.0,3.0,0.008392
163,163,xdanielsb,2,283.33,4,3.0,3.0,3.0,2.0,0.040128
164,164,CSebasGomez,2,358.33,4,3.0,3.0,5.0,2.0,0.000000
165,165,jhcardenasa,2,358.33,4,4.0,4.0,5.0,3.0,0.000000


In [3]:
# --- Step 1: first-stage relevance check -------------------------------------------
# An instrument must be correlated with the variable it instruments. Regress performance
# distance to the best on aspiration (Q10) and inspect the F-statistic and the p-value of
# the Q10 coefficient in the printed summary to judge whether the relevance condition holds.
reg_expr = 'abs_perform_diff_best ~ Q10'
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# --- Step 2: dependent variable -----------------------------------------------------
# ln(1 + similarity): the +1 keeps rows with similarity == 0 finite. np.log1p is the
# numerically accurate form of np.log(x + 1) for small x. .assign returns a new frame,
# so re-running the cell is idempotent and no SettingWithCopyWarning can be raised if
# df is a frame derived from another one.
df = df.assign(ln_similarity=np.log1p(df['similarity']))

# --- Step 3: 2SLS -------------------------------------------------------------------
# Build out the exog matrix. Statsmodels requires this matrix to contain all the endogenous and
# exogenous variables, plus the constant.
exog = df[['abs_perform_diff_best']]
exog = add_constant(exog)

# Build out the instruments matrix. Statsmodels requires this matrix to contain not only all the
# instruments but also the variables in exog that will NOT be instrumented (here: only the constant).
instruments = df[['Q10']]
instruments = add_constant(instruments)

# Build and train the IV2SLS model
iv2sls_model = IV2SLS(endog=df['ln_similarity'], exog=exog, instrument=instruments)
iv2sls_model_results = iv2sls_model.fit()

# Print the training summary
print(iv2sls_model_results.summary())

# --- Step 4: OLS baseline -----------------------------------------------------------
# Compare 2SLS with a plain OLS of ln(1 + similarity) on performance distance to the best.
# Note: reg_expr / olsr_model / olsr_model_results are re-bound here and no longer refer
# to the first-stage regression from Step 1.
reg_expr = 'ln_similarity ~ abs_perform_diff_best'
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

                              OLS Regression Results                             
Dep. Variable:     abs_perform_diff_best   R-squared:                       0.002
Model:                               OLS   Adj. R-squared:                 -0.005
Method:                    Least Squares   F-statistic:                    0.2540
Date:                   Sat, 24 Sep 2022   Prob (F-statistic):              0.615
Time:                           23:26:59   Log-Likelihood:                -915.75
No. Observations:                    156   AIC:                             1836.
Df Residuals:                        154   BIC:                             1842.
Df Model:                              1                                         
Covariance Type:               nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    199.5418 

In [4]:
# Bind the full-sample frame to the name `solution` (an alias, not a copy) and persist it
# with IPython's %store so it can be restored elsewhere via `%store -r solution`.
solution = df
%store solution

Stored 'solution' (DataFrame)


## Phase 4

In [5]:
# Phase-4 subsample: re-read the CSV, discard rows with any missing value,
# then keep only the observations recorded in phase 4.
df = pd.read_csv('perform_dist_high_compare_b2.csv', header=0).dropna()
df = df.loc[df['phase'] == 4]
df

Unnamed: 0.1,Unnamed: 0,Student,group,abs_perform_diff_best,phase,Q7_Q7_1,Q7_Q7_2,Q8_Q8_1,Q10,similarity
127,127,bdvegat,2,263.33,4,1.0,2.0,4.0,1.0,0.031579
128,128,Phoenixest,2,263.33,4,0.0,1.0,3.0,1.0,0.016043
129,129,HashNick,2,226.66,4,1.0,3.0,5.0,3.0,0.056277
130,130,ccvacad,2,358.33,4,1.0,0.0,4.0,2.0,0.066406
131,131,joaortizro,2,78.33,4,0.0,2.0,2.0,1.0,0.040816
132,132,juasmartinezbel,2,37.5,4,0.0,5.0,5.0,2.0,0.08078
133,133,mdbelloc,2,229.16,4,1.0,3.0,2.0,2.0,0.053097
134,134,diegocruz10,2,251.66,4,3.0,3.0,3.0,1.0,0.056338
135,135,crarojasca,2,313.33,4,1.0,3.0,5.0,2.0,0.0
136,136,sagilm,2,208.33,4,0.0,0.0,3.0,1.0,0.047393


In [6]:
# --- Step 1: first-stage relevance check -------------------------------------------
# Confirm whether aspiration satisfies the relevance condition for performance distance to
# the best. There is a difference between aspiration and social aspiration.
# Regress performance distance to the best on aspiration (Q10) and inspect the F-statistic
# and the p-value of the Q10 coefficient in the printed summary.
reg_expr = 'abs_perform_diff_best ~ Q10'
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

# --- Step 2: dependent variable -----------------------------------------------------
# df is a boolean-filtered slice at this point, so `df['ln_similarity'] = ...` would raise
# pandas' SettingWithCopyWarning. .assign returns a new frame instead, which also makes the
# cell idempotent. np.log1p is the numerically accurate form of np.log(x + 1) for small x;
# the +1 keeps rows with similarity == 0 finite.
df = df.assign(ln_similarity=np.log1p(df['similarity']))

# --- Step 3: 2SLS -------------------------------------------------------------------
# Build out the exog matrix. Statsmodels requires this matrix to contain all the endogenous and
# exogenous variables, plus the constant.
exog = df[['abs_perform_diff_best']]
exog = add_constant(exog)

# Build out the instruments matrix. Statsmodels requires this matrix to contain not only all the
# instruments but also the variables in exog that will NOT be instrumented (here: only the constant).
instruments = df[['Q10']]
instruments = add_constant(instruments)

# Build and train the IV2SLS model
iv2sls_model = IV2SLS(endog=df['ln_similarity'], exog=exog, instrument=instruments)
iv2sls_model_results = iv2sls_model.fit()

# Print the training summary
print(iv2sls_model_results.summary())

# --- Step 4: OLS baseline -----------------------------------------------------------
# Compare 2SLS with a plain OLS of ln(1 + similarity) on performance distance to the best.
# Note: reg_expr / olsr_model / olsr_model_results are re-bound here and no longer refer
# to the first-stage regression from Step 1.
reg_expr = 'ln_similarity ~ abs_perform_diff_best'
olsr_model = smf.ols(formula=reg_expr, data=df)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

                              OLS Regression Results                             
Dep. Variable:     abs_perform_diff_best   R-squared:                       0.001
Model:                               OLS   Adj. R-squared:                 -0.026
Method:                    Least Squares   F-statistic:                   0.02584
Date:                   Sat, 24 Sep 2022   Prob (F-statistic):              0.873
Time:                           23:26:59   Log-Likelihood:                -232.99
No. Observations:                     39   AIC:                             470.0
Df Residuals:                         37   BIC:                             473.3
Df Model:                              1                                         
Covariance Type:               nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    197.5375 

## Comments

![image1a](image1a.png)

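The coefficient of aspiration (Q10) is reported as significant, with a p-value that is essentially zero (p < 0.001), so aspiration (Q10) appears to meet the **relevance condition** for an instrumental variable for performance distance. Note, however, that the first-stage regressions printed above report Prob (F-statistic) values of 0.615 (all phases) and 0.873 (phase 4), which do not support relevance; this statement should be checked against the output shown in the image.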

We’ll now build a linear model for the similarity to the best equation and using statsmodels, we’ll train the model using the two-stage least square estimator.

We’ll start by building the design matrices. The dependent variable is ln(similarity + 1); the +1 keeps the logarithm defined when similarity is 0.

```
df['ln_similarity'] = np.log(df['similarity'] + 1)
```

Statsmodels’ IV2SLS estimator is defined as follows:

```

statsmodels.sandbox.regression.gmm.IV2SLS(endog, exog, instrument=None)

```
Statsmodels needs the endog, exog and instrument matrices to be constructed in a specific way as follows:

`endog` is an [n x 1] matrix containing the dependent variable. In our example, it is the log(similarity + 1) variable.

`exog` is an [n x (k+1)] size matrix that must contain all the endogenous and exogenous variables, plus the constant. In our example, apart from the constant, we do not have any exogenous variables defined in our similarity equation. 

`instrument` is a matrix that contains the instrumental variables. Additionally, the Statsmodels’ IV2SLS estimator requires `instrument` to also contain all variables from the `exog` matrix that are not being instrumented. In our example, the instrumental variable is aspiration. The only variable in `exog` that is not being instrumented is the placeholder column for the intercept. 

![image2a](image2a.png)

## Interpretation of results of the 2SLS model

Since our primary interest is in estimating the effect of performance distance to the best on similarity to the best, we’ll focus our attention on the coefficient estimate of the performance distance to the best. 

We see that the 2SLS model has estimated the coefficient of performance distance to the best as -7.251e-05 with a standard error of 0.000 and a 95% confidence interval of -0.000 to 0.000 (at the displayed precision). The p-value of 0.702 corresponds to a confidence level of only (1 − 0.702) × 100% = 29.8%, so the estimate is not statistically significant. Overall, and as expected for a 2SLS model, the model lacks precision.

Note that the dependent variable is log(similarity + 1), where log is the natural logarithm. To calculate the rate of change of similarity to the best for each unit change of performance distance to the best, we must exponentiate the coefficient of performance distance to the best.

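$1 - e^{-7.251 \times 10^{-5}} \approx 0.0000725$ (base $e$, because the model uses the natural logarithm), implying that a unit decrease in performance distance to the best is estimated to yield a proportional increase of about 0.0000725 in (1 + similarity to the best), and vice-versa.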

## Comparison of the IV estimator with an OLS estimator 

Let’s compare the performance of the 2SLS model with a straight-up OLS model that regresses log(similarity + 1) on performance distance to the best. 

![image3a.png](image3a.png)

We’ll focus our attention on the estimated value of the coefficient of performance distance to the best. At -0.0002, it is considerably larger in magnitude than the estimate reported by the 2SLS model (-7.251e-05).

$1 - e^{-0.0002} \approx 0.0002$ (base $e$, because the model uses the natural logarithm), implying a unit decrease in the performance distance to the best is estimated to translate into a proportional increase of about 0.0002 in (1 + similarity to the best), and vice versa. 

The larger (in magnitude) estimate from OLS is expected due to the suspected endogeneity of performance distance to the best. In practice, depending on the situation we are modeling, we may want to accept the more conservative estimate of -7.251e-05 reported by the 2SLS model. However (and against the 2SLS model), the coefficient estimate from the OLS model is highly significant, with a p-value of 0.000. Recall that the estimate from the 2SLS model was significant at only a 29.8% confidence level.

The coefficient estimate of performance distance to the best reported by the OLS model has a standard error very similar to that from the 2SLS model: both are reported as 0.000 at the displayed precision. 

For comparison, here are the coefficient estimates of performance distance to the best and corresponding 95% CIs from the two models:

With the IV estimator, one trades precision of estimates for the removal of endogeneity and the consequent bias in the estimates.

