In [5]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import add_constant
from statsmodels.sandbox.regression.gmm import IV2SLS


#Load the Panel Study of Income Dynamics (PSID) into a Dataframe
df = pd.read_csv('PSID1976.csv', header=0)
print(df)

#Use a subset of the dataset where participating=yes
df_1975 = df.query('participation == \'yes\'')
print(df_1975)

#Let's confirm that meducation and feducation satisfy the relevance condition for education
reg_expr = 'education ~ meducation + feducation'

#Build an train an OLS model that regresses education on meducation and feducation and verify
# using the F-test that coefficients of meducation and feducation are jointly significant
# significance
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

#Build the dependent variable column
df_1975['ln_wage'] = np.log(df_1975['wage'])

#Build out the exog matrix. Statsmodels requires this matrix to contain all the endogenous and
# exogenous variables, plus the constant.
exog = df_1975[['education']]
exog = add_constant(exog)

#Build out the instruments matrix. Statsmodels requires this matrix to contain not only all the
# instruments but also the variables in exog that will NOT be instrumented
instruments = df_1975[['meducation', 'feducation']]
instruments = add_constant(instruments)

#Build and train the IV2SLS model
iv2sls_model = IV2SLS(endog=df_1975['ln_wage'], exog=exog, instrument=instruments)
iv2sls_model_results = iv2sls_model.fit()

#Print the training summary
print(iv2sls_model_results.summary())

#Compare the performance of 2SLS with OLS of ln(wage) on education
reg_expr = 'ln_wage ~ education'
olsr_model = smf.ols(formula=reg_expr, data=df_1975)
olsr_model_results = olsr_model.fit()
print(olsr_model_results.summary())

     Unnamed: 0 participation  hours  youngkids  oldkids  age  education  \
0             1           yes   1610          1        0   32         12   
1             2           yes   1656          0        2   30         12   
2             3           yes   1980          1        3   35         12   
3             4           yes    456          0        3   34         12   
4             5           yes   1568          1        2   31         14   
..          ...           ...    ...        ...      ...  ...        ...   
748         749            no      0          0        2   40         13   
749         750            no      0          2        3   31         12   
750         751            no      0          0        0   43         12   
751         752            no      0          0        0   60         12   
752         753            no      0          0        3   39          9   

       wage  repwage  hhours  ...    hwage  fincome     tax  meducation  \
0    3.3540 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1975['ln_wage'] = np.log(df_1975['wage'])
