In [99]:
import pandas as pd
import pyreadstat
import statsmodels.api as sm
import numpy as np
from scipy import stats

# 1

In [None]:
# Stata (.dta) file holding the homework data.
path = 'econ535_hw1.dta'

# read_dta returns the data frame together with a metadata object
# (variable labels, value labels, ...).
df, meta = pyreadstat.read_dta(path)

# Only the final expression of a cell is rendered, so a bare
# `meta.column_labels` line placed before this one was evaluated and then
# thrown away. It is dropped; the value-label mapping remains the cell output.
meta.variable_value_labels

{}

In [31]:
# Data dictionary: one row per column of `df`.
#   variable  - column name
#   label     - variable label from the file metadata
#   dtypes    - pandas dtype of the column
#   n_nonnull - number of non-missing observations
#   mean      - column mean
#
# Object-dtype columns are passed through pd.to_numeric(errors="coerce") before
# averaging: entries that cannot be parsed become NaN instead of making the
# reduction raise (pandas >= 2.0) or concatenating strings (older versions).
numeric_view = df.apply(
    lambda s: pd.to_numeric(s, errors="coerce") if s.dtype == object else s
)

dict_df = pd.DataFrame(
    {
        "variable": df.columns,
        "label": meta.column_labels,
        "dtypes": df.dtypes,
        "n_nonnull": df.count(),
        "mean": numeric_view.mean(),
    },
    # Pin the row index explicitly so the positional columns (variable, label)
    # and the Series-valued columns are guaranteed to line up row by row.
    index=df.columns,
)
print(dict_df)

          variable                                    label   dtypes  \
sampleid  sampleid  unique sample id, links with survey and   object   
quarter    quarter                                     None    int64   
treat        treat                     research group dummy    int64   
higrade    higrade                  highest grade completed   object   
black        black          dummy:ethnicity black (ethnicy)   object   
white        white          dummy:ethnicity white (ethnicy)   object   
otheth      otheth     dummy:ethnicity other (native,asian)   object   
yngchage  yngchage           age yngst child (0-19yrs only)  float64   
adcc          adcc                  quarterly afdc payments    int64   
yr2adc      yr2adc     cova:total afdc payments prior 2 yrs    int64   
yradc        yradc      cova:total afdc payments prior year    int64   
earn          earn                       quarterly earnings    int64   
yrearn      yrearn        cova:earnings in year prior to ra    i

In [29]:
# Dataset dimensions as a (rows, columns) tuple.
print((len(df.index), len(df.columns)))

(11260, 24)


For earnings, welfare income, total income, education, age, and age squared, responses are complete for everything except education.

In [16]:
# Regress each outcome on a lone intercept column and print the estimated
# intercept next to the outcome's sample mean for comparison.
for ycol in ("earn", "adcc", "tinc"):
    y = df[ycol]
    X = pd.DataFrame({"const": 1.0}, index=df.index)
    m = sm.OLS(y, X, missing="drop").fit()

    n_used = int(m.nobs)
    intercept = float(m.params["const"])
    sample_mean = float(y.dropna().mean())

    print("\n", ycol)
    print("N used:", n_used)
    print("Intercept (regression):", intercept)
    print("Mean (dropna):", sample_mean)


 earn
N used: 11260
Intercept (regression): 571.5408525754883
Mean (dropna): 571.5408525754884

 adcc
N used: 11260
Intercept (regression): 543.4680284191828
Mean (dropna): 543.4680284191829

 tinc
N used: 11260
Intercept (regression): 1691.3188277087029
Mean (dropna): 1691.3188277087033


Regressing a variable on only a constant returns an intercept equal to that variable's sample mean; the two printed values agree up to floating-point rounding in the last digit.

In [47]:
cols = ['higrade', 'age', 'agesq']

# Convert the regressors one column at a time; errors='raise' makes any value
# that cannot be parsed as a number abort the conversion. Results are stored as float.
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='raise').astype(float)

# Estimation sample: keep only rows where earnings and all three regressors are present.
d = df[['earn'] + cols].dropna()

In [53]:
# Earnings regressed on education and a quadratic in age, with an intercept.
y = d['earn']
X = sm.add_constant(d[['higrade', 'age', 'agesq']])

model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                   earn   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     102.5
Date:                Mon, 16 Feb 2026   Prob (F-statistic):           2.08e-65
Time:                        14:43:26   Log-Likelihood:                -91983.
No. Observations:               10940   AIC:                         1.840e+05
Df Residuals:                   10936   BIC:                         1.840e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1221.0501    186.247     -6.556      0.0

In [54]:
# Partial the age quadratic (plus intercept) out of earnings and keep the residuals.
X_age = sm.add_constant(d[['age', 'agesq']])
earn_on_age = sm.OLS(d['earn'], X_age).fit()
earn_resid = earn_on_age.resid

In [55]:
# Partial the same age controls out of education and keep the residuals.
edu_on_age = sm.OLS(d['higrade'], X_age).fit()
edu_resid = edu_on_age.resid

In [56]:
# Residual-on-residual regression, with an intercept added to the regressor.
y = earn_resid
X = sm.add_constant(edu_resid)

fwl_model = sm.OLS(y, X).fit()
print(fwl_model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     256.3
Date:                Mon, 16 Feb 2026   Prob (F-statistic):           4.98e-57
Time:                        14:43:32   Log-Likelihood:                -91983.
No. Observations:               10940   AIC:                         1.840e+05
Df Residuals:                   10938   BIC:                         1.840e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -7.844e-12     10.371  -7.56e-13      1.0

In [57]:
# Same residual-on-residual regression, this time without adding an intercept.
y, X = earn_resid, edu_resid

fwl_model_no_constant = sm.OLS(y, X).fit()
print(fwl_model_no_constant.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.023
Model:                            OLS   Adj. R-squared (uncentered):              0.023
Method:                 Least Squares   F-statistic:                              256.3
Date:                Mon, 16 Feb 2026   Prob (F-statistic):                    4.92e-57
Time:                        14:43:37   Log-Likelihood:                         -91983.
No. Observations:               10940   AIC:                                  1.840e+05
Df Residuals:                   10939   BIC:                                  1.840e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [82]:
# Subtract each column's mean, then fit earnings on the three regressors with
# no intercept column.
d_dm = d.sub(d.mean(), axis="columns")

y = d_dm['earn']
X = d_dm[['higrade', 'age', 'agesq']].to_numpy()

demeaned_model = sm.OLS(y, X).fit()
print(demeaned_model.summary())

# Plain arrays of residuals and fitted values for the checks that follow.
residuals = demeaned_model.resid.to_numpy()
yhat = demeaned_model.fittedvalues.to_numpy()

                                 OLS Regression Results                                
Dep. Variable:                   earn   R-squared (uncentered):                   0.027
Model:                            OLS   Adj. R-squared (uncentered):              0.027
Method:                 Least Squares   F-statistic:                              102.5
Date:                Mon, 16 Feb 2026   Prob (F-statistic):                    2.05e-65
Time:                        14:59:35   Log-Likelihood:                         -91983.
No. Observations:               10940   AIC:                                  1.840e+05
Df Residuals:                   10937   BIC:                                  1.840e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [83]:
# Sum of the residuals, rounded to four decimals.
resid_total = residuals.sum()
print(np.round(resid_total, 4))

0.0


In [84]:
# Mean of the dependent variable, then mean of the fitted values.
for vec in (y, yhat):
    print(np.round(vec.mean(), 4))

0.0
-0.0


In [85]:
# Product of the transposed regressor matrix with the residual vector.
Xt_resid = np.matmul(X.T, residuals)
print("X' u =", np.round(Xt_resid, 4))

X' u = [ 0. -0. -0.]


In [86]:
# Squared correlation between actual and fitted values, shown next to the
# R^2 reported by the fitted model.
corr_matrix = np.corrcoef(y, yhat)
r = corr_matrix[0, 1]
r2_from_corr = r**2

print("corr(y,yhat)^2:", r2_from_corr)
print("R^2 (statsmodels):", demeaned_model.rsquared)

corr(y,yhat)^2: 0.027337872608211338
R^2 (statsmodels): 0.02733787260821141


In [88]:
# Rebind the working arrays to the no-constant residual-on-residual regression.
y, X = earn_resid.to_numpy(), edu_resid.to_numpy()

residuals, yhat = (
    fwl_model_no_constant.resid.to_numpy(),
    fwl_model_no_constant.fittedvalues.to_numpy(),
)

In [89]:
# Sum of residuals.
resid_total = residuals.sum()
print(np.round(resid_total, 4))

# Mean of the dependent variable, then mean of the fitted values.
for vec in (y, yhat):
    print(np.round(vec.mean(), 4))

# Product of the regressor with the residual vector.
Xt_resid = np.matmul(X.T, residuals)
print("X' u =", np.round(Xt_resid, 4))

# Squared correlation of actual and fitted values versus the model's reported R^2.
corr_matrix = np.corrcoef(y, yhat)
r = corr_matrix[0, 1]
r2_from_corr = r**2

print("corr(y,yhat)^2:", r2_from_corr)
print("R^2 (statsmodels):", fwl_model_no_constant.rsquared)

-0.0
0.0
0.0
X' u = 0.0
corr(y,yhat)^2: 0.022891756621222583
R^2 (statsmodels): 0.022891756621222892


All of the properties from 1d still hold for the FWL model without a constant term.

# 2

In [100]:
# Unrestricted model: earnings on education and a quadratic in age, with an intercept.
y = d['earn']
X = sm.add_constant(d[['higrade', 'age', 'agesq']])

model_ur = sm.OLS(y, X).fit()
print(model_ur.summary())

                            OLS Regression Results                            
Dep. Variable:                   earn   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     102.5
Date:                Mon, 16 Feb 2026   Prob (F-statistic):           2.08e-65
Time:                        15:15:40   Log-Likelihood:                -91983.
No. Observations:               10940   AIC:                         1.840e+05
Df Residuals:                   10936   BIC:                         1.840e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1221.0501    186.247     -6.556      0.0

In [101]:
# Standard error of the regression: sqrt(SSR / residual degrees of freedom).
ser = np.sqrt(model_ur.ssr / model_ur.df_resid)
print("The standard error of the regression is", np.round(ser, 3))

# Read the coefficient standard errors from the fitted model rather than
# hand-copying them out of the summary table, so the printed text cannot drift
# out of sync if the sample or the specification changes.
se_education = model_ur.bse["higrade"]
se_age = model_ur.bse["age"]
print("The standard error of the education coefficient is", np.round(se_education, 3))
print("The standard error of the age coefficient is", np.round(se_age, 3))

The standard error of the regression is 1084.861
The standard error of the education coefficient is 6.606
The standard error of the age coefficient is 11.727


Each additional year of schooling completed is associated with an additional $105.73 in quarterly earnings, holding age and age squared constant.

In [102]:
# q   - numerator degrees of freedom for the F test (one regressor, higrade, is
#       dropped in the restricted model)
# df2 - denominator degrees of freedom: residual df of the unrestricted model
q, df2 = 1, model_ur.df_resid

In [104]:
# Restricted model: earnings on the age quadratic only (education excluded).
X_r = sm.add_constant(d[['age', 'agesq']])

# Take the dependent variable straight from `d` instead of the global `y`:
# `y` is rebound in several other cells (to residual series and to plain
# arrays), so relying on it makes this fit depend on cell execution order.
model_r = sm.OLS(d['earn'], X_r).fit()

In [105]:
# R^2 form of the F statistic comparing the unrestricted and restricted fits.
r2_gain_per_restriction = (model_ur.rsquared - model_r.rsquared) / q
unexplained_per_df = (1 - model_ur.rsquared) / df2
F_r2 = r2_gain_per_restriction / unexplained_per_df

# Upper-tail probability of the F(q, df2) distribution at the observed statistic.
p_r2 = stats.f.sf(F_r2, q, df2)

In [106]:
# Report both R^2 values, then the F statistic with its p-value.
r2_unrestricted = model_ur.rsquared
r2_restricted = model_r.rsquared
print("UR R^2:", r2_unrestricted)
print("R  R^2:", r2_restricted)

print(f"F (R^2): {F_r2:.6f}, p-value: {p_r2:.6g}")

UR R^2: 0.0273378726082113
R  R^2: 0.004550279886713748
F (R^2): 256.209332, p-value: 5.09967e-57
