In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# -------------------------------------------------
# 1) Load final cleaned dataset
# -------------------------------------------------
# -------------------------------------------------
# 2) Prepare ln(earnings)
# -------------------------------------------------
# One pipeline: read the CSV, coerce JobIncomeCategory to numeric (entries that
# cannot be parsed become NaN), keep strictly positive incomes, add the log.
# NaN never satisfies "> 0", so the positivity filter also removes missing values.
# NOTE(review): the column is named "...Category" but is logged as a numeric
# amount here -- confirm the values are income levels, not category codes.
df = (
    pd.read_csv("NGS2020_selected_renamed.csv")
    .assign(
        JobIncomeCategory=lambda raw: pd.to_numeric(
            raw["JobIncomeCategory"], errors="coerce"
        )
    )
    .loc[lambda frame: frame["JobIncomeCategory"] > 0]
    .assign(LnEarnings=lambda frame: np.log(frame["JobIncomeCategory"]))
)

# -------------------------------------------------
# 3) Main variables (funding dummies)
#    Baseline omitted: OtherFunding
# -------------------------------------------------
# Full list of regressors the specification asks for.
requested_funding_vars = [
    "GovLoan", "RESP", "GovGrant", "NonGovGrant", "Scholarship",
    "Savings", "FamilySupport", "BankLoan", "CreditCard", "EmployerFunding"
]

# Only columns actually present in df can enter the model. Report any that are
# absent so a changed specification is visible in the cell output instead of
# being dropped silently.
funding_vars = [v for v in requested_funding_vars if v in df.columns]
missing_funding_vars = [v for v in requested_funding_vars if v not in df.columns]
if missing_funding_vars:
    print(
        "WARNING: funding columns not found in the data and excluded from the model:",
        missing_funding_vars,
    )

# Build X (funding dummies plus an intercept column) and y
X = sm.add_constant(df[funding_vars])
y = df["LnEarnings"]

# Drop any row with a missing value in y or X, so both share the same sample.
reg_data = pd.concat([y, X], axis=1).dropna()
y_clean = reg_data["LnEarnings"]
X_clean = reg_data.drop(columns=["LnEarnings"])

# -------------------------------------------------
# 4) OLS without controls (standard SEs)
# -------------------------------------------------
model_ols = sm.OLS(y_clean, X_clean).fit()

# -------------------------------------------------
# 5) Robustness check: Robust SEs (HC1)
# -------------------------------------------------
# Same y and X as above; only the covariance estimator is changed.
# NOTE(review): with a robust covariance the summary may label the test
# statistics as z rather than t -- check the printed header before interpreting.
model_robust = sm.OLS(y_clean, X_clean).fit(cov_type="HC1")

# -------------------------------------------------
# 6) Print results
# -------------------------------------------------
for banner, fitted in (
    ("\n========= OLS (Standard SEs) =========", model_ols),
    ("\n========= OLS (Robust SEs: HC1) =========", model_robust),
):
    print(banner)
    print(fitted.summary())

# -------------------------------------------------
# 7) Quick side-by-side comparison table
# -------------------------------------------------
# One frame per fit (coefficient, standard error, test statistic, p-value),
# placed next to each other; both are indexed by the regressor names.
ols_columns = pd.DataFrame({
    "coef_OLS": model_ols.params,
    "se_OLS": model_ols.bse,
    "t_OLS": model_ols.tvalues,
    "p_OLS": model_ols.pvalues,
})
robust_columns = pd.DataFrame({
    "coef_Robust": model_robust.params,
    "se_Robust": model_robust.bse,
    "t_Robust": model_robust.tvalues,
    "p_Robust": model_robust.pvalues,
})
compare = pd.concat([ols_columns, robust_columns], axis=1)

print("\n========= Coefficient Comparison (OLS vs Robust) =========")
print(compare)

# nobs is stored as a float; cast for a clean integer count.
print("\nN used in regression:", int(model_ols.nobs))



                            OLS Regression Results                            
Dep. Variable:             LnEarnings   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     21.90
Date:                Sun, 14 Dec 2025   Prob (F-statistic):           4.22e-41
Time:                        22:42:53   Log-Likelihood:                -11045.
No. Observations:               12672   AIC:                         2.211e+04
Df Residuals:                   12661   BIC:                         2.219e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              10.9405      0.007   1

In [6]:
# -------------------------------------------------
# SUMMARY STATISTICS
# -------------------------------------------------
# Mean and standard deviation of the outcome and each funding dummy, computed
# on reg_data (the rows used by the regressions in the earlier cell).

requested_sumstats_vars = [
    "LnEarnings",
    "GovLoan", "RESP", "GovGrant", "NonGovGrant", "Scholarship",
    "Savings", "FamilySupport", "BankLoan", "CreditCard", "EmployerFunding"
]

# The regression cell keeps only funding columns that exist in the data, so
# reg_data may lack some of these names. Selecting an absent label would raise
# KeyError, so restrict to what is present and report anything skipped.
sumstats_vars = [v for v in requested_sumstats_vars if v in reg_data.columns]
skipped_sumstats_vars = [v for v in requested_sumstats_vars if v not in reg_data.columns]
if skipped_sumstats_vars:
    print(
        "WARNING: variables not in the regression sample, omitted from the summary:",
        skipped_sumstats_vars,
    )

# Rows = variables, columns = statistics.
summary_stats = reg_data[sumstats_vars].agg(
    ["mean", "std"]
).T

print("\n========= SUMMARY STATISTICS (FINAL SAMPLE) =========")
print(summary_stats)




                      mean       std
LnEarnings       10.961986  0.583502
GovLoan           0.215278  0.411031
RESP              0.095249  0.293571
GovGrant          0.058633  0.234946
NonGovGrant       0.017598  0.131490
Scholarship       0.100616  0.300831
Savings           0.259864  0.438577
FamilySupport     0.158617  0.365333
BankLoan          0.065420  0.247275
CreditCard        0.015388  0.123096
EmployerFunding   0.019807  0.139344
