In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

import statsmodels.api as sm


In [3]:
# Load the modelling table. NOTE(review): this is an absolute, machine-specific
# path; it must be adjusted before the notebook can run anywhere else.
df = pd.read_csv(
    r"C:\Users\Elias\Final Project\Cleaned output data files\final_model_dataset.csv"
)

# Response column and the predictor columns shared by every model below.
outcome_var = "LBW_Rate"
predictor_vars = [
    "combined_noise_mean_db",
    "homeownership_rate",
    "log_median_household_income",
    "black_prop",
    "asian_prop",
    "insurance_coverage_prop",
    "pca_cluster_social_ej",
]

# Design matrix and target.
X, y = df[predictor_vars], df[outcome_var]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Baseline: unregularised OLS fitted on the training split only.
#
# has_constant="add" forces the intercept column onto both splits. With the
# default ("skip"), statsmodels silently omits the intercept whenever a column
# already looks constant -- which can happen by chance in a small test split --
# leaving the train and test design matrices with different columns and
# breaking predict().
X_train_sm = sm.add_constant(X_train, has_constant="add")
X_test_sm = sm.add_constant(X_test, has_constant="add")

ols_model = sm.OLS(y_train, X_train_sm).fit()

# Out-of-sample predictions and the same two metrics reported for every model.
y_pred_ols = ols_model.predict(X_test_sm)

ols_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ols))
ols_r2 = r2_score(y_test, y_pred_ols)


In [5]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    The estimator is fitted in place (any previous fit is overwritten), so the
    caller can inspect the fitted object afterwards.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
    X_train, y_train : training predictors and target
    X_test, y_test : held-out predictors and target

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # RMSE is taken as the square root of the mean squared error.
    test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
    # Coefficient of determination (R^2) on the held-out data.
    test_r2 = r2_score(y_test, test_predictions)

    return test_rmse, test_r2

In [6]:
# Ridge regression. Scaling lives inside the pipeline so that, during
# cross-validation, the scaler is fitted on each fold's training rows only.
ridge_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("ridge", Ridge()),
])

# 50 log-spaced penalty strengths between 1e-3 and 1e3.
ridge_param_grid = {"ridge__alpha": np.logspace(-3, 3, num=50)}

# 5-fold grid search on the training split. The scorer is the negated RMSE,
# so the highest score corresponds to the lowest error.
ridge_cv = GridSearchCV(
    estimator=ridge_pipe,
    param_grid=ridge_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
ridge_cv.fit(X_train, y_train)

# Pipeline carrying the selected alpha.
ridge_best = ridge_cv.best_estimator_

# Held-out metrics; evaluate_model fits the pipeline on X_train again before
# predicting on X_test.
ridge_rmse, ridge_r2 = evaluate_model(ridge_best, X_train, X_test, y_train, y_test)


In [7]:
# Lasso regression. Scaling lives inside the pipeline so that, during
# cross-validation, the scaler is fitted on each fold's training rows only.
# max_iter is raised to 10000 to give the solver more iterations to converge.
lasso_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lasso", Lasso(max_iter=10000)),
])

# 50 log-spaced penalty strengths between 1e-4 and 1e1.
lasso_param_grid = {"lasso__alpha": np.logspace(-4, 1, num=50)}

# 5-fold grid search on the training split. The scorer is the negated RMSE,
# so the highest score corresponds to the lowest error.
lasso_cv = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=lasso_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
lasso_cv.fit(X_train, y_train)

# Pipeline carrying the selected alpha.
lasso_best = lasso_cv.best_estimator_

# Held-out metrics; evaluate_model fits the pipeline on X_train again before
# predicting on X_test.
lasso_rmse, lasso_r2 = evaluate_model(lasso_best, X_train, X_test, y_train, y_test)


In [8]:
# Elastic Net regression. Scaling lives inside the pipeline so that, during
# cross-validation, the scaler is fitted on each fold's training rows only.
# max_iter is raised to 10000 to give the solver more iterations to converge.
enet_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("enet", ElasticNet(max_iter=10000)),
])

# Two-dimensional grid: 30 log-spaced penalty strengths (1e-4 .. 1e1) crossed
# with 9 evenly spaced L1/L2 mixing ratios (0.1 .. 0.9).
enet_param_grid = {
    "enet__alpha": np.logspace(-4, 1, num=30),
    "enet__l1_ratio": np.linspace(0.1, 0.9, num=9),
}

# 5-fold grid search on the training split. The scorer is the negated RMSE,
# so the highest score corresponds to the lowest error.
enet_cv = GridSearchCV(
    estimator=enet_pipe,
    param_grid=enet_param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
)
enet_cv.fit(X_train, y_train)

# Pipeline carrying the selected alpha / l1_ratio pair.
enet_best = enet_cv.best_estimator_

# Held-out metrics; evaluate_model fits the pipeline on X_train again before
# predicting on X_test.
enet_rmse, enet_r2 = evaluate_model(enet_best, X_train, X_test, y_train, y_test)


In [9]:
# Non-linear benchmark: a random forest with fixed (untuned) hyper-parameters.
# Trees are grown without a depth limit, but each leaf must hold at least five
# training rows; the seed makes the fitted forest reproducible.
rf_model = RandomForestRegressor(
    n_estimators=500,
    min_samples_leaf=5,
    max_depth=None,
    random_state=42,
)

# Fit on the training split and score on the held-out split.
rf_rmse, rf_r2 = evaluate_model(rf_model, X_train, X_test, y_train, y_test)


In [10]:
# One row per model: (label, test RMSE, test R2). Building the frame row-wise
# keeps each model's metrics next to its name.
results_df = pd.DataFrame(
    [
        ("OLS", ols_rmse, ols_r2),
        ("Ridge", ridge_rmse, ridge_r2),
        ("Lasso", lasso_rmse, lasso_r2),
        ("Elastic Net", enet_rmse, enet_r2),
        ("Random Forest", rf_rmse, rf_r2),
    ],
    columns=["Model", "RMSE", "R2"],
)

# Display the comparison ordered from lowest to highest test RMSE
# (results_df itself is left in insertion order).
results_df.sort_values(by="RMSE")


Unnamed: 0,Model,RMSE,R2
0,OLS,5.273252,0.079193
2,Lasso,5.282996,0.075787
3,Elastic Net,5.376365,0.04283
1,Ridge,5.377342,0.042482
4,Random Forest,5.444526,0.018407


In [11]:
# Side-by-side coefficients of the three penalised linear models. Each model
# sits behind a StandardScaler step, so the coefficients are expressed per
# standard deviation of the corresponding predictor.
coef_df = pd.DataFrame({"feature": predictor_vars}).assign(
    ridge_coef=ridge_best.named_steps["ridge"].coef_,
    lasso_coef=lasso_best.named_steps["lasso"].coef_,
    enet_coef=enet_best.named_steps["enet"].coef_,
)

coef_df


Unnamed: 0,feature,ridge_coef,lasso_coef,enet_coef
0,combined_noise_mean_db,-0.515103,-0.589827,-0.513755
1,homeownership_rate,-0.322099,-0.044793,-0.307676
2,log_median_household_income,-0.487099,-1.03477,-0.503086
3,black_prop,0.481417,0.613643,0.485863
4,asian_prop,-0.065378,0.016753,-0.042761
5,insurance_coverage_prop,0.608923,0.809892,0.613572
6,pca_cluster_social_ej,-0.651166,-1.123715,-0.666149


In [14]:
# ------------------------------------
# FINAL INFERENCE MODEL (ROBUST SEs)
# ------------------------------------
# `sm` (statsmodels.api) comes from the imports cell at the top of the
# notebook; it is not re-imported here.
#
# This model is for inference rather than prediction, so it is fitted on every
# row of `df` instead of the training split only.

X_full = sm.add_constant(df[predictor_vars])
# Use the shared `outcome_var` name rather than repeating the column literal,
# so the response column is defined in exactly one place.
y_full = df[outcome_var]

# HC3 requests a heteroskedasticity-robust covariance estimate: the
# coefficients are the ordinary OLS estimates, only the standard errors (and
# the test statistics derived from them) change.
ols_robust = sm.OLS(y_full, X_full).fit(cov_type="HC3")

print(ols_robust.summary())


                            OLS Regression Results                            
Dep. Variable:               LBW_Rate   R-squared:                       0.190
Model:                            OLS   Adj. R-squared:                  0.166
Method:                 Least Squares   F-statistic:                     3.238
Date:                Tue, 06 Jan 2026   Prob (F-statistic):            0.00268
Time:                        18:43:34   Log-Likelihood:                -657.17
No. Observations:                 243   AIC:                             1330.
Df Residuals:                     235   BIC:                             1358.
Df Model:                           7                                         
Covariance Type:                  HC3                                         
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             