In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Location of the cleaned modelling dataset.
# NOTE(review): this is an absolute, machine-specific path; edit DATA_PATH
# (ideally to a path relative to the project) to run the notebook elsewhere.
DATA_PATH = r"C:\Users\Demoted\Downloads\datasci\Data_Science_FP\Cleaned output data files\final_model_dataset.csv"

# Outcome and predictor columns used by every model in this notebook.
TARGET_COL = "LBW_Rate"
FEATURE_COLS = [
    "combined_noise_mean_db",
    "race_ethnicity_minority_status",
    "black_prop",
    "insurance_coverage_prop",
    "late_prenatal_care_prop",
    "maternal_age_risk_prop",
    "low_education_prop",
]

df = pd.read_csv(DATA_PATH)

# Fail fast on missing values: statsmodels OLS performs no NaN checking by
# default (missing="none"), so a NaN here would otherwise surface later as an
# obscure error or as NaN estimates.
missing_counts = df[[TARGET_COL, *FEATURE_COLS]].isna().sum()
assert not missing_counts.any(), (
    f"Missing values in model columns:\n{missing_counts[missing_counts > 0]}"
)

y = df[TARGET_COL]
X = df[FEATURE_COLS]

df.head()

Unnamed: 0,ZIP_Code_of_Residence,Total_Births_2018_2022,Total_LBW_Count_2018_2022,LBW_Rate,combined_noise_mean_db,race_ethnicity_minority_status,black_prop,insurance_coverage_prop,late_prenatal_care_prop,maternal_age_risk_prop,low_education_prop
0,90001,3936.0,308.0,7.825203,56.810581,0.9909,0.074499,0.999775,0.188643,0.178481,0.694106
1,90002,3839.5,324.5,8.451621,57.588403,0.9926,0.157321,0.999849,0.177367,0.169684,0.681339
2,90003,5583.5,439.5,7.871407,60.222628,0.9921,0.164641,0.99976,0.198263,0.170413,0.710844
3,90004,2642.0,211.0,7.986374,56.035845,0.7972,0.044516,0.998844,0.194739,0.331945,0.42922
4,90005,1537.0,140.0,9.108653,54.012587,0.9054,0.053133,0.997378,0.165908,0.292453,0.477554


In [3]:
# Hold out 20% of the rows for predictive evaluation; the remaining 80% are
# used to fit the models. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


In [4]:
# OLS baseline for out-of-sample prediction.
#
# has_constant="add" forces an intercept column onto both design matrices.
# With the default ("skip"), add_constant silently omits the intercept when a
# split already contains a zero-variance column, which would leave the train
# and test matrices with different columns and break (or misalign) predict().
X_train_const = sm.add_constant(X_train, has_constant="add")
X_test_const = sm.add_constant(X_test, has_constant="add")

# Fit on the training split only.
ols_pred_model = sm.OLS(y_train, X_train_const).fit()

# Predict LBW_Rate on the held-out test split.
y_pred_ols = ols_pred_model.predict(X_test_const)

# Test-set error metrics, reused in the model comparison table.
ols_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ols))
ols_r2 = r2_score(y_test, y_pred_ols)


In [5]:
# Regularised linear models, compared against the OLS baseline on the test split.
# Each estimator is wrapped in a pipeline with a StandardScaler because the
# penalty acts on coefficient magnitude, which depends on feature scale.
models = {}
models["Ridge"] = Ridge(alpha=1.0)
models["Lasso"] = Lasso(alpha=0.01)
models["Elastic Net"] = ElasticNet(alpha=0.01, l1_ratio=0.5)

# Seed the comparison table with the OLS test-set metrics.
results = [{"Model": "OLS", "RMSE": ols_rmse, "R2": ols_r2}]

# Fit every penalised model on the training split and score it on the test split.
for name, model in models.items():
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", model)])
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    results.append({"Model": name, "RMSE": np.sqrt(mse), "R2": r2_score(y_test, y_pred)})

# Lowest test RMSE first.
results_df = pd.DataFrame(results).sort_values("RMSE")
results_df


Unnamed: 0,Model,RMSE,R2
2,Lasso,4.287116,0.391386
0,OLS,4.296263,0.388786
3,Elastic Net,4.300765,0.387504
1,Ridge,4.30539,0.386186


In [6]:
# Inferential OLS model (the one reported in the Results section):
#   - fitted on the full dataset rather than the training split
#   - predictors are left unstandardized
#   - standard errors use the HC3 robust covariance estimator
X_full_const = sm.add_constant(X)

final_ols_model = sm.OLS(endog=y, exog=X_full_const).fit(cov_type="HC3")
final_ols_model.summary()


0,1,2,3
Dep. Variable:,LBW_Rate,R-squared:,0.358
Model:,OLS,Adj. R-squared:,0.338
Method:,Least Squares,F-statistic:,7.821
Date:,"Thu, 08 Jan 2026",Prob (F-statistic):,1.64e-08
Time:,06:21:12,Log-Likelihood:,-629.04
No. Observations:,243,AIC:,1274.0
Df Residuals:,235,BIC:,1302.0
Df Model:,7,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6278,11.251,0.145,0.885,-20.424,23.680
combined_noise_mean_db,-0.0202,0.138,-0.147,0.883,-0.290,0.249
race_ethnicity_minority_status,-6.5651,1.631,-4.026,0.000,-9.761,-3.369
black_prop,5.8799,2.218,2.650,0.008,1.532,10.228
insurance_coverage_prop,7.2315,9.605,0.753,0.452,-11.594,26.057
late_prenatal_care_prop,46.8570,9.511,4.926,0.000,28.215,65.499
maternal_age_risk_prop,0.2699,5.098,0.053,0.958,-9.722,10.262
low_education_prop,-2.2839,3.293,-0.693,0.488,-8.739,4.171

0,1,2,3
Omnibus:,25.888,Durbin-Watson:,1.209
Prob(Omnibus):,0.0,Jarque-Bera (JB):,123.879
Skew:,0.081,Prob(JB):,1.2600000000000001e-27
Kurtosis:,6.494,Cond. No.,3070.0
