In [None]:
# This file tests two different approaches for the regression models
# Note: Both modelling approaches in this file were not chosen as "best" approaches.
# Refer to the final_model file to view the chosen modelling approach and the results

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

import statsmodels.api as sm

# -----------------------------------------
# 1. DEFINE VARIABLES (LOCKED DATASET)
# -----------------------------------------

# Outcome column of `df`; it is divided by 100 before the logit transform below.
outcome_var = "LBW_Rate"
# Column supplying the per-row weights for the WLS fit and the ridge grid search.
weight_var = "Total_Births_2018_2022"

# Main-effect predictor columns. Interaction terms derived from some of them are
# appended further down to form `predictor_vars`.
base_predictors = [
    "combined_noise_mean_db",
    "homeownership_rate",
    "log_median_household_income",
    "black_prop",
    "asian_prop",
    "insurance_coverage_prop",
    "pca_cluster_social_ej"
]

# -----------------------------------------
# 2. CREATE INTERACTION TERMS
# -----------------------------------------

# NOTE(review): `df` is not created anywhere in the visible part of this
# notebook; it has to exist in the kernel before this cell runs.

# new column name -> the two existing columns whose product defines it
interaction_spec = {
    "income_x_black": ("log_median_household_income", "black_prop"),
    "income_x_insurance": ("log_median_household_income", "insurance_coverage_prop"),
    "ej_x_noise": ("pca_cluster_social_ej", "combined_noise_mean_db"),
}

for interaction_name, (left_col, right_col) in interaction_spec.items():
    df[interaction_name] = df[left_col] * df[right_col]

# Main effects first, then the interactions in the order declared above.
predictor_vars = base_predictors + list(interaction_spec)

# -----------------------------------------
# 3. OUTCOME TRANSFORMATION (LOGIT)
# -----------------------------------------

# Small offset added to numerator and denominator so the log stays finite
# when the proportion is exactly 0 or 1.
eps = 1e-6

# Percentage -> proportion.
df["LBW_prop"] = df[outcome_var].div(100)

# Proportion -> log-odds.
df["LBW_logit"] = df["LBW_prop"].pipe(
    lambda p: np.log((p + eps) / (1 - p + eps))
)

# -----------------------------------------
# 4. TRAIN / TEST SPLIT
# -----------------------------------------

# Feature matrix, logit-scale target and per-row weights.
X, y, w = df[predictor_vars], df["LBW_logit"], df[weight_var]

# A single call keeps the 80/20 partition aligned across X, y and w.
(
    X_train, X_test,
    y_train, y_test,
    w_train, w_test,
) = train_test_split(X, y, w, test_size=0.2, random_state=42)
# -----------------------------------------
# 5. MODEL 1 — WEIGHTED OLS (BASELINE)
# -----------------------------------------

# `add_constant` defaults to has_constant="skip": if a predictor happens to be
# constant within one split, no intercept column is added for that split and the
# train/test design matrices stop lining up. Forcing "add" guarantees both
# matrices always carry the same leading `const` column.
X_train_sm = sm.add_constant(X_train, has_constant="add")
X_test_sm = sm.add_constant(X_test, has_constant="add")

# Weighted least squares on the logit-scale outcome, weighted by `w_train`.
wls_model = sm.WLS(y_train, X_train_sm, weights=w_train).fit()

y_pred_wls = wls_model.predict(X_test_sm)

# Hold-out metrics are unweighted and on the logit scale.
wls_rmse = np.sqrt(mean_squared_error(y_test, y_pred_wls))
wls_r2 = r2_score(y_test, y_pred_wls)

# -----------------------------------------
# 6. HELPER FUNCTION
# -----------------------------------------

def eval_model(model, X_train, X_test, y_train, y_test, w_train=None):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place (any previous fit is overwritten).
    X_train, X_test : array-like
        Training and hold-out predictors.
    y_train, y_test : array-like
        Training and hold-out targets.
    w_train : array-like or None, default None
        Optional per-row training weights. For a plain estimator they are
        passed as ``sample_weight``; for a pipeline-like object (anything with a
        ``steps`` attribute) they are routed to the final step as
        ``<final_step_name>__sample_weight``, because ``Pipeline.fit`` does not
        accept a bare ``sample_weight`` keyword.

    Returns
    -------
    tuple
        ``(rmse, r2)`` computed on the test split; the metrics are unweighted.
    """
    if w_train is None:
        model.fit(X_train, y_train)
    elif hasattr(model, "steps"):
        # Pipeline: fit parameters must be addressed as "<step>__<param>".
        final_step_name = model.steps[-1][0]
        model.fit(
            X_train, y_train,
            **{f"{final_step_name}__sample_weight": w_train}
        )
    else:
        model.fit(X_train, y_train, sample_weight=w_train)

    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

# -----------------------------------------
# 7. RIDGE REGRESSION
# -----------------------------------------

ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge())
])

# 50 log-spaced penalties between 1e-3 and 1e3.
ridge_grid = {
    "ridge__alpha": np.logspace(-3, 3, 50)
}

ridge_cv = GridSearchCV(
    ridge_pipe, ridge_grid, cv=5,
    scoring="neg_root_mean_squared_error"
)

# The weights are routed to the "ridge" step of the pipeline.
ridge_cv.fit(X_train, y_train, ridge__sample_weight=w_train)
ridge_best = ridge_cv.best_estimator_

# GridSearchCV refits the best pipeline on the whole training set with the same
# fit parameters (refit=True is the default), so `ridge_best` is already the
# weighted model that was tuned. Score it directly: refitting it without the
# weights would evaluate a different model from the one the search selected.
y_pred_ridge = ridge_best.predict(X_test)

ridge_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
ridge_r2 = r2_score(y_test, y_pred_ridge)

# -----------------------------------------
# 8. LASSO REGRESSION
# -----------------------------------------

# Standardise, then fit an L1-penalised linear model.
lasso_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lasso", Lasso(max_iter=10000)),
])

# 50 log-spaced penalties between 1e-4 and 1e1.
lasso_grid = dict(lasso__alpha=np.logspace(-4, 1, num=50))

# 5-fold search scored by (negated) RMSE; `fit` returns the search object.
lasso_cv = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=lasso_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)
lasso_best = lasso_cv.best_estimator_

lasso_rmse, lasso_r2 = eval_model(
    model=lasso_best,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

# -----------------------------------------
# 9. ELASTIC NET
# -----------------------------------------

# Standardise, then fit a mixed L1/L2-penalised linear model.
enet_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("enet", ElasticNet(max_iter=10000)),
])

# 30 penalties x 9 mixing ratios = 270 candidate settings.
enet_grid = dict(
    enet__alpha=np.logspace(-4, 1, num=30),
    enet__l1_ratio=np.linspace(0.1, 0.9, num=9),
)

# 5-fold search scored by (negated) RMSE; `fit` returns the search object.
enet_cv = GridSearchCV(
    estimator=enet_pipe,
    param_grid=enet_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
).fit(X_train, y_train)
enet_best = enet_cv.best_estimator_

enet_rmse, enet_r2 = eval_model(
    model=enet_best,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

# -----------------------------------------
# 10. GRADIENT BOOSTING REGRESSOR
# -----------------------------------------

# Fixed hyper-parameters (no search); the seed makes the fit reproducible.
gbr = GradientBoostingRegressor(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=500,
)

# Fit on the raw (unscaled) predictors and score on the hold-out split.
gbr_rmse, gbr_r2 = eval_model(
    model=gbr,
    X_train=X_train, X_test=X_test,
    y_train=y_train, y_test=y_test,
)

# -----------------------------------------
# 11. FINAL MODEL COMPARISON
# -----------------------------------------

# One (label, RMSE, R2) record per fitted model.
model_scores = [
    ("Weighted OLS", wls_rmse, wls_r2),
    ("Ridge", ridge_rmse, ridge_r2),
    ("Lasso", lasso_rmse, lasso_r2),
    ("Elastic Net", enet_rmse, enet_r2),
    ("Gradient Boosting", gbr_rmse, gbr_r2),
]

# Lowest hold-out RMSE first.
results = pd.DataFrame(
    model_scores, columns=["Model", "RMSE", "R2"]
).sort_values("RMSE")

results


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score

# ------------------------------------------------
# 1. LOCKED VARIABLES (DO NOT CHANGE)
# ------------------------------------------------

# Outcome column of `df`; it is divided by 100 before the logit transform below.
outcome_var = "LBW_Rate"
# Declared for reference only: nothing in this cell reads it (models are unweighted).
weight_var = "Total_Births_2018_2022"   # NOT USED IN MODEL

# Predictor columns used as-is; this cell adds no interaction terms.
base_predictors = [
    "combined_noise_mean_db",
    "homeownership_rate",
    "log_median_household_income",
    "black_prop",
    "asian_prop",
    "insurance_coverage_prop",
    "pca_cluster_social_ej"
]

# ------------------------------------------------
# 2. LOGIT TRANSFORM OUTCOME (RATE → LOG-ODDS)
# ------------------------------------------------

# Small offset added to numerator and denominator so the log stays finite
# when the proportion is exactly 0 or 1.
eps = 1e-6

# Work on a copy so the columns added below do not touch the frame this cell
# received.
df = df.copy()

# Percentage -> proportion.
df["LBW_prop"] = df[outcome_var].div(100)

# Proportion -> log-odds.
lbw_numerator = df["LBW_prop"] + eps
lbw_denominator = 1 - df["LBW_prop"] + eps
df["LBW_logit"] = np.log(lbw_numerator / lbw_denominator)

# ------------------------------------------------
# 3. TRAIN / TEST SPLIT
# ------------------------------------------------

# Predictors and logit-scale target.
X, y = df[base_predictors], df["LBW_logit"]

# 80/20 split with a fixed seed.
(
    X_train, X_test,
    y_train, y_test,
) = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------------------------------------------
# 4. STANDARDIZE PREDICTORS ONLY
# ------------------------------------------------

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so no test information reaches the scaler.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------------------------------------
# 5. MODELS (LINEAR FAMILY ONLY)
# ------------------------------------------------

# Fixed hyper-parameters; no tuning is done in this cell.
models = {
    "OLS": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01),
    "Elastic Net": ElasticNet(alpha=0.01, l1_ratio=0.5)
}

# ------------------------------------------------
# 6. FIT → BACK-TRANSFORM → EVALUATE
# ------------------------------------------------


def _inv_logit(log_odds):
    """Map log-odds to proportions.

    Written as 1 / (1 + exp(-z)) rather than exp(z) / (1 + exp(z)): the latter
    evaluates to inf / inf = NaN once exp(z) overflows, whereas this form
    saturates at 1.0.
    """
    return 1.0 / (1.0 + np.exp(-log_odds))


# The observed test outcomes do not depend on the model, so back-transform them
# once instead of on every loop iteration.
y_test_prop = _inv_logit(y_test)
y_test_rate = y_test_prop * 100

results = []

for name, model in models.items():
    model.fit(X_train_scaled, y_train)

    # Predict on logit scale
    y_pred_logit = model.predict(X_test_scaled)

    # Back-transform: logit → proportion → percentage
    y_pred_prop = _inv_logit(y_pred_logit)
    y_pred_rate = y_pred_prop * 100

    # Metrics on the percentage scale
    rmse = np.sqrt(np.mean((y_test_rate - y_pred_rate) ** 2))
    r2 = r2_score(y_test_rate, y_pred_rate)

    results.append({
        "Model": name,
        "RMSE": rmse,
        "R2": r2
    })

# Lowest hold-out RMSE first.
results_df = pd.DataFrame(results).sort_values("RMSE")
results_df