In [9]:
from src.utils import load_dataframe
from src.constants import DATA_DIR

# Location of the cleaned, encoded personal-property dataset inside DATA_DIR.
encoded_csv_path = DATA_DIR / "data_cleaned_PersonalProperty_encoded.csv"

# Read the file with the project loader and preview the first rows.
df = load_dataframe(encoded_csv_path)
df.head()

Data loaded successfully.


Unnamed: 0,student_id,name,class,gpa,is_greek,is_on_campus,distance_to_campus,is_female,is_sprinklered,claim_id,amount,risk_tier,holdout,Business,Humanities,Other,Science
0,1,"Saiz, Leslie",3,1.203913,0,0,0.0,0.0,1,0,0.0,2,0,0,1,0,0
1,2,"Reddy, Dawna",3,0.609616,0,0,0.0,1.0,1,0,0.0,2,0,0,0,0,1
2,3,"Rivera, Johnna",3,3.073385,0,1,0.223942,1.0,0,0,0.0,2,0,1,0,0,0
3,4,"al-Jamil, Umaira",4,3.224736,0,0,0.0,0.0,0,0,0.0,3,0,1,0,0,0
4,5,"el-Mohammed, Qutb",2,2.687851,0,0,0.0,1.0,0,0,0.0,1,0,1,0,0,0


In [10]:
# Columns excluded from both splits before modelling: the split flag itself
# plus the other fields that are not passed on to the models.
non_feature_columns = ["holdout", "student_id", "name", "claim_id", "is_female"]

# Rows with holdout == 0 form the training set, rows with holdout == 1 the
# test set; the same columns are removed from each.
holdout_flag = df["holdout"]
train_df = df.loc[holdout_flag == 0].drop(columns=non_feature_columns)
test_df = df.loc[holdout_flag == 1].drop(columns=non_feature_columns)

print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

Train set shape: (8070, 12)
Test set shape: (1981, 12)


In [11]:
# Both targets are removed from the feature matrices.
target_columns = ["risk_tier", "amount"]
X_train = train_df.drop(columns=target_columns).to_numpy()
X_test = test_df.drop(columns=target_columns).to_numpy()

# Each target is stored as an (n_samples, 1) column vector.
y_train_risk_tier = train_df["risk_tier"].to_numpy()[:, None]
y_test_risk_tier = test_df["risk_tier"].to_numpy()[:, None]

y_train_amount = train_df["amount"].to_numpy()[:, None]
y_test_amount = test_df["amount"].to_numpy()[:, None]

In [14]:
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Baseline: a single Poisson GLM fitted on the claim amount.
model = PoissonRegressor(alpha=1.0, max_iter=100)
model.fit(X_train, y_train_amount.ravel())

# Test-set predictions stay available as y_pred.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error on each split.
train_mse = mean_squared_error(y_train_amount, model.predict(X_train))
test_mse = mean_squared_error(y_test_amount, y_pred)
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

print(f"Train RMSE With Single Model Poisson (Amount): {train_rmse}")
print(f"Test RMSE Single Model Poisson (Amount): {test_rmse}")


Train RMSE With Single Model Poisson (Amount): 396.58666351014165
Test RMSE Single Model Poisson (Amount): 353.14471216676355


In [18]:
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import PoissonRegressor

# SEED is imported in this cell so that it does not rely on a later cell
# having been executed first; it is used for random_state below.
from src.constants import SEED

# Bagging ensemble of 100 Poisson GLMs. Row/feature sampling options are left
# at the BaggingRegressor defaults.
base_glm = PoissonRegressor(max_iter=100, alpha=1.0)
bagged = BaggingRegressor(
    estimator=base_glm,
    n_estimators=100,
    random_state=SEED,
)

# The targets are stored as (n, 1) arrays, so flatten before fitting.
bagged.fit(X_train, y_train_amount.ravel())
y_pred_bag = bagged.predict(X_test)

# RMSE (square root of the mean squared error) on each split.
train_rmse = np.sqrt(
    mean_squared_error(y_train_amount, bagged.predict(X_train))
)
test_rmse = np.sqrt(mean_squared_error(y_test_amount, y_pred_bag))

print(f"Train RMSE With Bagged Model Poisson (Amount): {train_rmse}")
print(f"Test RMSE With Bagged Model Poisson (Amount): {test_rmse}")

Train RMSE With Bagged Model Poisson (Amount): 396.6212933795463
Test RMSE With Bagged Model Poisson (Amount): 353.11278054428055


In [None]:
"""
"""
#test model
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
from src.constants import SEED

# Sampling configuration: every ensemble member is trained on a 70% draw of
# the rows and a 70% draw of the features, both sampled with replacement.
subsampling_options = dict(
    max_samples=0.7,
    max_features=0.7,
    bootstrap=True,
    bootstrap_features=True,
)

# 100 Poisson GLMs wrapped in a bagging ensemble.
base_glm = PoissonRegressor(alpha=1.0, max_iter=100)
bagged = BaggingRegressor(
    estimator=base_glm,
    n_estimators=100,
    random_state=SEED,
    **subsampling_options,
)

# Targets are (n, 1) arrays; flatten before fitting.
bagged.fit(X_train, y_train_amount.ravel())

# Test-set predictions stay available as y_pred_bag.
y_pred_bag = bagged.predict(X_test)
train_mse = mean_squared_error(y_train_amount, bagged.predict(X_train))
test_mse = mean_squared_error(y_test_amount, y_pred_bag)

# RMSE is the square root of the mean squared error.
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

print(f"Train RMSE (Bagged Poisson): {train_rmse:.4f}")
print(f"Test  RMSE (Bagged Poisson): {test_rmse:.4f}")


Train RMSE (Bagged Poisson): 400.3021
Test  RMSE (Bagged Poisson): 353.3140


In [16]:
import numpy as np
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import mean_squared_error


# Tweedie variance power used by the base GLM:
#   power=1 is Poisson (y >= 0 allowed); 1 < power < 2 is compound Poisson-Gamma.
# Kept in one variable so the printed labels below always match the model.
tweedie_power = 1.5
base_glm = TweedieRegressor(power=tweedie_power, alpha=1.0, max_iter=100)

# Bagging ensemble of 100 Tweedie GLMs with default sampling options.
bagged = BaggingRegressor(
    estimator=base_glm,
    n_estimators=100,
    random_state=SEED,
)

# Fit on the training data (zeros allowed); targets are (n, 1) arrays, so
# flatten before fitting.
bagged.fit(X_train, y_train_amount.ravel())

y_train_pred = bagged.predict(X_train)
y_test_pred = bagged.predict(X_test)

# RMSE (square root of the mean squared error) on each split.
train_rmse = np.sqrt(mean_squared_error(y_train_amount, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test_amount, y_test_pred))

# The labels previously read "power=1" although the model uses power=1.5.
print(f"Train RMSE (Bagged Tweedie, power={tweedie_power}): {train_rmse}")
print(f"Test  RMSE (Bagged Tweedie, power={tweedie_power}): {test_rmse}")


Train RMSE (Bagged Tweedie, power=1): 398.9775
Test  RMSE (Bagged Tweedie, power=1): 352.7212


In [19]:
import numpy as np  # for np.sqrt()
from sklearn.ensemble import BaggingRegressor  # bagging meta‐estimator
from sklearn.linear_model import TweedieRegressor, LinearRegression
from sklearn.metrics import mean_squared_error  # for MSE / RMSE

# Base estimators to compare, keyed by the label used in the report line.
base_estimators = {
    "Tweedie p=1.5": TweedieRegressor(power=1.5, alpha=1.0, max_iter=100),
    "Gaussian GLM": TweedieRegressor(power=0.0, alpha=1.0, max_iter=100),
    "Bagged OLS": LinearRegression(),
}

# Wrap every base estimator in an identically configured bagging ensemble.
models = {
    label: BaggingRegressor(
        estimator=base_estimator,
        n_estimators=100,
        random_state=SEED,
    )
    for label, base_estimator in base_estimators.items()
}

# Fit each ensemble on the training arrays and report RMSE on both splits.
for name, model in models.items():
    # Targets are stored as (n_samples, 1); flatten to (n_samples,) for fit.
    model.fit(X_train, y_train_amount.ravel())

    y_tr_pred = model.predict(X_train)
    y_te_pred = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    train_rmse = np.sqrt(mean_squared_error(y_train_amount, y_tr_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test_amount, y_te_pred))

    print(f"{name:15s}  Train RMSE: {train_rmse:.4f}  Test RMSE: {test_rmse:.4f}")


Tweedie p=1.5    Train RMSE: 398.9775  Test RMSE: 352.7212
Gaussian GLM     Train RMSE: 405.2127  Test RMSE: 357.3043
Bagged OLS       Train RMSE: 398.4879  Test RMSE: 352.8624


In [None]:
# GradientBoostingRegressor is not imported by any cell visible above this
# one, so import it here to keep the cell runnable on a fresh kernel.
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regressor (100 trees) fitted on the risk_tier target.
gb_risk_tier = GradientBoostingRegressor(n_estimators=100, random_state=SEED)
# Targets are stored as (n, 1) arrays; flatten before fitting.
gb_risk_tier.fit(X_train, y_train_risk_tier.ravel())
y_pred_risk_tier = gb_risk_tier.predict(X_test)

# RMSE (square root of the mean squared error) on each split.
train_rmse_risk_tier = np.sqrt(
    mean_squared_error(y_train_risk_tier, gb_risk_tier.predict(X_train))
)
test_rmse_risk_tier = np.sqrt(mean_squared_error(y_test_risk_tier, y_pred_risk_tier))

# The values are RMSE, not MSE, so the labels say so.
print(f"Train RMSE (Risk Tier): {train_rmse_risk_tier}")
print(f"Test RMSE (Risk Tier): {test_rmse_risk_tier}")