In [1]:
import numpy as np
import pandas as pd
import ssl
import os
import yaml
import shutil

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

from local_glm_boost import LocalGLMBooster
from local_glm_boost.utils.tuning import tune_n_estimators
from local_glm_boost.utils.logger import LocalGLMBoostLogger

config_name = "real_data_study"

# Set up output folder, configuration file, run_id and logger.
# NOTE(review): the default is an absolute path on one specific machine. Set
# the LOCAL_GLM_BOOST_SCRIPT_DIR environment variable to point at the
# project's `scripts` directory when running elsewhere.
script_dir = os.environ.get(
    "LOCAL_GLM_BOOST_SCRIPT_DIR",
    "/home/heza7322/PycharmProjects/local-glm-boost/scripts",
)
folder_path = os.path.join(script_dir, "../data/results/")
config_path = os.path.join(script_dir, f"{config_name}.yaml")
with open(config_path, "r") as f:
    # NOTE(review): FullLoader can construct more than plain YAML types. If the
    # config only holds scalars/lists/dicts, yaml.safe_load is the safer choice.
    config = yaml.load(f, Loader=yaml.FullLoader)

# Pick the next run index: one more than the largest "<prefix>_<int>" entry
# already present in the results folder, or 0 when there is none.
existing_run_ids = []
if os.path.isdir(folder_path):
    for folder_name in os.listdir(folder_path):
        try:
            existing_run_ids.append(int(folder_name.split("_")[1]))
        except (IndexError, ValueError):
            # Entry is not named like "run_<int>" (e.g. ".gitkeep"); skip it
            # instead of aborting the whole notebook.
            continue
run_id = max(existing_run_ids + [0]) + 1 if existing_run_ids else 0

# os.makedirs raises if the run folder already exists, so an earlier run's
# results are never silently overwritten.
output_path = os.path.join(folder_path, f"run_{run_id}")
os.makedirs(output_path)

# Put the config in the output folder so the run can be reproduced later
shutil.copyfile(config_path, os.path.join(output_path, "config.yaml"))

# Set up logger
logger = LocalGLMBoostLogger(
    verbose=2,
    output_path=output_path,
)
logger.append_format_level(f"run_{run_id}")

# Pull every run setting out of the parsed config into a notebook-level name.
# Every key is looked up by subscript, so a missing key raises KeyError.
# The three groups below exist only to keep the lines short.
logger.log("Loading configuration")

n, features_to_use, target, weights, distribution = (
    config[key]
    for key in ("n", "features_to_use", "target", "weights", "distribution")
)

(
    n_estimators_max,
    learning_rate,
    min_samples_split,
    min_samples_leaf,
    max_depth,
    glm_init,
) = (
    config[key]
    for key in (
        "n_estimators_max",
        "learning_rate",
        "min_samples_split",
        "min_samples_leaf",
        "max_depth",
        "glm_init",
    )
)

random_seed, n_splits, test_size, parallel, stratified, n_jobs = (
    config[key]
    for key in (
        "random_seed",
        "n_splits",
        "test_size",
        "parallel",
        "stratified",
        "n_jobs",
    )
)

# Load and preprocess data
logger.log("Loading data")
# NOTE(review): this turns off TLS certificate verification for every HTTPS
# request made by this process, not just the two downloads below. Prefer
# fixing the local certificate store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context
df_num = fetch_openml(data_id=41214, as_frame=True).data
df_sev = fetch_openml(data_id=41215, as_frame=True).data

# Total claim amount per policy, joined onto the first table by policy id.
df_sev_tot = df_sev.groupby('IDpol')['ClaimAmount'].sum()
df = df_num.merge(df_sev_tot, left_on='IDpol', right_index=True, how='left')

# Rows without a match in the severity table come out of the left join with a
# NaN ClaimAmount; both the claim count and the amount are set to zero there.
no_severity = df['ClaimAmount'].isna()
df.loc[no_severity, 'ClaimNb'] = 0
df.loc[no_severity, 'ClaimAmount'] = 0

# Keep rows with at most 5 claims. The explicit copy makes the column
# assignments below write to an independent frame instead of a filtered
# selection (which triggers pandas' SettingWithCopy warnings).
df = df.loc[df['ClaimNb'] <= 5].copy()

df['Exposure'] = df['Exposure'].clip(0, 1)
# 65 == ord('A'): maps 'A' -> 0, 'B' -> 1, ...
df['Area'] = df['Area'].apply(lambda x: ord(x) - 65)
df['VehGas'] = df['VehGas'].apply(lambda x: 1 if x == 'Regular' else 0)

# Columns used directly as numeric inputs, filtered by the configured feature
# list while keeping the order given here.
continous_features = [
    'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density', 'Area', 'VehGas',
]
features = [name for name in continous_features if name in features_to_use]

# For each one-hot encoded categorical, parallel_fit records the positions its
# dummy columns occupy inside `features`.
parallel_fit = []
for feature in ('VehBrand', 'Region'):
    if feature not in features_to_use:
        continue
    dummies = pd.get_dummies(df[feature], prefix=feature)
    df = pd.concat([df, dummies], axis=1)
    # The dummy columns are appended at the end of `features`, so their
    # indices start at the current length.
    first_dummy = len(features)
    dummy_feature_indices = list(range(first_dummy, first_dummy + len(dummies.columns)))
    parallel_fit.append(dummy_feature_indices)
    features.extend(dummies.columns.tolist())

# Optionally subsample to n rows; the sample seed is itself drawn from a
# generator seeded with random_seed so the subsample is reproducible.
rng = np.random.default_rng(seed=random_seed)
if n != "all":
    df = df.sample(n, random_state=rng.integers(0, 10000))
n = len(df)

X = df[features].astype(float)
y = df[target]
w = df[weights]

# Scale every column by its maximum. A column whose maximum is 0 (for example
# a dummy column with no members left after subsampling) would otherwise
# become 0/0 = NaN; dividing by 1 leaves such a column at zero.
col_max = X.max(axis=0)
X = X / col_max.replace(0, 1.0)


[2023-09-18 10:58][run_41][Loading configuration]
[2023-09-18 10:58][run_41][Loading data]


In [2]:
# Fit a GLM using statsmodels using poisson deviance
import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families.family import Poisson

logger.log("Fitting a GLM using statsmodels")
# NOTE(review): w is passed as frequency weights, not as `exposure`/`offset`.
# Confirm that weighting the observations is the intended treatment of w for
# this target.
glm = GLM(
    y,
    sm.add_constant(X),  # prepend an intercept column
    family=Poisson(),
    freq_weights=w,
)
glm_results = glm.fit()
glm_results.summary()


[2023-09-18 10:59][run_41][Fitting a GLM using statsmodels]


0,1,2,3
Dep. Variable:,ClaimNb,No. Observations:,678007.0
Model:,GLM,Df Residuals:,358319.81
Model Family:,Poisson,Df Model:,38.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-71121.0
Date:,"Mon, 18 Sep 2023",Deviance:,107100.0
Time:,10:59:12,Pearson chi2:,376000.0
No. Iterations:,7,Pseudo R-squ. (CS):,0.006628
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.5989,0.051,-90.281,0.000,-4.699,-4.499
VehPower,0.5456,0.060,9.101,0.000,0.428,0.663
VehAge,-2.2187,0.154,-14.437,0.000,-2.520,-1.918
DrivAge,1.0674,0.054,19.671,0.000,0.961,1.174
BonusMalus,5.9096,0.093,63.219,0.000,5.726,6.093
Density,-0.0348,0.070,-0.500,0.617,-0.171,0.102
Area,0.4362,0.037,11.773,0.000,0.364,0.509
VehGas,-0.1717,0.015,-11.208,0.000,-0.202,-0.142
VehBrand_B1,-0.3798,0.019,-20.253,0.000,-0.417,-0.343


In [6]:
# Show the distinct values present in the VehBrand column.
df.loc[:, "VehBrand"].unique()


['B12', 'B6', 'B3', 'B2', 'B5', ..., 'B14', 'B13', 'B4', 'B1', 'B11']
Length: 11
Categories (11, object): ['B1', 'B10', 'B11', 'B12', ..., 'B3', 'B4', 'B5', 'B6']