In [30]:
import numpy as np
import pandas as pd
import ssl
import os
import yaml
import shutil

import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

from local_glm_boost import LocalGLMBooster
from local_glm_boost.utils.tuning import tune_n_estimators
from local_glm_boost.utils.logger import LocalGLMBoostLogger

# Base name (without extension) of the YAML configuration file for this study;
# the setup cell reads the config from `<script_dir>/{config_name}.yaml`.
config_name = "real_data_study"

In [3]:
# Set up output folder, configuration file and run_id
# NOTE(review): absolute, machine-specific path -- the notebook only runs unchanged
# on a machine where this directory exists. Consider deriving it from a configurable
# base directory instead.
script_dir = "/home/heza7322/PycharmProjects/local-glm-boost/notebooks"
folder_path = os.path.join(script_dir, "../data/results/")
config_path = os.path.join(script_dir, f"{config_name}.yaml")
# NOTE(review): FullLoader is kept because the config is a local file under the
# author's control; prefer yaml.safe_load if the file could ever be untrusted.
with open(config_path, "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Determine the next run id: one more than the highest existing `run_<N>` entry,
# or 0 when the results folder is missing or contains no run folders.
# Only names of the exact form `run_<digits>` are considered, so stray entries
# (e.g. `.gitkeep`, `.DS_Store`, `notes.txt`) no longer crash the parsing.
run_prefix = "run_"
existing_run_ids = [
    int(folder_name[len(run_prefix):])
    for folder_name in (os.listdir(folder_path) if os.path.isdir(folder_path) else [])
    if folder_name.startswith(run_prefix) and folder_name[len(run_prefix):].isdecimal()
]
run_id = max(existing_run_ids) + 1 if existing_run_ids else 0

# Create the folder for this run; raises FileExistsError if it already exists,
# which guards against silently overwriting a previous run.
output_path = os.path.join(folder_path, f"run_{run_id}")
os.makedirs(output_path)


In [66]:
# Keep a copy of the configuration used for this run next to its results
config_copy_path = f"{output_path}/config.yaml"
shutil.copyfile(config_path, config_copy_path)

# Create the logger for this run; the run label is appended as a format level
# (it shows up as the `[run_<id>]` part of the log lines printed below this cell)
logger = LocalGLMBoostLogger(verbose=2, output_path=output_path)
logger.append_format_level(f"run_{run_id}")

# Unpack the configuration into notebook-level variables
logger.log("Loading configuration")

# Data-related settings (presumably sample size, columns and response distribution;
# confirm against the YAML file)
n = config["n"]
features_to_use = config["features_to_use"]
target = config["target"]
weights = config["weights"]
distribution = config["distribution"]

# Model settings (presumably boosting / tree hyperparameters; confirm against
# the LocalGLMBooster documentation)
n_estimators_max = config["n_estimators_max"]
learning_rate = config["learning_rate"]
max_depth = config["max_depth"]
min_samples_split = config["min_samples_split"]
min_samples_leaf = config["min_samples_leaf"]
glm_init = config["glm_init"]

# Splitting, seeding and parallelism settings
random_seed = config["random_seed"]
test_size = config["test_size"]
n_splits = config["n_splits"]
stratified = config["stratified"]
parallel = config["parallel"]
n_jobs = config["n_jobs"]

# Load and preprocess data
logger.log("Loading data")
# SECURITY NOTE(review): the next line disables TLS certificate verification for
# every HTTPS request made through the default context in this process, not just
# the OpenML download. It is kept so the notebook behaves as before; prefer
# fixing the local CA certificates and removing it.
ssl._create_default_https_context = ssl._create_unverified_context

# Two OpenML datasets: 41214 provides the policy-level frame (it carries the
# ClaimNb / Exposure columns used below), 41215 provides ClaimAmount rows per IDpol.
df_num = fetch_openml(data_id=41214, as_frame=True).data
df_sev = fetch_openml(data_id=41215, as_frame=True).data

# Sum the claim amounts per policy and attach them to the policy-level frame.
# A left merge keeps every policy; policies without a severity record get NaN.
df_sev_tot = df_sev.groupby('IDpol')['ClaimAmount'].sum()
df = df_num.merge(df_sev_tot, left_on='IDpol', right_index=True, how='left')

# Policies without any recorded claim amount are treated as having no claims:
# both the claim count and the claim amount are set to zero.
no_claim_amount = df['ClaimAmount'].isna()
df.loc[no_claim_amount, 'ClaimNb'] = 0
df.loc[no_claim_amount, 'ClaimAmount'] = 0

# Keep policies with at most 5 claims. The explicit copy makes `df` an independent
# frame, so the column assignments below do not operate on a slice of the merged
# frame (which would trigger pandas' SettingWithCopyWarning).
df = df.loc[df['ClaimNb'] <= 5].copy()

# Cap exposure to the interval [0, 1]
df['Exposure'] = df['Exposure'].clip(0, 1)
# Encode the area letter as an integer offset from 'A' ('A' -> 0, 'B' -> 1, ...)
df['Area'] = df['Area'].apply(lambda x: ord(x) - ord('A'))
# Binary fuel indicator: 1 for 'Regular', 0 for anything else
df['VehGas'] = df['VehGas'].apply(lambda x: 1 if x == 'Regular' else 0)

# Start from the columns that are used directly as features
features = [
    'VehPower',
    'VehAge',
    'DrivAge',
    'BonusMalus',
    'Density',
    'Area',
    'VehGas',
]

# One-hot encode the remaining categorical columns. The dummy columns are appended
# to `df` and to `features`; `parallel_fit` collects one list of dummy column names
# per encoded column, so each group of dummies stays identifiable.
parallel_fit = []
for feature in ('VehBrand', 'Region'):
    dummies = pd.get_dummies(df[feature], prefix=feature)
    dummy_columns = dummies.columns.tolist()
    df = pd.concat([df, dummies], axis=1)
    features.extend(dummy_columns)
    parallel_fit.append(dummy_columns)

[2023-09-04 11:48][run_13][Loading configuration]
[2023-09-04 11:48][run_13][Loading data]
