In [1]:
import numpy as np

import pandas as pd

from sklearn.metrics import root_mean_squared_log_error
from sklearn.model_selection import KFold
from insurance.common import PREP_DATA_PATH
from insurance.data_pipeline import get_feat_columns, make_xgboost_pipeline
from autogluon.tabular import TabularDataset, TabularPredictor

In [2]:
# Label column and location of the prepared training frame.
target_column = "Premium Amount"
prep_data_path = PREP_DATA_PATH / "prepared_data.feather"

feat_cols = get_feat_columns()
feat_names = feat_cols.names

df = pd.read_feather(prep_data_path)

# Model the label on the log1p scale; RMSE measured on this scale is the
# RMSLE of the raw label.
df[target_column] = np.log1p(df[target_column])

# Fit the preprocessing pipeline on the training frame, then mark every
# categorical feature with the pandas "category" dtype in a single cast.
data_pipeline = make_xgboost_pipeline()
X_train = data_pipeline.fit_transform(df)
X_train = X_train.astype({name: "category" for name in feat_cols.categorical})

display(X_train.head(), X_train.columns)

Unnamed: 0,year,month,day,dayofweek,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,...,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
372605,2020,2,10,0,52.0,Female,5538.0,Married,,Master's,...,Basic,0.0,18.0,497.0,3.0,Average,No,Weekly,Apartment,7.916807
551204,2020,10,16,4,60.0,Female,39711.0,Married,4.0,Bachelor's,...,Comprehensive,0.0,3.0,340.0,3.0,Poor,No,Weekly,Apartment,7.206377
240320,2021,1,2,5,58.0,Female,2364.0,Married,2.0,Bachelor's,...,Comprehensive,2.0,19.0,,7.0,Poor,No,Monthly,House,7.694848
1047361,2022,5,11,2,39.0,Male,9094.0,Divorced,0.0,Bachelor's,...,Basic,1.0,17.0,761.0,5.0,Poor,Yes,Rarely,Condo,6.529419
555362,2020,7,28,1,47.0,Male,43189.0,Divorced,,PhD,...,Comprehensive,1.0,4.0,319.0,4.0,Poor,Yes,Weekly,Apartment,7.447168


Index(['year', 'month', 'day', 'dayofweek', 'Age', 'Gender', 'Annual Income',
       'Marital Status', 'Number of Dependents', 'Education Level',
       'Occupation', 'Health Score', 'Location', 'Policy Type',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type', 'Premium Amount'],
      dtype='object')

In [4]:
# The label was log1p-transformed when the training frame was built, so plain
# RMSE is the metric being optimised here.
predictor = TabularPredictor(
    label=target_column,
    problem_type="regression",
    eval_metric="root_mean_squared_error",
)

# Training options, gathered in one place so they are easy to tune.
fit_options = dict(
    time_limit=6 * 3600,  # 6 * 3600 = 21600
    presets="best",  # the fit log reports this alias as 'best_quality'
    excluded_model_types=["KNN"],
    num_stack_levels=4,
)

# Keep the fit call as the last expression so the cell echoes the predictor.
predictor.fit(X_train, **fit_options)

No path specified. Models will be saved in: "AutogluonModels/ag-20241223_135741"
Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Debian 6.1.115-1 (2024-11-01)
CPU Count:          12
Memory Avail:       10.85 GB / 15.46 GB (70.2%)
Disk Space Avail:   713.62 GB / 871.45 GB (81.9%)
Presets specified: ['best']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=4, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit 

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fc5d0504b10>

In [5]:
# Show one row per fitted model with its validation score and timings.
# score_val is displayed as a negative RMSE, measured on the log1p-scaled label.
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L6,-1.044272,root_mean_squared_error,446.168089,15422.86827,0.008758,3.540444,6,True,42
1,WeightedEnsemble_L4,-1.044295,root_mean_squared_error,362.922553,10982.049283,0.008795,1.05277,4,True,29
2,WeightedEnsemble_L5,-1.04432,root_mean_squared_error,409.507833,13719.689279,0.009139,0.962005,5,True,37
3,WeightedEnsemble_L3,-1.044359,root_mean_squared_error,293.854346,6914.212276,0.008862,1.330885,3,True,19
4,LightGBM_BAG_L3,-1.044398,root_mean_squared_error,305.645274,8099.388272,1.186625,21.083313,3,True,21
5,LightGBM_BAG_L4,-1.0444,root_mean_squared_error,372.844972,11536.540823,1.14721,20.324388,4,True,31
6,NeuralNetFastAI_BAG_L4,-1.044414,root_mean_squared_error,378.125066,11692.98954,6.427303,176.773105,4,True,35
7,NeuralNetFastAI_BAG_L3,-1.044424,root_mean_squared_error,310.341584,8446.502449,5.882934,368.197491,3,True,25
8,LightGBMXT_BAG_L3,-1.044452,root_mean_squared_error,306.374881,8104.739532,1.916231,26.434574,3,True,20
9,LightGBM_BAG_L5,-1.044452,root_mean_squared_error,429.452525,14386.649409,1.185189,20.209351,5,True,39


In [6]:
# Print the summary of fit() (per-model performance table) and return the
# summary dict, which includes the model-name -> model-type mapping;
# show_plot=True additionally requests the summary plot.
predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                     model  score_val              eval_metric  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L6  -1.044272  root_mean_squared_error     446.168089  15422.868270                0.008758           3.540444            6       True         42
1      WeightedEnsemble_L4  -1.044295  root_mean_squared_error     362.922553  10982.049283                0.008795           1.052770            4       True         29
2      WeightedEnsemble_L5  -1.044320  root_mean_squared_error     409.507833  13719.689279                0.009139           0.962005            5       True         37
3      WeightedEnsemble_L3  -1.044359  root_mean_squared_error     293.854346   6914.212276                0.008862           1.330885            3       True         19
4          LightGBM_BAG_L3  -1.044398  root_mean_squared_error     305.645274   8099.388

{'model_types': {'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L2': 'StackerEnsembleModel_XGBoost',
  'NeuralNetTorch_BAG_L2': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBMLarge_BAG_L2': 'StackerEnsembleModel_LGB',
  'CatBoost_r177_BAG_L2': 'StackerEnsembleMo

In [7]:
import numpy as np
import pandas as pd

import xgboost as xgb

from insurance.common import OUT_PATH, PREP_DATA_PATH, RAW_DATA_PATH
from insurance.data_pipeline import get_feat_columns

from insurance.prepare_basic import prepare


# Load the raw test file and capture the ids before any preparation step
# touches the frame; they are written back verbatim in the submission.
df_test = pd.read_csv(RAW_DATA_PATH / "test.csv")
ids = df_test["id"].values

# Run the project's preparation step and persist the prepared frame.
df_test = prepare(df=df_test)
df_test.to_feather(PREP_DATA_PATH / "test_prepared.feather")

feat_cols = get_feat_columns()
feat_names = feat_cols.names

# Add a placeholder label before calling the fitted pipeline
# (presumably the pipeline expects the label column to exist -- TODO confirm).
# `.assign` returns a new frame, so the placeholder is never written into a
# column-subset slice of the prepared frame (avoids SettingWithCopyWarning).
# `target_column` is the label name defined with the training data.
df_test = df_test[feat_names].assign(**{target_column: -1.0})

# Reuse the pipeline fitted on the training data and apply the same
# categorical dtype marking that was applied to the training features.
X_test = data_pipeline.transform(df_test)
for col in feat_cols.categorical:
    X_test[col] = X_test[col].astype("category")

# The predictor was trained on log1p(label); expm1 maps the predictions back
# to the original scale.
predictions = np.expm1(predictor.predict(X_test))

# Prepare submission file
submission = pd.DataFrame(
    {
        "id": ids,
        target_column: predictions,
    }
)
pred_file = OUT_PATH / "preds.csv"
# Make sure the output directory exists so the write cannot fail after a long run.
pred_file.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(pred_file, index=False)
print(f"Submission file saved to {pred_file}")

Submission file saved to /mnt/data/Projects/insurance/out/preds.csv
