In [35]:
!pip install --upgrade scikit-learn
!pip install shap

Collecting shap
  Downloading shap-0.48.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.2-cp311-cp311-win_amd64.whl.metadata (2.9 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting numpy (from shap)
  Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Downloading shap-0.48.0-cp311-cp311-win_amd64.whl (544 kB)
   ---------------------------------------- 0.0/544.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/544.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/544.4 kB ? eta -:--:--
   ------------------- -------------------- 262.1/544.4 kB ? eta -:--:--
   ------------------------

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sklearn-compat 0.1.3 requires scikit-learn<1.7,>=1.2, but you have scikit-learn 1.7.0 which is incompatible.


## Data Preparation

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

In [8]:
# Load data
data = pd.read_csv("../data/raw/MachineLearningRating_v3.txt", sep="|", low_memory=False)
data["LossRatio"] = data["TotalClaims"] / data["TotalPremium"]
data["HasClaim"] = data["TotalClaims"] > 0
model_data = data.copy()

In [9]:
# Encode categorical variables
categorical_cols = model_data.select_dtypes(include='object').columns
for col in categorical_cols:
    model_data[col] = LabelEncoder().fit_transform(model_data[col].astype(str))

In [10]:
# Define features and targets
features = model_data.drop(columns=["TotalClaims", "HasClaim"])
target_class = model_data["HasClaim"]
target_reg = model_data["TotalClaims"]

In [20]:
# Split classification data
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(features, target_class, test_size=0.2, random_state=42)

In [21]:
# Ensure SMOTE compatibility (clean data)
X_train_c.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train_c.fillna(0, inplace=True)
y_train_c = y_train_c[X_train_c.index]

In [28]:
# Split regression data (only rows with claims)
X_train_r.replace([np.inf, -np.inf], np.nan, inplace=True)
y_train_r.replace([np.inf, -np.inf], np.nan, inplace=True)
reg_clean = X_train_r.copy()
reg_clean["target"] = y_train_r
reg_clean.dropna(inplace=True)
X_train_r = reg_clean.drop(columns=["target"])
y_train_r = reg_clean["target"]

## Model 1: Claim Occurrence (Classification)

In [25]:
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Balance the training set using SMOTE
smote = SMOTE(random_state=42)
X_train_c_bal, y_train_c_bal = smote.fit_resample(X_train_c.values, y_train_c.values)

xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb_clf.fit(X_train_c_bal, y_train_c_bal)

# Predictions and Evaluation
y_pred_c = xgb_clf.predict(X_test_c)
print(classification_report(y_test_c, y_pred_c))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

       False       1.00      0.62      0.76    199439
        True       0.01      1.00      0.01       581

    accuracy                           0.62    200020
   macro avg       0.50      0.81      0.39    200020
weighted avg       1.00      0.62      0.76    200020



## Model 2: Claim Severity (Regression)


In [33]:
from xgboost import XGBRegressor
from sklearn.metrics import classification_report, mean_squared_error, r2_score

xgb_reg = XGBRegressor()
xgb_reg.fit(X_train_r, y_train_r)

# Predictions and Evaluation
y_pred_r = xgb_reg.predict(X_test_r)
rmse = mean_squared_error(y_test_r, y_pred_r) ** 0.5
r2 = r2_score(y_test_r, y_pred_r)
print(f"RMSE for claim severity: {rmse:.2f}")
print(f"R-squared: {r2:.2f}")

RMSE for claim severity: 46362.25
R-squared: -0.34


## Feature importance with shap

In [47]:
import shap

# Explain classification model predictions
explainer_c = shap.Explainer(xgb_clf, X_test_c)
shap_values_c = explainer_c(X_test_c)
shap.plots.bar(shap_values_c, max_display=10)

# Explain regression model predictions
explainer_r = shap.Explainer(xgb_reg, X_test_r)
shap_values_r = explainer_r(X_test_r)
shap.plots.bar(shap_values_r, max_display=10)

ImportError: Numba needs NumPy 2.2 or less. Got NumPy 2.3.