In [7]:
# Load the cleaned dataset
import pandas as pd

cleaned_data_path = "../data/processed/cleaned_dataset.csv"
df = pd.read_csv(cleaned_data_path)

print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")
df.head()


  df = pd.read_csv(cleaned_data_path)


Dataset loaded with 999805 rows and 50 columns.


Unnamed: 0,UnderwrittenCoverID,PolicyID,TransactionMonth,IsVATRegistered,Citizenship,LegalType,Title,Language,Bank,AccountType,...,ExcessSelected,CoverCategory,CoverType,CoverGroup,Section,Product,StatutoryClass,StatutoryRiskType,TotalPremium,TotalClaims
0,145249,12827,2015-03-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
1,145249,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,21.929825,0.0
2,145249,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Windscreen,Windscreen,Windscreen,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0
3,145255,12827,2015-05-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,512.84807,0.0
4,145255,12827,2015-07-01,True,,Close Corporation,Mr,English,First National Bank,Current account,...,Mobility - Metered Taxis - R2000,Own damage,Own Damage,Comprehensive - Taxi,Motor Comprehensive,Mobility Metered Taxis: Monthly,Commercial,IFRS Constant,0.0,0.0


## Inspect Missing Data

Check for missing values in the dataset to decide on imputation or removal strategies.


In [8]:
# Check missing values count and percentage
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df)) * 100
missing_summary = pd.DataFrame({"Missing Count": missing_counts, "Missing %": missing_percent})

print(missing_summary[missing_summary["Missing Count"] > 0].sort_values(by="Missing %", ascending=False))


                    Missing Count  Missing %
WrittenOff                 641638  64.176314
Converted                  641638  64.176314
Rebuilt                    641638  64.176314
NewVehicle                 153050  15.307985
mmcode                        550   0.055011
VehicleType                   550   0.055011
make                          550   0.055011
kilowatts                     550   0.055011
cubiccapacity                 550   0.055011
Cylinders                     550   0.055011
Model                         550   0.055011
NumberOfDoors                 550   0.055011
VehicleIntroDate              550   0.055011
bodytype                      550   0.055011
CapitalOutstanding              2   0.000200


## Handling Missing Data

Based on the missing data analysis, decide to either impute or remove missing values.

- For columns with low missing %, consider imputation.
- For columns with large missing %, consider removal or further investigation.


In [9]:
# Cell 5: Handling Missing Data (fix for CapitalOutstanding conversion)

# Drop columns with large missing percentage
cols_to_drop = ['WrittenOff', 'Converted', 'Rebuilt']
df = df.drop(columns=cols_to_drop)

# Drop rows with missing 'NewVehicle'
df = df.dropna(subset=['NewVehicle'])

# Impute missing values in columns with low missing percentage
low_missing_cols = ['mmcode', 'VehicleType', 'make', 'kilowatts', 'cubiccapacity', 
                    'Cylinders', 'Model', 'NumberOfDoors', 'VehicleIntroDate', 'bodytype']

for col in low_missing_cols:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

# Fix CapitalOutstanding: convert string with commas to float
df['CapitalOutstanding'] = df['CapitalOutstanding'].astype(str).str.replace(',', '.')
df['CapitalOutstanding'] = pd.to_numeric(df['CapitalOutstanding'], errors='coerce')

# Now fill missing values with median
df['CapitalOutstanding'] = df['CapitalOutstanding'].fillna(df['CapitalOutstanding'].median())

# Check remaining missing data
print(df.isnull().sum())


UnderwrittenCoverID         0
PolicyID                    0
TransactionMonth            0
IsVATRegistered             0
Citizenship                 0
LegalType                   0
Title                       0
Language                    0
Bank                        0
AccountType                 0
MaritalStatus               0
Gender                      0
Country                     0
Province                    0
PostalCode                  0
MainCrestaZone              0
SubCrestaZone               0
ItemType                    0
mmcode                      0
VehicleType                 0
RegistrationYear            0
make                        0
Model                       0
Cylinders                   0
cubiccapacity               0
kilowatts                   0
bodytype                    0
NumberOfDoors               0
VehicleIntroDate            0
CustomValueEstimate         0
AlarmImmobiliser            0
TrackingDevice              0
CapitalOutstanding          0
NewVehicle

## Feature Engineering

Create new features relevant for predicting claims and premiums, such as:

- Policy duration (if start and end dates exist)
- Claim history count or flags
- Vehicle age buckets
- Interaction terms, if applicable


calculate the VehicleAge using the RegistrationYear column and the current year.

In [10]:
import datetime

# Calculate the current year
current_year = datetime.datetime.now().year

# Create the VehicleAge column
df['VehicleAge'] = current_year - df['RegistrationYear']

# Create the VehicleAgeBucket column
df['VehicleAgeBucket'] = pd.cut(
    df['VehicleAge'], 
    bins=[0, 5, 10, 15, 100], 
    labels=['0-5', '6-10', '11-15', '15+']
)

# Verify the new columns
print(df[['VehicleAge', 'VehicleAgeBucket']].head())


   VehicleAge VehicleAgeBucket
0          21              15+
1          21              15+
2          21              15+
3          21              15+
4          21              15+


In [11]:
# Example: Create a binary claim flag
df['HasClaim'] = (df['TotalClaims'] > 0).astype(int)

# Example: Vehicle age buckets
df['VehicleAgeBucket'] = pd.cut(df['VehicleAge'], bins=[0, 5, 10, 15, 100], labels=['0-5', '6-10', '11-15', '15+'])

# Example: Policy duration in days (assuming start_date and end_date columns exist)
if 'PolicyStartDate' in df.columns and 'PolicyEndDate' in df.columns:
    df['PolicyDuration'] = (pd.to_datetime(df['PolicyEndDate']) - pd.to_datetime(df['PolicyStartDate'])).dt.days


In [12]:
print(df.columns)


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'VehicleIntroDate', 'CustomValueEstimate', 'AlarmImmobiliser',
       'TrackingDevice', 'CapitalOutstanding', 'NewVehicle', 'SumInsured',
       'TermFrequency', 'CalculatedPremiumPerTerm', 'ExcessSelected',
       'CoverCategory', 'CoverType', 'CoverGroup', 'Section', 'Product',
       'StatutoryClass', 'StatutoryRiskType', 'TotalPremium', 'TotalClaims',
       'VehicleAge', 'VehicleAgeBucket', 'HasClaim'],
      dtype='object')


## Encoding Categorical Variables

Convert categorical columns to numerical format using one-hot encoding or label encoding as appropriate.


In [13]:
from sklearn.preprocessing import LabelEncoder

# For simplicity, use one-hot encoding for all categorical columns with relatively few categories
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Drop columns not needed for modeling or identifiers if necessary
columns_to_drop = ['PolicyID', 'CustomerID'] if 'PolicyID' in df.columns and 'CustomerID' in df.columns else []

df_model = df.drop(columns=columns_to_drop)

# One-hot encode categorical features
df_model = pd.get_dummies(df_model, columns=categorical_cols, drop_first=True)

print(f"Data shape after encoding: {df_model.shape}")


Data shape after encoding: (846755, 798)


## Train-Test Split

Split the dataset into training and testing sets, e.g., 80% training and 20% testing.


In [14]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y) for claim severity prediction (only where claims exist)
df_claims = df_model[df_model['TotalClaims'] > 0]

X = df_claims.drop(columns=['TotalClaims'])
y = df_claims['TotalClaims']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")


Training set size: 1960 samples
Test set size: 490 samples


## Model Building - Claim Severity Prediction

## Subset Data to Policies with Claims

We will only include data where `TotalClaims > 0` for claim severity prediction.


In [15]:
# Subset the dataset to policies with claims
df_claims = df_model[df_model['TotalClaims'] > 0]

X = df_claims.drop(columns=['TotalClaims'])
y = df_claims['TotalClaims']

print(f"Dataset for modeling: {X.shape[0]} rows and {X.shape[1]} features.")


Dataset for modeling: 2450 rows and 797 features.


## Baseline Linear Regression Model

Train a simple Linear Regression model to establish a baseline.


In [16]:
import sklearn
print(sklearn.__version__)


1.7.0


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Train Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Calculate metrics
# Calculate Mean Squared Error
mse_lr = mean_squared_error(y_test, y_pred_lr)
# Calculate RMSE manually
rmse_lr = mse_lr ** 0.5
# Calculate R-squared
r2_lr = r2_score(y_test, y_pred_lr)

# Print results
print(f"Linear Regression MSE: {mse_lr:.4f}")
print(f"Linear Regression RMSE: {rmse_lr:.4f}")
print(f"Linear Regression R-squared: {r2_lr:.4f}")


Linear Regression MSE: 1540828472.1111
Linear Regression RMSE: 39253.3880
Linear Regression R-squared: 0.0387


## Decision Tree Regressor

Train a Decision Tree model for claim severity prediction.


In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train Decision Tree model
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
y_pred_dt = dt_model.predict(X_test)

# Calculate metrics
# Calculate Mean Squared Error
mse_dt = mean_squared_error(y_test, y_pred_dt)
# Calculate RMSE manually
rmse_dt = mse_dt ** 0.5
# Calculate R-squared
r2_dt = r2_score(y_test, y_pred_dt)

# Print results
print(f"Decision Tree MSE: {mse_dt:.4f}")
print(f"Decision Tree RMSE: {rmse_dt:.4f}")
print(f"Decision Tree R-squared: {r2_dt:.4f}")


Decision Tree MSE: 2005630123.1536
Decision Tree RMSE: 44784.2620
Decision Tree R-squared: -0.2513


## Random Forest Regressor

Train a Random Forest model for claim severity prediction.


In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Train Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate metrics
# Mean Squared Error
mse_rf = mean_squared_error(y_test, y_pred_rf)
# Root Mean Squared Error (RMSE)
rmse_rf = mse_rf ** 0.5
# R-squared
r2_rf = r2_score(y_test, y_pred_rf)

# Print results
print(f"Random Forest MSE: {mse_rf:.4f}")
print(f"Random Forest RMSE: {rmse_rf:.4f}")
print(f"Random Forest R-squared: {r2_rf:.4f}")


Random Forest MSE: 1247551352.2853
Random Forest RMSE: 35320.6930
Random Forest R-squared: 0.2217


## XGBoost Regressor

Train an XGBoost model for claim severity prediction.


In [20]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Train XGBoost model
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)

# Manually compute RMSE to avoid 'squared' param issue
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost RMSE: {rmse_xgb:.4f}")
print(f"XGBoost R-squared: {r2_xgb:.4f}")


XGBoost RMSE: 39650.7471
XGBoost R-squared: 0.0191


## Hyperparameter Tuning: Random Forest Regressor

Use `GridSearchCV` to find the best combination of hyperparameters for the Random Forest Regressor.


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Define hyperparameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search
rf_grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

# Fit the model
rf_grid_search.fit(X_train, y_train)

# Retrieve the best parameters and best model
rf_best_params = rf_grid_search.best_params_
rf_best_model = rf_grid_search.best_estimator_

print(f"Best Parameters for Random Forest: {rf_best_params}")

# Evaluate the tuned model
y_pred_rf_tuned = rf_best_model.predict(X_test)

# Compute metrics manually to avoid compatibility issues
mse_rf_tuned = mean_squared_error(y_test, y_pred_rf_tuned)
rmse_rf_tuned = sqrt(mse_rf_tuned)  # Manually compute RMSE
r2_rf_tuned = r2_score(y_test, y_pred_rf_tuned)

# Print evaluation metrics
print(f"Tuned Random Forest RMSE: {rmse_rf_tuned:.4f}")
print(f"Tuned Random Forest R-squared: {r2_rf_tuned:.4f}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Tuned Random Forest RMSE: 34086.4682
Tuned Random Forest R-squared: 0.2751


## Hyperparameter Tuning: XGBoost Regressor

Use `GridSearchCV` to find the best combination of hyperparameters for the XGBoost Regressor.


In [22]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Define hyperparameter grid
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Perform Grid Search
xgb_grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=xgb_param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1
)

xgb_grid_search.fit(X_train, y_train)

# Best parameters and model
xgb_best_params = xgb_grid_search.best_params_
xgb_best_model = xgb_grid_search.best_estimator_

print(f"Best Parameters for XGBoost: {xgb_best_params}")

# Evaluate tuned model
y_pred_xgb_tuned = xgb_best_model.predict(X_test)
mse_xgb_tuned = mean_squared_error(y_test, y_pred_xgb_tuned)
rmse_xgb_tuned = sqrt(mse_xgb_tuned)
r2_xgb_tuned = r2_score(y_test, y_pred_xgb_tuned)

print(f"Tuned XGBoost RMSE: {rmse_xgb_tuned:.4f}")
print(f"Tuned XGBoost R-squared: {r2_xgb_tuned:.4f}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1}
Tuned XGBoost RMSE: 34350.4680
Tuned XGBoost R-squared: 0.2638


## Model Building: Premium Prediction

In this section, we will build models to predict the `CalculatedPremiumPerTerm` using the same regression algorithms. Additionally, we will optionally train a binary classification model to predict the probability of claim occurrence.


In [23]:
# Target variable for premium prediction
y_premium = df['CalculatedPremiumPerTerm']

# Features — same as before, excluding target variables if needed
X_premium = df.drop(columns=['CalculatedPremiumPerTerm', 'TotalClaims', 'HasClaim'])

# Split into train and test sets (reuse or new split)
from sklearn.model_selection import train_test_split
X_train_prem, X_test_prem, y_train_prem, y_test_prem = train_test_split(X_premium, y_premium, test_size=0.2, random_state=42)


In [24]:
import pandas as pd

# Convert to datetime and extract parts
X_train_prem['VehicleIntroDate'] = pd.to_datetime(X_train_prem['VehicleIntroDate'], errors='coerce')
X_train_prem['VehicleIntroYear'] = X_train_prem['VehicleIntroDate'].dt.year
X_train_prem['VehicleIntroMonth'] = X_train_prem['VehicleIntroDate'].dt.month
X_train_prem['VehicleIntroDay'] = X_train_prem['VehicleIntroDate'].dt.day
X_train_prem = X_train_prem.drop(columns=['VehicleIntroDate'])

X_test_prem['VehicleIntroDate'] = pd.to_datetime(X_test_prem['VehicleIntroDate'], errors='coerce')
X_test_prem['VehicleIntroYear'] = X_test_prem['VehicleIntroDate'].dt.year
X_test_prem['VehicleIntroMonth'] = X_test_prem['VehicleIntroDate'].dt.month
X_test_prem['VehicleIntroDay'] = X_test_prem['VehicleIntroDate'].dt.day
X_test_prem = X_test_prem.drop(columns=['VehicleIntroDate'])


  X_train_prem['VehicleIntroDate'] = pd.to_datetime(X_train_prem['VehicleIntroDate'], errors='coerce')
  X_test_prem['VehicleIntroDate'] = pd.to_datetime(X_test_prem['VehicleIntroDate'], errors='coerce')


In [25]:
print(X_train_prem.columns)


Index(['UnderwrittenCoverID', 'PolicyID', 'TransactionMonth',
       'IsVATRegistered', 'Citizenship', 'LegalType', 'Title', 'Language',
       'Bank', 'AccountType', 'MaritalStatus', 'Gender', 'Country', 'Province',
       'PostalCode', 'MainCrestaZone', 'SubCrestaZone', 'ItemType', 'mmcode',
       'VehicleType', 'RegistrationYear', 'make', 'Model', 'Cylinders',
       'cubiccapacity', 'kilowatts', 'bodytype', 'NumberOfDoors',
       'CustomValueEstimate', 'AlarmImmobiliser', 'TrackingDevice',
       'CapitalOutstanding', 'NewVehicle', 'SumInsured', 'TermFrequency',
       'ExcessSelected', 'CoverCategory', 'CoverType', 'CoverGroup', 'Section',
       'Product', 'StatutoryClass', 'StatutoryRiskType', 'TotalPremium',
       'VehicleAge', 'VehicleAgeBucket', 'VehicleIntroYear',
       'VehicleIntroMonth', 'VehicleIntroDay'],
      dtype='object')


In [26]:
X_train_prem = pd.get_dummies(X_train_prem, drop_first=True)
X_test_prem = pd.get_dummies(X_test_prem, drop_first=True)

# Align columns of test to train 
X_test_prem = X_test_prem.reindex(columns=X_train_prem.columns, fill_value=0)


In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lr_premium = LinearRegression()
lr_premium.fit(X_train_prem, y_train_prem)
y_pred_lr_prem = lr_premium.predict(X_test_prem)

mse_lr_prem = mean_squared_error(y_test_prem, y_pred_lr_prem)  # MSE
rmse_lr_prem = mse_lr_prem ** 0.5  # RMSE
r2_lr_prem = r2_score(y_test_prem, y_pred_lr_prem)

print(f"Linear Regression RMSE (Premium): {rmse_lr_prem:.4f}")
print(f"Linear Regression R-squared (Premium): {r2_lr_prem:.4f}")


Linear Regression RMSE (Premium): 295.8220
Linear Regression R-squared (Premium): 0.6889


In [28]:
X_train_sample = X_train_prem.sample(frac=0.1, random_state=42)
y_train_sample = y_train_prem.loc[X_train_sample.index]

rf_premium.fit(X_train_sample, y_train_sample)


0,1,2
,n_estimators,20
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

xgb_premium = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
xgb_premium.fit(X_train_prem, y_train_prem)
y_pred_xgb_prem = xgb_premium.predict(X_test_prem)

# Compute MSE and RMSE
mse_xgb_prem = mean_squared_error(y_test_prem, y_pred_xgb_prem)
rmse_xgb_prem = mse_xgb_prem ** 0.5  # Square root of MSE
r2_xgb_prem = r2_score(y_test_prem, y_pred_xgb_prem)

print(f"XGBoost RMSE (Premium): {rmse_xgb_prem:.4f}")
print(f"XGBoost R-squared (Premium): {r2_xgb_prem:.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost RMSE (Premium): 21.7392
XGBoost R-squared (Premium): 0.9983


## Optional: Binary Classification Model to Predict Claim Occurrence

This model will predict whether a claim occurs (`HasClaim` = 1) or not (`HasClaim` = 0).


In [31]:
# Target and features for classification
y_claim_occur = df['HasClaim']
X_claim_occur = df.drop(columns=['HasClaim', 'TotalClaims', 'CalculatedPremiumPerTerm'])

X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(X_claim_occur, y_claim_occur, test_size=0.2, random_state=42)


In [34]:
print(X_train_cls.head())


        UnderwrittenCoverID  PolicyID TransactionMonth  IsVATRegistered  \
764786               154388     12620       2015-04-01            False   
136700               123561      9864       2015-06-01            False   
26140                198301     17711       2015-05-01            False   
335649               137282     11886       2015-03-01            False   
452596               252860     21357       2015-08-01            False   

       Citizenship   LegalType Title Language                 Bank  \
764786              Individual    Mr  English        Standard Bank   
136700              Individual    Mr  English            ABSA Bank   
26140               Individual    Mr  English              Unknown   
335649              Individual    Mr  English  First National Bank   
452596              Individual    Mr  English              Nedbank   

            AccountType  ...              CoverCategory  \
764786  Savings account  ...            Keys and Alarms   
136700  Sa

In [35]:
X_train_cls['TransactionMonth'] = pd.to_datetime(X_train_cls['TransactionMonth'])
X_train_cls['Year'] = X_train_cls['TransactionMonth'].dt.year
X_train_cls['Month'] = X_train_cls['TransactionMonth'].dt.month
X_train_cls.drop(columns=['TransactionMonth'], inplace=True)


In [36]:
from sklearn.preprocessing import OneHotEncoder

# Get categorical columns
categorical_cols = [
    'Citizenship', 'LegalType', 'Title', 'Language', 
    'Bank', 'AccountType', 'CoverCategory', 'CoverType', 
    'CoverGroup', 'Section', 'Product', 'StatutoryClass', 
    'StatutoryRiskType', 'VehicleAgeBucket'
]

# One-hot encode
X_train_cls = pd.get_dummies(X_train_cls, columns=categorical_cols, drop_first=True)


In [37]:
X_train_cls['IsVATRegistered'] = X_train_cls['IsVATRegistered'].astype(int)


In [38]:
X_test_cls = pd.get_dummies(X_test_cls, columns=categorical_cols, drop_first=True)
X_test_cls = X_test_cls.reindex(columns=X_train_cls.columns, fill_value=0)


In [40]:
print(X_train_cls.dtypes)


UnderwrittenCoverID                         int64
PolicyID                                    int64
IsVATRegistered                             int64
MaritalStatus                              object
Gender                                     object
                                            ...  
Product_Mobility Metered Taxis: Monthly      bool
Product_Standalone Passenger Liability       bool
VehicleAgeBucket_6-10                        bool
VehicleAgeBucket_11-15                       bool
VehicleAgeBucket_15+                         bool
Length: 131, dtype: object


## Model Evaluation

We will evaluate the regression models for Claim Severity and Premium Prediction using RMSE and R-squared metrics.  
For the classification model predicting claim occurrence, we will use accuracy, precision, recall, and F1-score.  
Finally, we will compare the performances and select the best models for each task.


 ## Evaluate Claim Severity Models

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Assuming predictions are already made: y_pred_lr_sev, y_pred_rf_sev, y_pred_xgb_sev
# And true labels: y_test_sev

def eval_regression_models(y_true, preds_dict):
    for model_name, y_pred in preds_dict.items():
        rmse = mean_squared_error(y_true, y_pred, squared=False)
        r2 = r2_score(y_true, y_pred)
        print(f"{model_name} — RMSE: {rmse:.4f}, R²: {r2:.4f}")

claim_severity_preds = {
    'Linear Regression': y_pred_lr_sev,
    'Random Forest': y_pred_rf_sev,
    'XGBoost': y_pred_xgb_sev
}

print("Claim Severity Model Performance:")
eval_regression_models(y_test_sev, claim_severity_preds)


## Evaluate Premium Prediction Models

In [None]:
premium_preds = {
    'Linear Regression': y_pred_lr_prem,
    'Random Forest': y_pred_rf_prem,
    'XGBoost': y_pred_xgb_prem
}

print("\nPremium Prediction Model Performance:")
eval_regression_models(y_test_prem, premium_preds)


 ## Evaluate Classification Model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def eval_classification_model(y_true, y_pred, model_name="Model"):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"{model_name} — Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1-score: {f1:.4f}")

print("Claim Occurrence Classification Performance:")
eval_classification_model(y_test_cls, y_pred_cls, "Logistic Regression")
# Add other classifiers if you train them


## Model Interpretability

In this section, we use SHAP (SHapley Additive exPlanations) to explain the predictions of the best-performing models for:

1. Claim Severity Prediction  
2. Premium Prediction  

We will generate feature importance plots and explore individual feature effects.  
Finally, we will identify the top 5-10 most influential features for claims severity and premium prediction.


##  Install SHAP and Prepare Data

In [None]:
# Install SHAP if not already installed
!pip install shap

import shap

# Initialize SHAP explainer (use best model, e.g., Random Forest or XGBoost for claim severity)
explainer_severity = shap.Explainer(best_claim_severity_model, X_train_sev)


## SHAP Summary Plot for Claim Severity

In [None]:
# Calculate SHAP values
shap_values_sev = explainer_severity(X_test_sev)

# Summary plot for feature importance
shap.summary_plot(shap_values_sev, X_test_sev, plot_type="bar")


## SHAP Individual Feature Effects (Claim Severity)

In [None]:
# Individual feature effects for the most important feature
shap.dependence_plot("MostImportantFeature", shap_values_sev, X_test_sev)

# Force plot for a specific instance
shap.force_plot(explainer_severity.expected_value, shap_values_sev[0, :], X_test_sev.iloc[0, :])


## SHAP Analysis for Premium Prediction

In [None]:
# Initialize SHAP explainer for premium prediction
explainer_premium = shap.Explainer(best_premium_model, X_train_prem)

# Calculate SHAP values
shap_values_prem = explainer_premium(X_test_prem)

# Summary plot for feature importance
shap.summary_plot(shap_values_prem, X_test_prem, plot_type="bar")

# Optional: Dependence plot for a specific feature
shap.dependence_plot("AnotherImportantFeature", shap_values_prem, X_test_prem)


## LIME for Local Interpretability

LIME (Local Interpretable Model-agnostic Explanations) will help us understand the predictions of individual data points for:

1. Claim Severity Prediction  
2. Premium Prediction  

We will generate explanations for specific instances to validate model behavior and identify feature contributions.


In [None]:
# Install LIME if not already installed
!pip install lime

from lime.lime_tabular import LimeTabularExplainer

# Initialize LIME explainer for Claim Severity
lime_explainer_severity = LimeTabularExplainer(
    training_data=X_train_sev.values,
    feature_names=X_train_sev.columns,
    class_names=["Claim Severity"],
    mode="regression"
)

# Initialize LIME explainer for Premium Prediction
lime_explainer_premium = LimeTabularExplainer(
    training_data=X_train_prem.values,
    feature_names=X_train_prem.columns,
    class_names=["Premium"],
    mode="regression"
)


In [None]:
# Pick a random instance from the test set
instance_idx = 10  # Example instance index
instance_severity = X_test_sev.iloc[instance_idx]

# Explain the prediction
lime_explanation_sev = lime_explainer_severity.explain_instance(
    data_row=instance_severity.values,
    predict_fn=best_claim_severity_model.predict
)

# Visualize the explanation
lime_explanation_sev.show_in_notebook(show_table=True)


In [None]:
# Pick a random instance from the test set
instance_idx = 15  # Example instance index
instance_premium = X_test_prem.iloc[instance_idx]

# Explain the prediction
lime_explanation_prem = lime_explainer_premium.explain_instance(
    data_row=instance_premium.values,
    predict_fn=best_premium_model.predict
)

# Visualize the explanation
lime_explanation_prem.show_in_notebook(show_table=True)
