In [1]:
#Step 1: Data Preparation
#1.1 Handling Missing Data

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [2]:
# Load the data
data = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', engine='python')

In [3]:
# Convert data types to more memory-efficient formats
for col in data.select_dtypes(include=['float64']).columns:
    data[col] = pd.to_numeric(data[col], downcast='float')

for col in data.select_dtypes(include=['int64']).columns:
    data[col] = pd.to_numeric(data[col], downcast='integer')

In [4]:
#1.2 Feature Engineering
# Convert TransactionMonth to datetime and extract relevant features
data['TransactionMonth'] = pd.to_datetime(data['TransactionMonth'], errors='coerce')
data['TransactionYear'] = data['TransactionMonth'].dt.year
data['TransactionMonth'] = data['TransactionMonth'].dt.month

# Feature Engineering
data['VehicleAge'] = 2024 - data['RegistrationYear']
data['ProfitMarginRatio'] = data['TotalPremium'] - data['TotalClaims']
data['ProfitMarginRatio'] = data['ProfitMarginRatio'].replace([np.inf, -np.inf], 0)
data['ProfitMarginRatio'] = data['ProfitMarginRatio'].fillna(0)


In [5]:

# Handling Missing Data
imputer = SimpleImputer(strategy='mean')
data['TotalPremium'] = imputer.fit_transform(data[['TotalPremium']])
data['TotalClaims'] = imputer.fit_transform(data[['TotalClaims']])

# Drop rows with missing values in categorical columns
data.dropna(subset=['Province', 'PostalCode', 'Gender'], inplace=True)

In [6]:
# Drop irrelevant or non-numeric columns before encoding
data.drop(['UnderwrittenCoverID', 'PolicyID', 'Country', 'MainCrestaZone', 'SubCrestaZone',
           'make', 'Model', 'bodytype', 'Title', 'Language', 'Bank', 'AccountType', 'LegalType', 'MaritalStatus'], axis=1, inplace=True)

In [7]:
#1.3 Encoding Categorical Data
# Encoding Categorical Data
categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'CoverType']



In [8]:
# Reduce cardinality by grouping infrequent categories (if necessary)
for feature in categorical_features:
    top_categories = data[feature].value_counts().nlargest(10).index
    data[feature] = np.where(data[feature].isin(top_categories), data[feature], 'Other')

In [9]:
# One-hot encoding with memory efficiency
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

# Ensure all columns are numeric and handle non-numeric values
data = data.apply(pd.to_numeric, errors='coerce').fillna(0)

In [10]:
#1.4 Train-Test Split
# Train-Test Split
X = data.drop(['TotalPremium', 'TotalClaims'], axis=1)
y = data['TotalPremium']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Verify the transformations
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Sample feature names: {X.columns[:5]}")

X_train shape: (693393, 69)
X_test shape: (297169, 69)
Sample feature names: Index(['TransactionMonth', 'IsVATRegistered', 'Citizenship', 'ItemType',
       'mmcode'],
      dtype='object')


In [12]:
#Step 2: Modeling Techniques
#2.1 Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluation
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression - MSE: {mse_lr}, R2: {r2_lr}")


Linear Regression - MSE: 34337.5633913358, R2: 0.270038211581488


In [13]:
#2.2 Random Forest
from sklearn.ensemble import RandomForestRegressor

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - MSE: {mse_rf}, R2: {r2_rf}")



Random Forest - MSE: 3014.1336803693066, R2: 0.9359243290859125


In [14]:
#2.3 XGBoost
import xgboost as xgb

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost - MSE: {mse_xgb}, R2: {r2_xgb}")



XGBoost - MSE: 19536.46847941468, R2: 0.584685864046667


In [15]:
#Step 3: Feature Importance Analysis
# Feature Importance for Random Forest
importances_rf = rf_model.feature_importances_
feature_names = X.columns
feature_importances_rf = pd.Series(importances_rf, index=feature_names).sort_values(ascending=False)

print("Feature Importances for Random Forest:")
print(feature_importances_rf)

# Feature Importance for XGBoost
importances_xgb = xgb_model.feature_importances_
feature_importances_xgb = pd.Series(importances_xgb, index=feature_names).sort_values(ascending=False)

print("Feature Importances for XGBoost:")
print(feature_importances_xgb)



Feature Importances for Random Forest:
ProfitMarginRatio           0.728814
CalculatedPremiumPerTerm    0.118814
SumInsured                  0.092487
TransactionMonth            0.035077
TransactionYear             0.024431
                              ...   
WrittenOff                  0.000000
NewVehicle                  0.000000
TrackingDevice              0.000000
AlarmImmobiliser            0.000000
CrossBorder                 0.000000
Length: 69, dtype: float64
Feature Importances for XGBoost:
Cylinders               0.412889
SumInsured              0.186915
ProfitMarginRatio       0.154176
TransactionMonth        0.090620
CoverType_Own Damage    0.079213
                          ...   
CoverCategory           0.000000
CoverGroup              0.000000
Section                 0.000000
Product                 0.000000
Converted               0.000000
Length: 69, dtype: float32


In [16]:
#Step 4: Model Evaluation and Comparison
# Comparison of model performance
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'MSE': [mse_lr, mse_rf, mse_xgb],
    'R2': [r2_lr, r2_rf, r2_xgb]
})

print("Model Comparison:")
print(results)



Model Comparison:
               Model           MSE        R2
0  Linear Regression  34337.563391  0.270038
1      Random Forest   3014.133680  0.935924
2            XGBoost  19536.468479  0.584686
