In [1]:
#Step 1: Data Preparation
#1.1 Handling Missing Data

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', low_memory=False)


# Handling Missing Data
imputer = SimpleImputer(strategy='mean')
data['TotalPremium'] = imputer.fit_transform(data[['TotalPremium']])
data['TotalClaims'] = imputer.fit_transform(data[['TotalClaims']])

# Drop rows with missing values in categorical columns
data.dropna(subset=['Province', 'PostalCode', 'Gender'], inplace=True)


In [2]:
#1.2 Feature Engineering
# Convert TransactionMonth to datetime and extract relevant features
data['TransactionMonth'] = pd.to_datetime(data['TransactionMonth'], errors='coerce')
data['TransactionYear'] = data['TransactionMonth'].dt.year
data['TransactionMonth'] = data['TransactionMonth'].dt.month

# Feature Engineering: Example - Creating a new feature for vehicle age
data['VehicleAge'] = 2024 - data['RegistrationYear']

# Example - Creating a new feature for total claims ratio
data['ClaimsRatio'] = data['TotalClaims'] / data['TotalPremium']
data['ClaimsRatio'].replace([np.inf, -np.inf], 0, inplace=True)
data['ClaimsRatio'].fillna(0, inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClaimsRatio'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClaimsRatio'].fillna(0, inplace=True)


In [3]:
#1.3 Encoding Categorical Data
# Encoding Categorical Data
categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'CoverType']

# One-hot encoding
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)



In [4]:
#1.4 Handling Non-Numeric Values
# Ensure all columns are numeric and handle non-numeric values
data = data.apply(pd.to_numeric, errors='coerce').fillna(0)


In [None]:
#1.4 Train-Test Split
# Drop irrelevant or non-numeric columns before splitting
data.drop(['UnderwrittenCoverID', 'PolicyID', 'Country', 'MainCrestaZone', 'SubCrestaZone',
           'make', 'Model', 'bodytype', 'Title', 'Language', 'Bank', 'AccountType', 'LegalType', 'MaritalStatus'], axis=1, inplace=True)

# Train-Test Split
X = data.drop(['TotalPremium', 'TotalClaims'], axis=1)
y = data['TotalPremium']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Step 2: Modeling Techniques
#2.1 Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluation
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"Linear Regression - MSE: {mse_lr}, R2: {r2_lr}")


In [None]:
#2.2 Random Forest
from sklearn.ensemble import RandomForestRegressor

# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluation
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest - MSE: {mse_rf}, R2: {r2_rf}")


In [None]:
#2.3 XGBoost
import xgboost as xgb

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Evaluation
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost - MSE: {mse_xgb}, R2: {r2_xgb}")


In [None]:
#Step 3: Feature Importance Analysis
# Feature Importance for Random Forest
importances_rf = rf_model.feature_importances_
feature_names = X.columns
feature_importances_rf = pd.Series(importances_rf, index=feature_names).sort_values(ascending=False)

print("Feature Importances for Random Forest:")
print(feature_importances_rf)

# Feature Importance for XGBoost
importances_xgb = xgb_model.feature_importances_
feature_importances_xgb = pd.Series(importances_xgb, index=feature_names).sort_values(ascending=False)

print("Feature Importances for XGBoost:")
print(feature_importances_xgb)


In [None]:
#Step 4: Model Evaluation and Comparison
# Comparison of model performance
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
    'MSE': [mse_lr, mse_rf, mse_xgb],
    'R2': [r2_lr, r2_rf, r2_xgb]
})

print("Model Comparison:")
print(results)
