In [1]:
#Step 1: Data Preparation
#1.1 Handling Missing Data

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the data
data = pd.read_csv('../data/MachineLearningRating_v3.txt', sep='|', low_memory=False)


# Handling Missing Data
imputer = SimpleImputer(strategy='mean')
data['TotalPremium'] = imputer.fit_transform(data[['TotalPremium']])
data['TotalClaims'] = imputer.fit_transform(data[['TotalClaims']])

# Drop rows with missing values in categorical columns
data.dropna(subset=['Province', 'PostalCode', 'Gender'], inplace=True)


In [2]:
#1.2 Feature Engineering
# Convert TransactionMonth to datetime and extract relevant features
data['TransactionMonth'] = pd.to_datetime(data['TransactionMonth'], errors='coerce')
data['TransactionYear'] = data['TransactionMonth'].dt.year
data['TransactionMonth'] = data['TransactionMonth'].dt.month

# Feature Engineering: Example - Creating a new feature for vehicle age
data['VehicleAge'] = 2024 - data['RegistrationYear']

# Example - Creating a new feature for total claims ratio
data['ClaimsRatio'] = data['TotalClaims'] / data['TotalPremium']
data['ClaimsRatio'].replace([np.inf, -np.inf], 0, inplace=True)
data['ClaimsRatio'].fillna(0, inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClaimsRatio'].replace([np.inf, -np.inf], 0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['ClaimsRatio'].fillna(0, inplace=True)


In [3]:
#1.3 Encoding Categorical Data
# Encoding Categorical Data
categorical_features = ['Province', 'PostalCode', 'Gender', 'VehicleType', 'CoverType']

# One-hot encoding
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)



In [4]:
#1.4 Handling Non-Numeric Values
# Ensure all columns are numeric and handle non-numeric values
data = data.apply(pd.to_numeric, errors='coerce').fillna(0)


In [None]:
#1.4 Train-Test Split
# Drop irrelevant or non-numeric columns before splitting
data.drop(['UnderwrittenCoverID', 'PolicyID', 'Country', 'MainCrestaZone', 'SubCrestaZone',
           'make', 'Model', 'bodytype', 'Title', 'Language', 'Bank', 'AccountType', 'LegalType', 'MaritalStatus'], axis=1, inplace=True)

# Train-Test Split
X = data.drop(['TotalPremium', 'TotalClaims'], axis=1)
y = data['TotalPremium']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#