## Task 4: Insurance Claim Prediction

## Exploratory Data Analysis

In [1]:
## To have a brief exploration on the dataset: high level statistic
## Import csv into pandas dataframe
import pandas as pd

train_df = pd.read_csv("./2024_DS_data_mocked_pol_merge_claim.csv")
test_df = pd.read_csv("./2024_DS_data_mocked_test.csv")

In [2]:
## Take only available data in test_df as train_df
## Adding target variable into the column list
column_list = test_df.columns.to_list()
column_list.append("incurred_amount")
train_df = train_df[column_list]

In [3]:
## To check the basic information of the dataset
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3529 entries, 0 to 3528
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   mocked_policy_no       3529 non-null   object 
 1   subclass               3529 non-null   int64  
 2   policy_effective_date  3529 non-null   object 
 3   policy_expiry_date     3529 non-null   object 
 4   net_premium            3529 non-null   float64
 5   discount_amount        3073 non-null   float64
 6   insured_gender         3521 non-null   object 
 7   nationality            3073 non-null   object 
 8   policyholder_postcode  3529 non-null   int64  
 9   height                 3073 non-null   float64
 10  weight                 3073 non-null   float64
 11  marital_status         3045 non-null   object 
 12  insured_birthyear      3529 non-null   object 
 13  occupation             3045 non-null   object 
 14  incurred_amount        2968 non-null   float64
dtypes: f

In [4]:
## To check the numerical data's statistic
train_df.describe()

Unnamed: 0,subclass,net_premium,discount_amount,policyholder_postcode,height,weight,incurred_amount
count,3529.0,3529.0,3073.0,3529.0,3073.0,3073.0,2968.0
mean,514.004534,9484.225123,482.853791,18015.400963,132.795802,43.592818,7834.680617
std,2.394126,7080.669003,666.481126,18810.019985,41.619461,28.646362,20898.568502
min,505.0,0.0,0.0,10100.0,35.0,2.51,0.0
25%,514.0,4420.0,0.0,10240.0,95.0,14.5,800.0
50%,514.0,7793.0,0.0,10510.0,156.0,50.0,1351.0
75%,514.0,14437.0,789.0,12000.0,165.0,64.0,1785.25
max,525.0,47636.0,2750.5,93190.0,190.0,180.0,250000.0


In [5]:
## To check missing values
train_df.isnull().sum()

mocked_policy_no           0
subclass                   0
policy_effective_date      0
policy_expiry_date         0
net_premium                0
discount_amount          456
insured_gender             8
nationality              456
policyholder_postcode      0
height                   456
weight                   456
marital_status           484
insured_birthyear          0
occupation               484
incurred_amount          561
dtype: int64

In [6]:
## To drop any rows without target variable: no meaning value if target variable is null.
train_df = train_df.dropna(subset=['incurred_amount'])

In [7]:
## To test if dropping occupation null value
testing_df = train_df.dropna(subset=['occupation'])
testing_df.isnull().sum()

mocked_policy_no         0
subclass                 0
policy_effective_date    0
policy_expiry_date       0
net_premium              0
discount_amount          0
insured_gender           0
nationality              0
policyholder_postcode    0
height                   0
weight                   0
marital_status           0
insured_birthyear        0
occupation               0
incurred_amount          0
dtype: int64

#### Finding :
Since dropping occupation null values will clean all the other null values in other columns, I will suspect this is not a random null value. Hence, I will create a null value indicator as one of the feature to test if it is significant to be a predicting feature.

## Feature Engineering

#### 1. Create null indicator as feature

In [8]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd

## To create a new column 'null_indicator'
train_df['null_indicator'] = train_df['occupation'].isnull().map({True: 'Yes', False: 'No'})

In [9]:
## To fit ANOVA model
model = ols('incurred_amount ~ C(null_indicator)', data=train_df).fit()

## To perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                         sum_sq      df         F   PR(>F)
C(null_indicator)  2.543831e+09     1.0  5.833942  0.01578
Residual           1.293294e+12  2966.0       NaN      NaN


#### Finding :
null indicator's P value is less than 0.05 means that it is significant in predicting incurred amount.

#### 2. Extract birthyear & get age_group as feature

In [10]:
import datetime

## To extract birth year from insured_birthyear column
train_df['insured_birthyear'] = pd.to_datetime(train_df['insured_birthyear'])
train_df['birth_year'] = train_df['insured_birthyear'].dt.year

current_year = datetime.datetime.now().year
train_df['age'] = current_year - train_df['birth_year']
train_df['age_group'] = pd.cut(train_df['age'], bins=[0, 18, 30, 50, 70, 100], labels=['0-18', '19-30', '31-50', '51-70', '71-100'])

  train_df['insured_birthyear'] = pd.to_datetime(train_df['insured_birthyear'])


In [11]:
## To fit ANOVA model
model = ols('incurred_amount ~ C(age_group)', data=train_df).fit()

## To perform ANOVA
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

                    sum_sq      df         F        PR(>F)
C(age_group)  2.147887e+10     4.0  12.48508  4.502669e-10
Residual      1.274359e+12  2963.0       NaN           NaN


#### Finding :
age_group's P value is less than 0.05 means that it is significant in predicting incurred amount.

#### 3. Missing value handling

In [12]:
## To replace categorical variable missing values with mode value
categorical_columns = ['subclass','insured_gender','nationality','policyholder_postcode','marital_status','occupation','age_group']
for col in categorical_columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    train_df[col] = train_df[col].astype('str')

## To replace categorical variable missing values with mean value
numerical_columns = ['net_premium','discount_amount','height','weight']
for col in numerical_columns:
    train_df[col] = train_df[col].fillna(train_df[col].mean())

In [13]:
## To check missing values
train_df.isnull().sum()

mocked_policy_no         0
subclass                 0
policy_effective_date    0
policy_expiry_date       0
net_premium              0
discount_amount          0
insured_gender           0
nationality              0
policyholder_postcode    0
height                   0
weight                   0
marital_status           0
insured_birthyear        0
occupation               0
incurred_amount          0
null_indicator           0
birth_year               0
age                      0
age_group                0
dtype: int64

#### 3. Remove outliers

In [14]:
import numpy as np
Q1 = np.percentile(train_df['net_premium'], 25)
Q3 = np.percentile(train_df['net_premium'], 75)

IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
print(f'lower_bound : {lower_bound}, upper_bound : {upper_bound}')

## To remove outier on net_premium column
train_df = train_df[train_df['net_premium'] < upper_bound]

lower_bound : -10483.0, upper_bound : 31309.0


#### 4. Data normalization - log transformation

In [15]:
from scipy.stats import kurtosis
from scipy.stats import skew

#To check the skewness and kurtosis of the independent variable
for col in numerical_columns:
    skewness = skew(train_df[col])
    kurt = kurtosis(train_df[col])
    print(f'{col}: \nskewness : {skewness} \nkurtosis : {kurt}')

net_premium: 
skewness : 0.6492945386690216 
kurtosis : -0.45625450177406135
discount_amount: 
skewness : 1.249017977704119 
kurtosis : 0.7139095052090085
height: 
skewness : -0.5692550454347352 
kurtosis : -0.9881646729017737
weight: 
skewness : 0.3966932261342633 
kurtosis : -0.2926590754446816


## Finding :
All the skewness are within -0.5 and 0.5 except discount_amount. Hence, log transformation will be applied.

In [16]:
## Apply log transformation on discount_amount
train_df['log_discount_amount'] = np.log(train_df['discount_amount'] + 1)
print(skew(train_df['log_discount_amount']))

-0.12138656799415577


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['log_discount_amount'] = np.log(train_df['discount_amount'] + 1)


## Mutivariate data analysis

#### 1. Multicolinearity

In [17]:
## Check the multicollinearity for numerical data
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_df = train_df[numerical_columns]
vif_df.fillna(vif_df.mean(), inplace=True)
X = vif_df.assign(const=1)
pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vif_df.fillna(vif_df.mean(), inplace=True)


net_premium         1.395849
discount_amount     1.252226
height              4.832201
weight              4.701737
const              28.048903
dtype: float64

#### 2. Feature Importance / Relevancy & Feature Selection

In [18]:
model = ols('incurred_amount ~ C(null_indicator) + C(subclass) + net_premium + discount_amount + C(insured_gender) + C(nationality) + C(policyholder_postcode) + height + weight + C(marital_status) + C(age_group) + C(occupation)', data=train_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:        incurred_amount   R-squared:                       0.274
Model:                            OLS   Adj. R-squared:                  0.229
Method:                 Least Squares   F-statistic:                     6.163
Date:                Thu, 22 Feb 2024   Prob (F-statistic):          7.85e-102
Time:                        11:30:49   Log-Likelihood:                -32893.
No. Observations:                2936   AIC:                         6.613e+04
Df Residuals:                    2766   BIC:                         6.714e+04
Df Model:                         169                                         
Covariance Type:            nonrobust                                         
                                                                                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------

## Findings :
We will remove discount_amount, insured_gender, nationality and weight as the p-value is greater than 0.05 which indicates ont significant in predicting incurred amount
Hence, the features selected:
1. null_indicator - categorical
2. subclass - categorical
3. marital_status - categorical
4. net_premium - numerical
5. policyholder_postcode - categorical
6. height - numerical
7. age_group - categorical
8. occupation - categorical

## Model Training

In [19]:
import mlflow
## Start MLflow for model tracing
# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("Insurance ML Model")

2024/02/22 11:30:49 INFO mlflow.tracking.fluent: Experiment with name 'Insurance ML Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/189262570475027135', creation_time=1708572649597, experiment_id='189262570475027135', last_update_time=1708572649597, lifecycle_stage='active', name='Insurance ML Model', tags={}>

#### 1. Encoding categorical data

In [20]:
train_df = train_df.reset_index(drop=True)

In [21]:
from sklearn.preprocessing import LabelEncoder
import joblib

def save_encoder(column, filename):
    encoder = LabelEncoder()
    encoder.fit(column)
    joblib.dump(encoder, filename)

categorical_columns_list = ['subclass', 'policyholder_postcode', 'marital_status', 'age_group', 'null_indicator', 'occupation']

for col in categorical_columns_list:
    save_encoder(train_df[col], f'./encoder/{col}_encoder.pkl')

# Create the LabelEncoder object
subclass_encoder = joblib.load('./encoder/subclass_encoder.pkl')
policyholder_postcode_encoder = joblib.load('./encoder/policyholder_postcode_encoder.pkl')
marital_status_encoder = joblib.load('./encoder/marital_status_encoder.pkl')
age_group_encoder = joblib.load('./encoder/age_group_encoder.pkl')
null_indicator_encoder = joblib.load('./encoder/null_indicator_encoder.pkl')
occupation_encoder = joblib.load('./encoder/occupation_encoder.pkl')

# Fit and transform the categorical data
train_df['subclass'] = subclass_encoder.fit_transform(train_df['subclass'])
train_df['policyholder_postcode'] = policyholder_postcode_encoder.fit_transform(train_df['policyholder_postcode'])
train_df['marital_status'] = marital_status_encoder.fit_transform(train_df['marital_status'])
train_df['age_group'] = age_group_encoder.fit_transform(train_df['age_group'])
train_df['null_indicator'] = age_group_encoder.fit_transform(train_df['null_indicator'])
train_df['occupation'] = age_group_encoder.fit_transform(train_df['occupation'])

#### 2. Split training data

In [22]:
from sklearn.model_selection import train_test_split

X = train_df[['subclass','net_premium','policyholder_postcode','height','marital_status','age_group','null_indicator','occupation']]
y = train_df['incurred_amount']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print the shapes of the resulting sets to verify the split
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (2055, 8)
X_test shape: (881, 8)
y_train shape: (2055,)
y_test shape: (881,)


#### 3. Compare and Select base Model
- SVM
- Random Forest
- XGBoost

In [23]:
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.metrics import mean_squared_error

# Define the black box function to optimize.
def black_box_function(C, gamma):
    # C: SVC hyper parameter to optimize for.
    model = svm.SVR(kernel='rbf', C = C, gamma= gamma)
    model.fit(X_train, y_train)
    # y_prob = model.predict_proba(X_test_tfidf)
    # f = roc_auc_score(y_test, y_prob, multi_class='ovr')
    y_pred = model.predict(X_test)
    errors = mean_squared_error(y_test, y_pred)
    return - errors
# Set range of C to optimize for.
# bayes_opt requires this to be a dictionary.
pbounds = {"C": [0.01, 10], 'gamma': [0.01, 10]}
# Create a BayesianOptimization optimizer,
# and optimize the given black_box_function.
svm_optimizer = BayesianOptimization(f = black_box_function,
                                 pbounds = pbounds, verbose = 2,
                                 random_state = 4,
                                 allow_duplicate_points=True)
svm_optimizer.maximize(init_points = 5, n_iter = 50)
print("Best result: {}; f(x) = {}.".format(svm_optimizer.max["params"], svm_optimizer.max["target"]))

|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m1        [0m | [0m-3.84e+08[0m | [0m9.671    [0m | [0m5.477    [0m |
| [95m2        [0m | [95m-3.84e+08[0m | [95m9.727    [0m | [95m7.151    [0m |
| [95m3        [0m | [95m-3.839e+0[0m | [95m6.98     [0m | [95m2.169    [0m |
| [0m4        [0m | [0m-3.839e+0[0m | [0m9.763    [0m | [0m0.07224  [0m |
| [95m5        [0m | [95m-3.838e+0[0m | [95m2.537    [0m | [95m4.354    [0m |
| [95m6        [0m | [95m-3.837e+0[0m | [95m0.01     [0m | [95m7.377    [0m |
| [0m7        [0m | [0m-3.837e+0[0m | [0m0.01     [0m | [0m10.0     [0m |
| [95m8        [0m | [95m-3.837e+0[0m | [95m0.01     [0m | [95m0.01     [0m |
| [0m9        [0m | [0m-3.837e+0[0m | [0m0.01     [0m | [0m3.068    [0m |
| [0m10       [0m | [0m-3.837e+0[0m | [0m0.01     [0m | [0m5.196    [0m |
| [0m11       [0m | [0m-3.837e+0[0m | [0m0.01     [0m 

In [24]:
svm_classifier = svm.SVR(kernel='rbf', C= svm_optimizer.max["params"]['C'], gamma=svm_optimizer.max["params"]['gamma'])
svm_classifier.fit(X_train, y_train)

In [25]:
from sklearn.model_selection import cross_val_score, KFold

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [26]:
# Perform cross-validation
scores = cross_val_score(svm_classifier, X_test, y_test, cv=kf, scoring='r2')

# Calculate and print the mean and standard deviation of the scores
r2_mean_score = scores.mean()
r2_std_deviation = scores.std()
print(f"Mean Score: {r2_mean_score:.2f}")
print(f"Standard Deviation: {r2_std_deviation:.2f}")

Mean Score: -0.10
Standard Deviation: 0.03


#### Save and trace SVM model in MLflow

In [27]:
svm_params = {
    "kernel": "rbf",
    "C": svm_optimizer.max["params"]['C'],
    "gamma": svm_optimizer.max["params"]['gamma'],
}

In [28]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(svm_params)

    # Log the loss metric
    mlflow.log_metric("r2", r2_mean_score)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic SVM model for incurred amount prediction - regression")

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=svm_classifier,
        artifact_path="insurance_model",
        input_example=X_train,
        registered_model_name="tracking-svm-v1.0",
    )

Successfully registered model 'tracking-svm-v1.0'.
2024/02/22 11:31:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-svm-v1.0, version 1
Created version '1' of model 'tracking-svm-v1.0'.


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Define the black box function to optimize.
def black_box_function(n_estimators, max_depth, min_sample_split):
    # C: SVC hyper parameter to optimize for.
    model = RandomForestRegressor(random_state=42, n_estimators= int(n_estimators), max_depth= int(max_depth), min_samples_split= int(min_sample_split))
    model.fit(X_train, y_train)
    # y_prob = model.predict_proba(X_test_tfidf)
    # f = roc_auc_score(y_test, y_prob, multi_class='ovr')
    y_pred = model.predict(X_test)
    errors = mean_squared_error(y_test, y_pred)
    return - errors
# Set range of C to optimize for.
# bayes_opt requires this to be a dictionary.
pbounds = {"n_estimators": [200, 5000], 'max_depth': [10, 100], 'min_sample_split': [2, 10]}
# Create a BayesianOptimization optimizer,
# and optimize the given black_box_function.
rf_optimizer = BayesianOptimization(f = black_box_function,
                                 pbounds = pbounds, verbose = 2,
                                 random_state = 4)
rf_optimizer.maximize(init_points = 5, n_iter = 50)
print("Best result: {}; f(x) = {}.".format(rf_optimizer.max["params"], rf_optimizer.max["target"]))

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m-2.749e+0[0m | [0m97.03    [0m | [0m6.378    [0m | [0m4.869e+03[0m |
| [95m2        [0m | [95m-2.732e+0[0m | [95m74.33    [0m | [95m7.582    [0m | [95m1.237e+03[0m |
| [0m3        [0m | [0m-2.808e+0[0m | [0m97.86    [0m | [0m2.05     [0m | [0m1.414e+03[0m |
| [95m4        [0m | [95m-2.718e+0[0m | [95m49.13    [0m | [95m8.235    [0m | [95m1.149e+03[0m |
| [95m5        [0m | [95m-2.705e+0[0m | [95m87.67    [0m | [95m9.867    [0m | [95m986.4    [0m |
| [0m6        [0m | [0m-2.725e+0[0m | [0m10.0     [0m | [0m2.0      [0m | [0m801.4    [0m |
| [95m7        [0m | [95m-2.686e+0[0m | [95m100.0    [0m | [95m10.0     [0m | [95m472.7    [0m |
| [0m8        [0m | [0m-2.719e+0[0m | [0m99.05    [0m | [0m8.183    [0m | [0m475.9    [0m |
| [0m9        [0m | [0m-2.75e+08[0m 

In [30]:
rf_classifier = RandomForestRegressor(random_state=42, n_estimators= int(rf_optimizer.max["params"]['n_estimators']), min_samples_split= int(rf_optimizer.max["params"]['min_sample_split']), max_depth= int(rf_optimizer.max["params"]['max_depth']))
rf_classifier.fit(X_train, y_train)

In [31]:
from sklearn.model_selection import cross_val_score, KFold

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

In [32]:
# Perform cross-validation
scores = cross_val_score(rf_classifier, X_test, y_test, cv=kf, scoring='r2')

# Calculate and print the mean and standard deviation of the scores
r2_mean_score = scores.mean()
r2_std_deviation = scores.std()
print(f"Mean Score: {r2_mean_score:.2f}")
print(f"Standard Deviation: {r2_std_deviation:.2f}")

Mean Score: 0.15
Standard Deviation: 0.09


#### Save and trace RF model in MLflow

In [33]:
rf_params = {
    "random_state": 42,
    "n_estimators": int(rf_optimizer.max["params"]['n_estimators']),
    "min_samples_split": int(rf_optimizer.max["params"]['min_sample_split']),
    "max_depth": int(rf_optimizer.max["params"]['max_depth']),
}

In [40]:
# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(rf_params)

    # Log the loss metric
    mlflow.log_metric("r2", r2_mean_score)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic RF model for incurred amount prediction - regression")

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=rf_classifier,
        artifact_path="insurance_model",
        input_example=X_train,
        registered_model_name="tracking-rf-v1.0",
    )

Registered model 'tracking-rf-v1.0' already exists. Creating a new version of this model...
2024/02/22 11:35:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-rf-v1.0, version 2
Created version '2' of model 'tracking-rf-v1.0'.


## Concluding Model and train model with whole dataset

In [43]:
## Conclude to use RF model
final_rf_classifier = RandomForestRegressor(random_state=42, n_estimators= int(rf_optimizer.max["params"]['n_estimators']), min_samples_split= int(rf_optimizer.max["params"]['min_sample_split']), max_depth= int(rf_optimizer.max["params"]['max_depth']))
final_rf_classifier.fit(X, y)

## Model Interpretability

In [53]:
# Get feature importances
importances = final_rf_classifier.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]
column_names = X.columns

# Print feature ranking
print("Feature importance:")
for f in range(X.shape[1]):
    print(f"{f + 1}. {column_names[indices[f]]} ({importances[indices[f]]})")

Feature importance:
1. net_premium (0.401732369277261)
2. height (0.18746455630138584)
3. occupation (0.18252519887963095)
4. policyholder_postcode (0.1618850187908545)
5. subclass (0.03640118786271308)
6. age_group (0.012330338208376786)
7. marital_status (0.011967878595499767)
8. null_indicator (0.005693452084278016)


## Predict test dataset

In [44]:
import pandas as pd
test_df = pd.read_csv('./2024_DS_data_mocked_test.csv')

#### To perform data preprocessing and feature engineering as in training stage

In [45]:
## To create a new column 'null_indicator'
test_df['null_indicator'] = test_df['occupation'].isnull().map({True: 'Yes', False: 'No'})

## To extract birth year from insured_birthyear column
test_df['insured_birthyear'] = pd.to_datetime(test_df['insured_birthyear'])
test_df['birth_year'] = test_df['insured_birthyear'].dt.year
current_year = datetime.datetime.now().year
test_df['age'] = current_year - test_df['birth_year']
test_df['age_group'] = pd.cut(test_df['age'], bins=[0, 18, 30, 50, 70, 100], labels=['0-18', '19-30', '31-50', '51-70', '71-100'])

## To replace categorical variable missing values with mode value
categorical_columns = ['subclass','insured_gender','nationality','policyholder_postcode','marital_status','occupation','age_group']
for col in categorical_columns:
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])
    test_df[col] = test_df[col].astype('str')

## To replace categorical variable missing values with mean value
numerical_columns = ['net_premium','discount_amount','height','weight']
for col in numerical_columns:
    test_df[col] = test_df[col].fillna(test_df[col].mean())

## Apply log transformation on discount_amount
test_df['log_discount_amount'] = np.log(test_df['discount_amount'] + 1)

  test_df['insured_birthyear'] = pd.to_datetime(test_df['insured_birthyear'])


In [46]:
def save_encoder(column, filename):
    encoder = LabelEncoder()
    encoder.fit(column)
    joblib.dump(encoder, filename)

categorical_columns_list = ['subclass', 'policyholder_postcode', 'marital_status', 'age_group', 'null_indicator', 'occupation']

for col in categorical_columns_list:
    save_encoder(test_df[col], f'./encoder/{col}_encoder.pkl')

# Create the LabelEncoder object
subclass_encoder = joblib.load('./encoder/subclass_encoder.pkl')
policyholder_postcode_encoder = joblib.load('./encoder/policyholder_postcode_encoder.pkl')
marital_status_encoder = joblib.load('./encoder/marital_status_encoder.pkl')
age_group_encoder = joblib.load('./encoder/age_group_encoder.pkl')
null_indicator_encoder = joblib.load('./encoder/null_indicator_encoder.pkl')
occupation_encoder = joblib.load('./encoder/occupation_encoder.pkl')

# Fit and transform the categorical data
test_df['subclass'] = subclass_encoder.fit_transform(test_df['subclass'])
test_df['policyholder_postcode'] = policyholder_postcode_encoder.fit_transform(test_df['policyholder_postcode'])
test_df['marital_status'] = marital_status_encoder.fit_transform(test_df['marital_status'])
test_df['age_group'] = age_group_encoder.fit_transform(test_df['age_group'])
test_df['null_indicator'] = age_group_encoder.fit_transform(test_df['null_indicator'])
test_df['occupation'] = age_group_encoder.fit_transform(test_df['occupation'])

In [47]:
## Predict the total_incurred_amount for the test data
test_X = test_df[['subclass','net_premium','policyholder_postcode','height','marital_status','age_group','null_indicator','occupation']]
predictions = final_rf_classifier.predict(test_X)
test_df['total_incurred_amount'] = predictions

In [48]:
## Save answer to csv
answer = test_df[['mocked_policy_no','total_incurred_amount']]
answer.to_csv('answer.csv',index=False)

#### Or you can predict using MLflow model artifact:

In [41]:
import mlflow
logged_model = 'runs:/3d3406f2f5aa43b29ffd0bfcec1a3e30/insurance_model'

# Load model as a PyFuncModel
loaded_model = mlflow.pyfunc.load_model(logged_model)
predictions = loaded_model.predict(test_X)