In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

First, we read the data into memory. The file uses `;` as a delimiter.

In [2]:
# Load the raw training set; the CSV file is semicolon-separated.
train_data = pd.read_csv('train.csv', sep=';')

Split the data before cleaning it, so that nothing learned from the validation split can leak into the preprocessing.

In [3]:
# Hold out 40% of the rows as validation data (fixed seed for reproducibility).
# The two parts are copied explicitly: the frames returned by train_test_split are
# derived from the input frame, and assigning columns on them in the cleaning cells
# can otherwise trigger pandas' SettingWithCopyWarning.
train_data, test_data = (
    part.copy()
    for part in train_test_split(train_data, test_size=0.4, random_state=10)
)

Just checking that both splits contain a reasonable and representative number of positive (fraud) cases.

In [4]:
# Number of fraud cases ("Y") in the training split, then in the validation split.
for split in (train_data, test_data):
    print(split['fraud'].eq("Y").sum())

171
137


### Clean variables starting with claim_

#### Some interesting choices, they apply to all variables in the data

claim_time_occured -> binarized, between 07:00 and 22:00 is day, others AND missing is night

all categorical NaN values -> encode as MISSING

all numerical NaN values -> encode with extreme outlier, for example vehicle_date_inuse == 1900

high cardinality categoricals -> frequency encoding, NaN values get frequency of 0

In [5]:
# claim variables
#
# fillna results are assigned back to the column instead of calling
# `train_data[col].fillna(..., inplace=True)`: the chained in-place form is deprecated
# in recent pandas and has no effect on the frame under Copy-on-Write.

# Amounts are stored as text with a decimal comma; convert them to floats.
train_data['claim_amount'] = train_data['claim_amount'].str.replace(',', '.').astype('float64')

# Registration / occurrence dates are parsed as YYYYMMDD.
for col in ['claim_date_registered', 'claim_date_occured']:
    train_data[col] = pd.to_datetime(train_data[col], format='%Y%m%d')

# Binarise the time of the claim: 1 when the value is >= 2200 or <= 700, 0 otherwise.
# NOTE(review): comparisons against NaN are False, so a missing time ends up as 0 here,
# whereas the markdown above says missing should be treated as night -- confirm intent.
mask_night = (train_data['claim_time_occured'] >= 2200) | (train_data['claim_time_occured'] <= 700)
train_data.loc[~mask_night, 'claim_time_occured'] = 0
train_data.loc[mask_night, 'claim_time_occured'] = 1

# Frequency encoding. The counts Series is named after its source column, so the left
# merge renames the original column to '<col>_x' and appends the count as '<col>_y'.
# The explicit .rename(...) keeps that naming on pandas >= 2.0, where value_counts()
# results are called 'count' by default.
postal_code_counts = train_data['claim_postal_code'].value_counts().rename('claim_postal_code')
train_data = train_data.merge(postal_code_counts, how='left',
                              left_on='claim_postal_code', right_index=True)

train_data['claim_alcohol'] = train_data['claim_alcohol'].fillna("MISSING")

# Numeric language codes become labels; missing values get their own label.
train_data['claim_language'] = train_data['claim_language'].fillna("MISSING")
for code, label in ((1.0, "LANG A"), (2.0, "LANG B")):
    mask = train_data['claim_language'] == code
    train_data.loc[mask, 'claim_language'] = label

# Missing ids / brands are labelled "MISSING" and given a frequency of 0.
train_data['claim_vehicle_id'] = train_data['claim_vehicle_id'].fillna("MISSING")
claim_vehicle_id_count = train_data['claim_vehicle_id'].value_counts().rename('claim_vehicle_id')
claim_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(claim_vehicle_id_count, how='left',
                              left_on='claim_vehicle_id', right_index=True)

train_data['claim_vehicle_brand'] = train_data['claim_vehicle_brand'].fillna('MISSING')
claim_vehicle_brand_counts = train_data['claim_vehicle_brand'].value_counts().rename('claim_vehicle_brand')
claim_vehicle_brand_counts['MISSING'] = 0
train_data = train_data.merge(claim_vehicle_brand_counts, how='left',
                              left_on='claim_vehicle_brand', right_index=True)

train_data['claim_vehicle_type'] = train_data['claim_vehicle_type'].fillna('MISSING')

# Date in use is a YYYYMM number: missing values and values above 220000 are replaced
# by the sentinel 190001 before parsing.
train_data['claim_vehicle_date_inuse'] = train_data['claim_vehicle_date_inuse'].fillna(190001).astype(int)
mask = (train_data['claim_vehicle_date_inuse'] > 220000)
train_data.loc[mask, 'claim_vehicle_date_inuse'] = 190001
train_data['claim_vehicle_date_inuse'] = pd.to_datetime(train_data['claim_vehicle_date_inuse'].astype(str),
                                                        format='%Y%m')

# Missing numeric vehicle attributes are replaced by fixed sentinel values.
for col, sentinel in (('claim_vehicle_cyl', 10000),
                      ('claim_vehicle_load', 500),
                      ('claim_vehicle_power', 1000)):
    train_data[col] = train_data[col].fillna(sentinel)

# Numeric fuel-type codes become labels, like the language codes above.
train_data['claim_vehicle_fuel_type'] = train_data['claim_vehicle_fuel_type'].fillna('MISSING')
for code, label in ((1.0, "FUEL A"), (2.0, "FUEL B")):
    mask = train_data['claim_vehicle_fuel_type'] == code
    train_data.loc[mask, 'claim_vehicle_fuel_type'] = label

In [6]:
# policy variables
#
# Frequency-encode the policy holder columns. Each counts Series is explicitly named
# after its source column so the left merge yields '<col>_x' (original) and '<col>_y'
# (count) on every pandas version. fillna results are assigned back rather than using
# the deprecated chained `inplace=True` form.

policy_holder_id_count = train_data['policy_holder_id'].value_counts().rename('policy_holder_id')
train_data = train_data.merge(policy_holder_id_count, how='left',
                              left_on='policy_holder_id', right_index=True)

# Missing postal codes are coded as 0 and given a frequency of 0.
train_data['policy_holder_postal_code'] = train_data['policy_holder_postal_code'].fillna(0)
policy_holder_postal_code_counts = (train_data['policy_holder_postal_code']
                                    .value_counts()
                                    .rename('policy_holder_postal_code'))
policy_holder_postal_code_counts.loc[0] = 0
train_data = train_data.merge(policy_holder_postal_code_counts, how='left',
                              left_on='policy_holder_postal_code', right_index=True)

# Sentinel year for a missing birth year.
train_data['policy_holder_year_birth'] = train_data['policy_holder_year_birth'].fillna(1800)

# Missing expert ids are labelled "MISSING" and given a frequency of 0.
train_data['policy_holder_expert_id'] = train_data['policy_holder_expert_id'].fillna("MISSING")
policy_holder_expert_id_count = (train_data['policy_holder_expert_id']
                                 .value_counts()
                                 .rename('policy_holder_expert_id'))
policy_holder_expert_id_count['MISSING'] = 0
train_data = train_data.merge(policy_holder_expert_id_count, how='left',
                              left_on='policy_holder_expert_id', right_index=True)

In [7]:
# driver variables
#
# Same scheme as the other frequency encodings: counts are named after their source
# column (explicitly, so this also holds on pandas >= 2.0) and left-merged, producing
# '<col>_x' / '<col>_y'. fillna results are assigned back instead of relying on the
# deprecated chained `inplace=True` form.

driver_id_count = train_data['driver_id'].value_counts().rename('driver_id')
train_data = train_data.merge(driver_id_count, how='left',
                              left_on='driver_id', right_index=True)

# Missing postal codes are coded as 0 and given a frequency of 0.
train_data['driver_postal_code'] = train_data['driver_postal_code'].fillna(0)
driver_postal_code_count = train_data['driver_postal_code'].value_counts().rename('driver_postal_code')
driver_postal_code_count.loc[0] = 0
train_data = train_data.merge(driver_postal_code_count, how='left',
                              left_on='driver_postal_code', right_index=True)

# Sentinel year for a missing birth year.
train_data['driver_year_birth'] = train_data['driver_year_birth'].fillna(1801)

# Missing ids are labelled "MISSING" and given a frequency of 0.
train_data['driver_expert_id'] = train_data['driver_expert_id'].fillna("MISSING")
driver_expert_id_count = train_data['driver_expert_id'].value_counts().rename('driver_expert_id')
driver_expert_id_count['MISSING'] = 0
train_data = train_data.merge(driver_expert_id_count, how='left',
                              left_on='driver_expert_id', right_index=True)

train_data['driver_vehicle_id'] = train_data['driver_vehicle_id'].fillna("MISSING")
driver_vehicle_id_count = train_data['driver_vehicle_id'].value_counts().rename('driver_vehicle_id')
driver_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(driver_vehicle_id_count, how='left',
                              left_on='driver_vehicle_id', right_index=True)

In [8]:
# third party 1st variables
#
# fillna results are assigned back (the chained `inplace=True` form is deprecated and
# ineffective under Copy-on-Write); counts are explicitly named after their source
# column so the merges keep producing '<col>_x' / '<col>_y' on pandas >= 2.0.

# Missing ids are labelled "MISSING" and given a frequency of 0.
train_data['third_party_1_id'] = train_data['third_party_1_id'].fillna("MISSING")
third_party_1_id_count = train_data['third_party_1_id'].value_counts().rename('third_party_1_id')
third_party_1_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_id_count, how='left',
                              left_on='third_party_1_id', right_index=True)

# Missing postal codes are coded as 0 and given a frequency of 0.
train_data['third_party_1_postal_code'] = train_data['third_party_1_postal_code'].fillna(0)
third_party_1_postal_code_count = (train_data['third_party_1_postal_code']
                                   .value_counts()
                                   .rename('third_party_1_postal_code'))
third_party_1_postal_code_count.loc[0] = 0
train_data = train_data.merge(third_party_1_postal_code_count, how='left',
                              left_on='third_party_1_postal_code', right_index=True)

# Plain categoricals: missing becomes its own "MISSING" level.
for col in ['third_party_1_injured', 'third_party_1_vehicle_type',
            'third_party_1_form', 'third_party_1_country']:
    train_data[col] = train_data[col].fillna("MISSING")

# Sentinel year for a missing birth year.
train_data['third_party_1_year_birth'] = train_data['third_party_1_year_birth'].fillna(1802)

train_data['third_party_1_vehicle_id'] = train_data['third_party_1_vehicle_id'].fillna("MISSING")
third_party_1_vehicle_id_count = (train_data['third_party_1_vehicle_id']
                                  .value_counts()
                                  .rename('third_party_1_vehicle_id'))
third_party_1_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_vehicle_id_count, how='left',
                              left_on='third_party_1_vehicle_id', right_index=True)

train_data['third_party_1_expert_id'] = train_data['third_party_1_expert_id'].fillna("MISSING")
third_party_1_expert_id_count = (train_data['third_party_1_expert_id']
                                 .value_counts()
                                 .rename('third_party_1_expert_id'))
third_party_1_expert_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_expert_id_count, how='left',
                              left_on='third_party_1_expert_id', right_index=True)

### 3rd party 2nd and 3rd variables

These were encoded as simple 0/1 for "missing" and "available". We believe that they are too far removed from the event and this aggregation should be more than enough.

In [9]:
# third party rest of variables

# Every column of the second and third third party is reduced to a presence flag:
# 0 where the raw value was missing, 1 where something was recorded.
presence_fields = ['id', 'postal_code', 'injured', 'vehicle_type', 'form',
                   'year_birth', 'country', 'vehicle_id', 'expert_id']

for party in ('third_party_2', 'third_party_3'):
    for field in presence_fields:
        col = party + '_' + field
        # Take the mask before overwriting, otherwise the 0s would count as "present".
        mask = train_data[col].isna()
        train_data.loc[mask, col] = 0
        train_data.loc[~mask, col] = 1

In [10]:
# repair variables
#
# fillna results are assigned back (the chained `inplace=True` form is deprecated and
# ineffective under Copy-on-Write); counts are explicitly named after their source
# column so the merges keep producing '<col>_x' / '<col>_y' on pandas >= 2.0.

# Missing repair ids are labelled "MISSING" and given a frequency of 0.
train_data['repair_id'] = train_data['repair_id'].fillna("MISSING")
repair_id_count = train_data['repair_id'].value_counts().rename('repair_id')
repair_id_count["MISSING"] = 0
train_data = train_data.merge(repair_id_count, how='left',
                              left_on='repair_id', right_index=True)

# Missing postal codes are coded as 0 and given a frequency of 0.
train_data['repair_postal_code'] = train_data['repair_postal_code'].fillna(0)
repair_postal_code_count = train_data['repair_postal_code'].value_counts().rename('repair_postal_code')
repair_postal_code_count.loc[0] = 0
train_data = train_data.merge(repair_postal_code_count, how='left',
                              left_on='repair_postal_code', right_index=True)

# Plain categoricals: missing becomes its own "MISSING" level.
for col in ['repair_form', 'repair_country']:
    train_data[col] = train_data[col].fillna("MISSING")

# Sentinel year for a missing birth year.
train_data['repair_year_birth'] = train_data['repair_year_birth'].fillna(1804)

In [11]:
# final variables
#
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

# Policy dates are YYYYMM numbers; missing ones get the sentinel 180501 before parsing.
for col in ['policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed']:
    filled = train_data[col].fillna(180501)
    train_data[col] = pd.to_datetime(filled.astype(int).astype(str), format='%Y%m')

# Sentinel values for missing premium / coverage amounts.
train_data['policy_premium_100'] = train_data['policy_premium_100'].fillna(200)
train_data['policy_coverage_1000'] = train_data['policy_coverage_1000'].fillna(300)

# Frequency-encode the coverage type; "MISSING" gets a frequency of 0. The counts are
# named after the column so the merge yields 'policy_coverage_type_x' / '_y' on every
# pandas version (value_counts() results are called 'count' from pandas 2.0 on).
train_data['policy_coverage_type'] = train_data['policy_coverage_type'].fillna("MISSING")
policy_coverage_type_count = (train_data['policy_coverage_type']
                              .value_counts()
                              .rename('policy_coverage_type'))
policy_coverage_type_count["MISSING"] = 0
train_data = train_data.merge(policy_coverage_type_count, how='left',
                              left_on='policy_coverage_type', right_index=True)

### Temporal data

Temporal data was featurized into Month, Day, DayOfWeek, (Date claimed - Date Happened), DaysLeftInContract, DaysSinceContract, etc.

However, validation later showed that these variables make the model worse as they mostly correlate with claim amounts of small value. Thus, they were left out of the model.

In [12]:
# Names of all columns that belong to the second or third third party.
third_cols = train_data.columns[train_data.columns.str.contains('third_party_2|third_party_3')]

### We now do the same for the validation data. When performing frequency encoding, we use the frequencies calculated with the training data. This code could probably be optimized and modularized.

In [13]:
# claim variables
#
# Same cleaning as for the training split. The frequency tables used here
# (postal_code_counts, claim_vehicle_id_count, claim_vehicle_brand_counts) were computed
# on the training data; values not present in them come out of the left merge as NaN.
# claim_amount is converted in a later cell.
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

for col in ['claim_date_registered', 'claim_date_occured']:
    test_data[col] = pd.to_datetime(test_data[col], format='%Y%m%d')

# 1 when the value is >= 2200 or <= 700, 0 otherwise.
# NOTE(review): comparisons against NaN are False, so a missing time ends up as 0.
mask_night = (test_data['claim_time_occured'] >= 2200) | (test_data['claim_time_occured'] <= 700)
test_data.loc[~mask_night, 'claim_time_occured'] = 0
test_data.loc[mask_night, 'claim_time_occured'] = 1

# Training counts are renamed to the key column so the merge yields '<col>_x' /
# '<col>_y' regardless of how the counts Series happens to be named.
test_data = test_data.merge(postal_code_counts.rename('claim_postal_code'), how='left',
                            left_on='claim_postal_code', right_index=True)

test_data['claim_alcohol'] = test_data['claim_alcohol'].fillna("MISSING")

test_data['claim_language'] = test_data['claim_language'].fillna("MISSING")
for code, label in ((1.0, "LANG A"), (2.0, "LANG B")):
    mask = test_data['claim_language'] == code
    test_data.loc[mask, 'claim_language'] = label

for col, train_counts in (('claim_vehicle_id', claim_vehicle_id_count),
                          ('claim_vehicle_brand', claim_vehicle_brand_counts)):
    test_data[col] = test_data[col].fillna("MISSING")
    test_data = test_data.merge(train_counts.rename(col), how='left',
                                left_on=col, right_index=True)

# Missing values and values above 210001 are replaced by the sentinel 190001.
# NOTE(review): the training cell uses 220000 as the cut-off; this column is listed in
# `dropcols` later on, so the difference does not reach the model.
test_data['claim_vehicle_date_inuse'] = test_data['claim_vehicle_date_inuse'].fillna(190001.0)
mask = (test_data['claim_vehicle_date_inuse'] > 210001.0)
test_data.loc[mask, 'claim_vehicle_date_inuse'] = 190001.0
test_data['claim_vehicle_date_inuse'] = pd.to_datetime(
    test_data['claim_vehicle_date_inuse'].astype(int).astype(str), format='%Y%m')

# Missing numeric vehicle attributes are replaced by fixed sentinel values.
for col, sentinel in (('claim_vehicle_cyl', 10000),
                      ('claim_vehicle_load', 500),
                      ('claim_vehicle_power', 1000)):
    test_data[col] = test_data[col].fillna(sentinel)

test_data['claim_vehicle_type'] = test_data['claim_vehicle_type'].fillna('MISSING')

test_data['claim_vehicle_fuel_type'] = test_data['claim_vehicle_fuel_type'].fillna('MISSING')
for code, label in ((1.0, "FUEL A"), (2.0, "FUEL B")):
    mask = test_data['claim_vehicle_fuel_type'] == code
    test_data.loc[mask, 'claim_vehicle_fuel_type'] = label

In [14]:
# policy variables
#
# Frequencies come from the training-split counts; each entry is
# (column, value used for missing keys or None, training counts).
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

test_data['policy_holder_year_birth'] = test_data['policy_holder_year_birth'].fillna(1800)

for col, missing_key, train_counts in [
    ('policy_holder_id', None, policy_holder_id_count),
    ('policy_holder_postal_code', 0, policy_holder_postal_code_counts),
    ('policy_holder_expert_id', "MISSING", policy_holder_expert_id_count),
]:
    if missing_key is not None:
        test_data[col] = test_data[col].fillna(missing_key)
    # Renaming the counts to the key column makes the merge yield '<col>_x' / '<col>_y'.
    test_data = test_data.merge(train_counts.rename(col), how='left',
                                left_on=col, right_index=True)

In [15]:
# driver variables
#
# Frequencies come from the training-split counts; each entry is
# (column, value used for missing keys or None, training counts).
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

test_data['driver_year_birth'] = test_data['driver_year_birth'].fillna(1801)

for col, missing_key, train_counts in [
    ('driver_id', None, driver_id_count),
    ('driver_postal_code', 0, driver_postal_code_count),
    ('driver_expert_id', "MISSING", driver_expert_id_count),
    ('driver_vehicle_id', "MISSING", driver_vehicle_id_count),
]:
    if missing_key is not None:
        test_data[col] = test_data[col].fillna(missing_key)
    # Renaming the counts to the key column makes the merge yield '<col>_x' / '<col>_y'.
    test_data = test_data.merge(train_counts.rename(col), how='left',
                                left_on=col, right_index=True)

In [16]:
# third party 1st variables
#
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

# Plain categoricals: missing becomes its own "MISSING" level.
for col in ['third_party_1_injured', 'third_party_1_vehicle_type',
            'third_party_1_form', 'third_party_1_country']:
    test_data[col] = test_data[col].fillna("MISSING")

# Sentinel year for a missing birth year.
test_data['third_party_1_year_birth'] = test_data['third_party_1_year_birth'].fillna(1802)

# Frequencies come from the training-split counts; each entry is
# (column, value used for missing keys, training counts).
for col, missing_key, train_counts in [
    ('third_party_1_id', "MISSING", third_party_1_id_count),
    ('third_party_1_postal_code', 0, third_party_1_postal_code_count),
    ('third_party_1_vehicle_id', "MISSING", third_party_1_vehicle_id_count),
    ('third_party_1_expert_id', "MISSING", third_party_1_expert_id_count),
]:
    test_data[col] = test_data[col].fillna(missing_key)
    # Renaming the counts to the key column makes the merge yield '<col>_x' / '<col>_y'.
    test_data = test_data.merge(train_counts.rename(col), how='left',
                                left_on=col, right_index=True)

In [17]:
# third party rest of variables

# Every column of the second and third third party is reduced to a presence flag:
# 0 where the raw value was missing, 1 where something was recorded.
presence_fields = ['id', 'postal_code', 'injured', 'vehicle_type', 'form',
                   'year_birth', 'country', 'vehicle_id', 'expert_id']

for party in ('third_party_2', 'third_party_3'):
    for field in presence_fields:
        col = party + '_' + field
        # Take the mask before overwriting, otherwise the 0s would count as "present".
        mask = test_data[col].isna()
        test_data.loc[mask, col] = 0
        test_data.loc[~mask, col] = 1

In [18]:
# repair variables
#
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

# Frequencies come from the training-split counts; each entry is
# (column, value used for missing keys, training counts).
for col, missing_key, train_counts in [
    ('repair_id', "MISSING", repair_id_count),
    ('repair_postal_code', 0, repair_postal_code_count),
]:
    test_data[col] = test_data[col].fillna(missing_key)
    # Renaming the counts to the key column makes the merge yield '<col>_x' / '<col>_y'.
    test_data = test_data.merge(train_counts.rename(col), how='left',
                                left_on=col, right_index=True)

# Plain categoricals: missing becomes its own "MISSING" level.
for col in ['repair_form', 'repair_country']:
    test_data[col] = test_data[col].fillna("MISSING")

# Sentinel year for a missing birth year.
test_data['repair_year_birth'] = test_data['repair_year_birth'].fillna(1804)

In [19]:
# final variables
#
# fillna results are assigned back instead of using the deprecated chained
# `inplace=True` form, which does not modify the frame under Copy-on-Write.

# Policy dates are YYYYMM numbers; missing ones get the sentinel 180501 before parsing.
for col in ['policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed']:
    filled = test_data[col].fillna(180501)
    test_data[col] = pd.to_datetime(filled.astype(int).astype(str), format='%Y%m')

# Sentinel values for missing premium / coverage amounts.
test_data['policy_premium_100'] = test_data['policy_premium_100'].fillna(200)
test_data['policy_coverage_1000'] = test_data['policy_coverage_1000'].fillna(300)

# Coverage-type frequencies come from the training-split counts. Renaming them to the
# key column makes the merge yield 'policy_coverage_type_x' / 'policy_coverage_type_y'.
test_data['policy_coverage_type'] = test_data['policy_coverage_type'].fillna("MISSING")
test_data = test_data.merge(policy_coverage_type_count.rename('policy_coverage_type'), how='left',
                            left_on='policy_coverage_type', right_index=True)

Some missing values remain in the validation data: new ones that the pipeline above did not handle. They are set to 0.

In [20]:
# Catch-all for the validation split: whatever is still NaN at this point is set to 0
# (e.g. frequency columns for values that do not occur in the training counts, which
# the left merges above leave as NaN).
test_data.fillna(0, inplace=True)

### Drop the original columns

In [21]:
# drop cols for now
# Dropped: the claim id, the original ('_x') versions of all frequency-encoded columns
# (their counts live in the matching '_y' columns) and the parsed date columns.
dropcols = ['claim_id', 'claim_postal_code_x', 'claim_vehicle_id_x', 'claim_vehicle_brand_x',
       'policy_holder_id_x', 'policy_holder_postal_code_x',
       'policy_holder_expert_id_x', 'driver_id_x', 'driver_postal_code_x',
       'driver_expert_id_x', 'driver_vehicle_id_x', 'third_party_1_id_x',
       'third_party_1_postal_code_x', 'third_party_1_vehicle_id_x',
       'third_party_1_expert_id_x', 'repair_id_x', 'repair_postal_code_x',
       'claim_date_registered', 'claim_date_occured', 'claim_vehicle_date_inuse',
       'policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed', 'policy_coverage_type_x']

clean_data = train_data.drop(columns=dropcols)

# encode predictors
# Validation claim amounts are still decimal-comma text at this point; they are needed
# as floats for scoring, not as a model input.
test_data['claim_amount'] = test_data['claim_amount'].str.replace(',','.').astype('float64')
X_test = test_data.drop(columns=dropcols).drop(columns=['fraud', 'claim_amount'])

# Cast the third-party 2/3 presence flags to numeric, exactly as the training frame is
# cast before it is dummy-encoded. Without this, any flag column that is still of object
# dtype would be one-hot encoded here but kept as a single numeric column in training.
X_test[third_cols] = X_test[third_cols].apply(pd.to_numeric)

encoded_predictors = pd.get_dummies(X_test)

### The strategy

Our strategy is to train a Random Forest on the data. The goal is to maximize the claim amount in the 100 most probable claims, as classified by the model. In order to do so, we can upsample the high value claims, or downsample the low value cases. 

Here, we remove the fraudulent entries with a claim amount below some threshold and train the model on the remaining data. Since we are constrained to looking at only 100 cases, we really do NOT want to find claims of 500 euros when there exist cases with claim amounts of 15.000 euros, which are 30 times more valuable.

In [22]:
# find all positives with less than 8000E claim amount
indices = clean_data[(clean_data['fraud'] == 'Y') & (clean_data['claim_amount'] < 8000)].index

# target and covariates
# drop tiny amount positives
train_subset = clean_data.drop(indices)
y_train = train_subset['fraud']
X_train = train_subset.drop(columns=['fraud', 'claim_amount'])

# cast the third-party 2/3 presence flags to numeric so get_dummies leaves them alone
X_train[third_cols] = X_train[third_cols].apply(pd.to_numeric)

# encode covars
encoded_covariates = pd.get_dummies(X_train)
encoded_target = pd.get_dummies(y_train)['Y']

# Give the validation matrix exactly the training columns, in the training order.
# get_dummies is run separately on each split, so a category present in only one of
# them would otherwise shift or change the feature columns seen by the model.
# Dummy columns absent from the validation split are filled with 0; columns unknown
# to the model are dropped.
aligned_predictors = encoded_predictors.reindex(columns=encoded_covariates.columns, fill_value=0)

######################
### run model ver ####
######################

# run model a
rf = RandomForestClassifier(n_estimators = 200, random_state=10)
rf.fit(encoded_covariates, encoded_target)

# predict: keep only the probability of the positive class (column 1) and rank the
# validation claims from most to least likely fraud
predictions = rf.predict_proba(aligned_predictors)
pred_df = pd.DataFrame(predictions)
pred_df['claim_id'] = test_data['claim_id'].values
pred_df = pred_df.drop(columns=0).sort_values(by=1, ascending=False)

# find true positives
mask = (test_data['fraud'] == 'Y')
#keep amount and ID
frauds = test_data.loc[mask, ['claim_id', 'claim_amount']]

# Score = fraud amount found among the 100 highest-ranked claims divided by the total
# fraud amount in the validation split. Non-fraud claims get NaN amounts from the right
# merge, which sum() skips.
amount_in_top_100 = frauds.merge(pred_df.iloc[:100, :], how='right', on='claim_id')['claim_amount'].sum()
amount_total = frauds.merge(pred_df, how='right', on='claim_id')['claim_amount'].sum()
score = amount_in_top_100 / amount_total
print('Random forest score: ', score)

Random forest score:  0.6251959705168381


### Another model

We also try a Gradient Boosting classifier. However, the Random Forest appeared to be more robust and easier to work with.

In [None]:
# Imports are executed once here instead of on every pass through the loop.
# (GridSearchCV is imported as in the original cell but is not used below.)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# True frauds of the validation split (id + amount); this does not depend on `val`.
mask = (test_data['fraud'] == 'Y')
frauds = test_data.loc[mask, ['claim_id', 'claim_amount']]

for val in [500, 2000, 4000, 6000, 8000, 10000, 12000]:
    # find all positives with a claim amount below the current threshold `val`
    indices = clean_data[(clean_data['fraud'] == 'Y') & (clean_data['claim_amount'] < val)].index

    # target and covariates
    # drop tiny amount positives
    train_subset = clean_data.drop(indices)
    y_train = train_subset['fraud']
    X_train = train_subset.drop(columns=['fraud', 'claim_amount'])

    # cast the third-party 2/3 presence flags to numeric so get_dummies leaves them alone
    X_train[third_cols] = X_train[third_cols].apply(pd.to_numeric)

    # encode covars
    encoded_covariates = pd.get_dummies(X_train)
    encoded_target = pd.get_dummies(y_train)['Y']

    # Give the validation matrix exactly the training columns, in the training order;
    # get_dummies runs separately per split, so the two column sets are not guaranteed
    # to match (dropping rows above can also remove a category from the training side).
    aligned_predictors = encoded_predictors.reindex(columns=encoded_covariates.columns, fill_value=0)

    # run model b
    gbc = GradientBoostingClassifier(n_estimators = 90, learning_rate = 0.01, max_depth=4, random_state=10)
    gbc.fit(encoded_covariates, encoded_target)

    # predict: keep the positive-class probability and rank claims by it
    predictions = gbc.predict_proba(aligned_predictors)
    pred_df = pd.DataFrame(predictions)
    pred_df['claim_id'] = test_data['claim_id'].values
    pred_df = pred_df.drop(columns=0).sort_values(by=1, ascending=False)

    # Score = fraud amount found among the 100 highest-ranked claims divided by the
    # total fraud amount in the validation split.
    amount_in_top_100 = frauds.merge(pred_df.iloc[:100, :], how='right', on='claim_id')['claim_amount'].sum()
    amount_total = frauds.merge(pred_df, how='right', on='claim_id')['claim_amount'].sum()
    score = amount_in_top_100 / amount_total
    print('val: ', val, 'GB score: ', score)

val:  500 GB score:  0.47201892837636733


In [None]:
# `gbc` is the plain GradientBoostingClassifier fitted in the loop above, not a
# GridSearchCV wrapper, so it has no `best_params_` / `best_score_` attributes and the
# original expression raises AttributeError. Show the hyper-parameters of the last
# fitted model instead.
gbc.get_params()