In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw training claims; the CSV uses ';' as its field separator.

train_data = pd.read_csv('train.csv', sep=';')

In [3]:
# claim variables
#
# Cleans the claim-level columns of the training frame and, for the postal
# code, vehicle id and vehicle brand keys, appends a frequency column through
# a left merge with that key's value counts. Each merge leaves the key as
# `<key>_x` and the count as `<key>_y`.
#
# Two robustness points:
# * Missing values are filled with `df[col] = df[col].fillna(...)`. The chained
#   `df[col].fillna(..., inplace=True)` form is deprecated and does not update
#   the frame once pandas Copy-on-Write is active.
# * Every count series is explicitly named after its key and its index name is
#   cleared. pandas >= 2.0 names `value_counts()` results "count", which would
#   change the merged column names that later cells refer to.

# Amounts are stored with a decimal comma; swap it for a dot before casting.
train_data['claim_amount'] = train_data['claim_amount'].str.replace(',', '.').astype('float64')

for date_col in ('claim_date_registered', 'claim_date_occured'):
    train_data[date_col] = pd.to_datetime(train_data[date_col], format='%Y%m%d')

# Turn the occurrence time into a 0/1 flag: 1 when the value is >= 2200 or
# <= 700 (presumably HHMM, i.e. a night-time claim -- confirm), else 0.
mask_night = (train_data['claim_time_occured'] >= 2200) | (train_data['claim_time_occured'] <= 700)
train_data.loc[~mask_night, 'claim_time_occured'] = 0
train_data.loc[mask_night, 'claim_time_occured'] = 1

postal_code_counts = (train_data['claim_postal_code'].value_counts()
                      .rename('claim_postal_code').rename_axis(None))
train_data = train_data.merge(postal_code_counts, how='left',
                              left_on='claim_postal_code', right_index=True)

train_data['claim_alcohol'] = train_data['claim_alcohol'].fillna("MISSING")

# Language codes 1.0 / 2.0 become labels; missing stays "MISSING".
train_data['claim_language'] = train_data['claim_language'].fillna("MISSING")
mask = train_data['claim_language'] == 1.0
train_data.loc[mask, 'claim_language'] = "LANG A"
mask = train_data['claim_language'] == 2.0
train_data.loc[mask, 'claim_language'] = "LANG B"

# Rows without a vehicle id / brand get the "MISSING" key, whose count is
# forced to 0 so that absence is not treated as a frequent value.
train_data['claim_vehicle_id'] = train_data['claim_vehicle_id'].fillna("MISSING")
claim_vehicle_id_count = (train_data['claim_vehicle_id'].value_counts()
                          .rename('claim_vehicle_id').rename_axis(None))
claim_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(claim_vehicle_id_count, how='left',
                              left_on='claim_vehicle_id', right_index=True)

train_data['claim_vehicle_brand'] = train_data['claim_vehicle_brand'].fillna('MISSING')
claim_vehicle_brand_counts = (train_data['claim_vehicle_brand'].value_counts()
                              .rename('claim_vehicle_brand').rename_axis(None))
claim_vehicle_brand_counts['MISSING'] = 0
train_data = train_data.merge(claim_vehicle_brand_counts, how='left',
                              left_on='claim_vehicle_brand', right_index=True)

train_data['claim_vehicle_type'] = train_data['claim_vehicle_type'].fillna('MISSING')

# In-use date is a YYYYMM number: missing values and values above 220000 are
# replaced by the placeholder 190001 before parsing.
train_data['claim_vehicle_date_inuse'] = train_data['claim_vehicle_date_inuse'].fillna(190001)
train_data['claim_vehicle_date_inuse'] = train_data['claim_vehicle_date_inuse'].astype(int)
mask = (train_data['claim_vehicle_date_inuse'] > 220000)
train_data.loc[mask, 'claim_vehicle_date_inuse'] = 190001
train_data['claim_vehicle_date_inuse'] = pd.to_datetime(train_data['claim_vehicle_date_inuse'].astype(str),
                                                        format='%Y%m')

# Numeric vehicle attributes: missing values get fixed placeholder numbers.
train_data['claim_vehicle_cyl'] = train_data['claim_vehicle_cyl'].fillna(10000)

train_data['claim_vehicle_load'] = train_data['claim_vehicle_load'].fillna(500)

# Fuel codes 1.0 / 2.0 become labels; missing stays "MISSING".
train_data['claim_vehicle_fuel_type'] = train_data['claim_vehicle_fuel_type'].fillna('MISSING')
mask = train_data['claim_vehicle_fuel_type'] == 1.0
train_data.loc[mask, 'claim_vehicle_fuel_type'] = "FUEL A"
mask = train_data['claim_vehicle_fuel_type'] == 2.0
train_data.loc[mask, 'claim_vehicle_fuel_type'] = "FUEL B"

train_data['claim_vehicle_power'] = train_data['claim_vehicle_power'].fillna(1000)

In [4]:
# policy variables
#
# Adds frequency columns for the policy holder id, postal code and expert id
# (left merge with each key's value counts; the key ends up as `<key>_x`, the
# count as `<key>_y`) and fills the missing birth year with the placeholder 1800.
#
# Fills use plain assignment instead of chained `inplace=True`, which is
# deprecated and ineffective under pandas Copy-on-Write. Count series are named
# after their key explicitly because pandas >= 2.0 would otherwise name them
# "count" and change the merged column names.

policy_holder_id_count = (train_data['policy_holder_id'].value_counts()
                          .rename('policy_holder_id').rename_axis(None))
train_data = train_data.merge(policy_holder_id_count, how='left',
                              left_on='policy_holder_id', right_index=True)

# Missing postal codes become key 0, whose count is forced to 0.
train_data['policy_holder_postal_code'] = train_data['policy_holder_postal_code'].fillna(0)
policy_holder_postal_code_counts = (train_data['policy_holder_postal_code'].value_counts()
                                    .rename('policy_holder_postal_code').rename_axis(None))
policy_holder_postal_code_counts.loc[0] = 0
train_data = train_data.merge(policy_holder_postal_code_counts, how='left',
                              left_on='policy_holder_postal_code', right_index=True)

train_data['policy_holder_year_birth'] = train_data['policy_holder_year_birth'].fillna(1800)

# Missing expert ids become key "MISSING", whose count is forced to 0.
train_data['policy_holder_expert_id'] = train_data['policy_holder_expert_id'].fillna("MISSING")
policy_holder_expert_id_count = (train_data['policy_holder_expert_id'].value_counts()
                                 .rename('policy_holder_expert_id').rename_axis(None))
policy_holder_expert_id_count['MISSING'] = 0
train_data = train_data.merge(policy_holder_expert_id_count, how='left',
                              left_on='policy_holder_expert_id', right_index=True)

In [5]:
# driver variables
#
# Adds frequency columns for the driver id, postal code, expert id and vehicle
# id (left merge with each key's value counts; key -> `<key>_x`, count ->
# `<key>_y`) and fills the missing birth year with the placeholder 1801.
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Count series are named after their
# key explicitly so the merged column names do not depend on the pandas version.

driver_id_count = (train_data['driver_id'].value_counts()
                   .rename('driver_id').rename_axis(None))
train_data = train_data.merge(driver_id_count, how='left',
                              left_on='driver_id', right_index=True)

# Missing postal codes become key 0, whose count is forced to 0.
train_data['driver_postal_code'] = train_data['driver_postal_code'].fillna(0)
driver_postal_code_count = (train_data['driver_postal_code'].value_counts()
                            .rename('driver_postal_code').rename_axis(None))
driver_postal_code_count.loc[0] = 0
train_data = train_data.merge(driver_postal_code_count, how='left',
                              left_on='driver_postal_code', right_index=True)

train_data['driver_year_birth'] = train_data['driver_year_birth'].fillna(1801)

# Missing ids become key "MISSING", whose count is forced to 0.
train_data['driver_expert_id'] = train_data['driver_expert_id'].fillna("MISSING")
driver_expert_id_count = (train_data['driver_expert_id'].value_counts()
                          .rename('driver_expert_id').rename_axis(None))
driver_expert_id_count['MISSING'] = 0
train_data = train_data.merge(driver_expert_id_count, how='left',
                              left_on='driver_expert_id', right_index=True)

train_data['driver_vehicle_id'] = train_data['driver_vehicle_id'].fillna("MISSING")
driver_vehicle_id_count = (train_data['driver_vehicle_id'].value_counts()
                           .rename('driver_vehicle_id').rename_axis(None))
driver_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(driver_vehicle_id_count, how='left',
                              left_on='driver_vehicle_id', right_index=True)

In [6]:
# third party 1st variables
#
# Adds frequency columns for the first third party's id, postal code, vehicle
# id and expert id (left merge with each key's value counts; key -> `<key>_x`,
# count -> `<key>_y`) and fills the remaining descriptive columns with
# placeholders ("MISSING" for labels, 1802 for the birth year).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Count series are named after their
# key explicitly so the merged column names do not depend on the pandas version.

# Missing ids become key "MISSING", whose count is forced to 0.
train_data['third_party_1_id'] = train_data['third_party_1_id'].fillna("MISSING")
third_party_1_id_count = (train_data['third_party_1_id'].value_counts()
                          .rename('third_party_1_id').rename_axis(None))
third_party_1_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_id_count, how='left',
                              left_on='third_party_1_id', right_index=True)

# Missing postal codes become key 0, whose count is forced to 0. `.loc` makes
# the label-based assignment explicit, matching the other postal-code cells.
train_data['third_party_1_postal_code'] = train_data['third_party_1_postal_code'].fillna(0)
third_party_1_postal_code_count = (train_data['third_party_1_postal_code'].value_counts()
                                   .rename('third_party_1_postal_code').rename_axis(None))
third_party_1_postal_code_count.loc[0] = 0
train_data = train_data.merge(third_party_1_postal_code_count, how='left',
                              left_on='third_party_1_postal_code', right_index=True)

train_data['third_party_1_injured'] = train_data['third_party_1_injured'].fillna("MISSING")

train_data['third_party_1_vehicle_type'] = train_data['third_party_1_vehicle_type'].fillna("MISSING")

train_data['third_party_1_form'] = train_data['third_party_1_form'].fillna("MISSING")

train_data['third_party_1_year_birth'] = train_data['third_party_1_year_birth'].fillna(1802)

train_data['third_party_1_country'] = train_data['third_party_1_country'].fillna("MISSING")

train_data['third_party_1_vehicle_id'] = train_data['third_party_1_vehicle_id'].fillna("MISSING")
third_party_1_vehicle_id_count = (train_data['third_party_1_vehicle_id'].value_counts()
                                  .rename('third_party_1_vehicle_id').rename_axis(None))
third_party_1_vehicle_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_vehicle_id_count, how='left',
                              left_on='third_party_1_vehicle_id', right_index=True)

train_data['third_party_1_expert_id'] = train_data['third_party_1_expert_id'].fillna("MISSING")
third_party_1_expert_id_count = (train_data['third_party_1_expert_id'].value_counts()
                                 .rename('third_party_1_expert_id').rename_axis(None))
third_party_1_expert_id_count["MISSING"] = 0
train_data = train_data.merge(third_party_1_expert_id_count, how='left',
                              left_on='third_party_1_expert_id', right_index=True)

In [7]:
# third party rest of variables
#
# Every column describing the 2nd and 3rd third party is collapsed into a
# presence flag: 1 where a value was recorded, 0 where it was missing.
#
# The flags are built with `.notna().astype(int)` in one loop rather than
# eighteen copy-pasted mask/assign stanzas. Besides removing the duplication,
# this gives each column an integer dtype straight away instead of leaving 0/1
# values in what may be an object column.

third_party_fields = ('id', 'postal_code', 'injured', 'vehicle_type', 'form',
                      'year_birth', 'country', 'vehicle_id', 'expert_id')

for party in (2, 3):
    for field in third_party_fields:
        flag_col = f'third_party_{party}_{field}'
        # Assigning to an existing column keeps its position in the frame.
        train_data[flag_col] = train_data[flag_col].notna().astype(int)

In [8]:
# repair variables
#
# Adds frequency columns for the repair id and repair postal code (left merge
# with each key's value counts; key -> `<key>_x`, count -> `<key>_y`) and fills
# the remaining repair columns with placeholders ("MISSING" for labels, 1804
# for the birth year).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Count series are named after their
# key explicitly so the merged column names do not depend on the pandas version.

# Missing ids become key "MISSING", whose count is forced to 0.
train_data['repair_id'] = train_data['repair_id'].fillna("MISSING")
repair_id_count = (train_data['repair_id'].value_counts()
                   .rename('repair_id').rename_axis(None))
repair_id_count["MISSING"] = 0
train_data = train_data.merge(repair_id_count, how='left',
                              left_on='repair_id', right_index=True)

# Missing postal codes become key 0, whose count is forced to 0.
train_data['repair_postal_code'] = train_data['repair_postal_code'].fillna(0)
repair_postal_code_count = (train_data['repair_postal_code'].value_counts()
                            .rename('repair_postal_code').rename_axis(None))
repair_postal_code_count.loc[0] = 0
train_data = train_data.merge(repair_postal_code_count, how='left',
                              left_on='repair_postal_code', right_index=True)

train_data['repair_form'] = train_data['repair_form'].fillna("MISSING")

train_data['repair_year_birth'] = train_data['repair_year_birth'].fillna(1804)

train_data['repair_country'] = train_data['repair_country'].fillna("MISSING")

In [9]:
# final variables
#
# Parses the three YYYYMM policy dates (missing -> placeholder 180501), fills
# the missing premium / coverage amounts with fixed placeholders and adds a
# frequency column for the coverage type (key -> `policy_coverage_type_x`,
# count -> `policy_coverage_type_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). The count series is named after its
# key explicitly so the merged column names do not depend on the pandas version.

for date_col in ('policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed'):
    # Cast through int so the string handed to the parser is "YYYYMM", not "YYYYMM.0".
    yyyymm = train_data[date_col].fillna(180501).astype(int).astype(str)
    train_data[date_col] = pd.to_datetime(yyyymm, format='%Y%m')

train_data['policy_premium_100'] = train_data['policy_premium_100'].fillna(200)

train_data['policy_coverage_1000'] = train_data['policy_coverage_1000'].fillna(300)

# Missing coverage types become key "MISSING", whose count is forced to 0.
train_data['policy_coverage_type'] = train_data['policy_coverage_type'].fillna("MISSING")
policy_coverage_type_count = (train_data['policy_coverage_type'].value_counts()
                              .rename('policy_coverage_type').rename_axis(None))
policy_coverage_type_count["MISSING"] = 0
train_data = train_data.merge(policy_coverage_type_count, how='left',
                              left_on='policy_coverage_type', right_index=True)

In [10]:
# Names of every column belonging to the 2nd or 3rd third party.
third_cols = train_data.columns[train_data.columns.str.contains('third_party_2|third_party_3')]

In [11]:
# Columns excluded from the model for now: the claim id, the raw key columns
# that were replaced by frequency counts (they carry the `_x` merge suffix),
# and the parsed date columns.

_counted_keys = [
    'claim_postal_code', 'claim_vehicle_id', 'claim_vehicle_brand',
    'policy_holder_id', 'policy_holder_postal_code', 'policy_holder_expert_id',
    'driver_id', 'driver_postal_code', 'driver_expert_id', 'driver_vehicle_id',
    'third_party_1_id', 'third_party_1_postal_code',
    'third_party_1_vehicle_id', 'third_party_1_expert_id',
    'repair_id', 'repair_postal_code',
]
_date_columns = [
    'claim_date_registered', 'claim_date_occured', 'claim_vehicle_date_inuse',
    'policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed',
]

dropcols = (
    ['claim_id']
    + [f'{key}_x' for key in _counted_keys]
    + _date_columns
    + ['policy_coverage_type_x']
)

clean_data = train_data.drop(columns=dropcols)

In [12]:
# target and covariates
#
# Fraud rows with a claim amount below 500 are left out of the training set;
# the target is the `fraud` column and the covariates are everything else
# except `claim_amount`.

is_fraud = clean_data['fraud'] == 'Y'
indices = clean_data[is_fraud & (clean_data['claim_amount'] < 500)].index
double = clean_data[is_fraud & (clean_data['claim_amount'] > 2500)]

# Frame with the high-amount fraud rows appended a second time.
# NOTE(review): no later cell visible here reads `oversampled_data`.
oversampled_data = pd.concat([clean_data, double])

training_rows = clean_data.drop(indices)
y = training_rows['fraud']
X = training_rows.drop(columns=['fraud', 'claim_amount'])

In [13]:
# Make sure the 2nd/3rd third-party flag columns are numeric so they are not
# one-hot encoded later.

for flag_col in third_cols:
    X[flag_col] = pd.to_numeric(X[flag_col])

In [14]:
# Remember which training covariates are object-typed, to compare against
# the test set later.

mask = X.dtypes.eq(object)
train_obj = X.columns[mask.values]

In [15]:
# One-hot encode the covariates; for the target keep only the 'Y' indicator
# column of its dummy encoding.

encoded_covariates = pd.get_dummies(data=X)
target_dummies = pd.get_dummies(data=y)
encoded_target = target_dummies['Y']

In [16]:
# run model
#
# Fit a 250-tree random forest on the encoded training data. `random_state`
# is fixed so that a Restart & Run All reproduces the same forest and the
# same predictions; without it every run differs.

rf = RandomForestClassifier(n_estimators=250, random_state=42)
rf.fit(encoded_covariates, encoded_target)

RandomForestClassifier(n_estimators=250, random_state=42)

In [17]:
# save for sanity check
# Keep the encoded training column labels so the encoded test columns can be
# compared against them before predicting.

train_cols = encoded_covariates.columns

In [18]:
# Load the raw test claims; the CSV uses ';' as its field separator.

test_data = pd.read_csv('test.csv', sep=';')

In [19]:
# claim variables (test set)
#
# Applies the claim-level cleaning to the test frame. Frequency columns are
# taken from the count series computed on the training data, so keys that are
# absent from those series get NaN from the left merge. The claim amount is
# not converted here.
#
# Two robustness points:
# * Missing values are filled with `df[col] = df[col].fillna(...)`. The chained
#   `df[col].fillna(..., inplace=True)` form is deprecated and does not update
#   the frame once pandas Copy-on-Write is active.
# * Each count series is renamed to its key (and its index name cleared) at
#   merge time, so the merged columns are `<key>_x` / `<key>_y` whatever name
#   `value_counts()` gave the series (pandas >= 2.0 uses "count").

for date_col in ('claim_date_registered', 'claim_date_occured'):
    test_data[date_col] = pd.to_datetime(test_data[date_col], format='%Y%m%d')

# 0/1 flag: 1 when the occurrence time is >= 2200 or <= 700, else 0.
mask_night = (test_data['claim_time_occured'] >= 2200) | (test_data['claim_time_occured'] <= 700)
test_data.loc[~mask_night, 'claim_time_occured'] = 0
test_data.loc[mask_night, 'claim_time_occured'] = 1

test_data = test_data.merge(postal_code_counts.rename('claim_postal_code').rename_axis(None),
                            how='left', left_on='claim_postal_code', right_index=True)

test_data['claim_alcohol'] = test_data['claim_alcohol'].fillna("MISSING")

# Language codes 1.0 / 2.0 become labels; missing stays "MISSING".
test_data['claim_language'] = test_data['claim_language'].fillna("MISSING")
mask = test_data['claim_language'] == 1.0
test_data.loc[mask, 'claim_language'] = "LANG A"
mask = test_data['claim_language'] == 2.0
test_data.loc[mask, 'claim_language'] = "LANG B"

test_data['claim_vehicle_id'] = test_data['claim_vehicle_id'].fillna("MISSING")
test_data = test_data.merge(claim_vehicle_id_count.rename('claim_vehicle_id').rename_axis(None),
                            how='left', left_on='claim_vehicle_id', right_index=True)

test_data['claim_vehicle_brand'] = test_data['claim_vehicle_brand'].fillna('MISSING')
test_data = test_data.merge(claim_vehicle_brand_counts.rename('claim_vehicle_brand').rename_axis(None),
                            how='left', left_on='claim_vehicle_brand', right_index=True)

# In-use date is a YYYYMM number: missing values and values above 210001 are
# replaced by the placeholder 190001 before parsing.
# NOTE(review): the training cell uses 220000 as the cut-off; the column is
# dropped before modelling, so the difference does not reach the model.
test_data['claim_vehicle_date_inuse'] = test_data['claim_vehicle_date_inuse'].fillna(190001.0)
mask = (test_data['claim_vehicle_date_inuse'] > 210001.0)
test_data.loc[mask, 'claim_vehicle_date_inuse'] = 190001.0
test_data['claim_vehicle_date_inuse'] = pd.to_datetime(test_data['claim_vehicle_date_inuse'].astype(int).astype(str),
                                                       format='%Y%m')

# Numeric vehicle attributes: missing values get fixed placeholder numbers.
test_data['claim_vehicle_cyl'] = test_data['claim_vehicle_cyl'].fillna(10000)

test_data['claim_vehicle_load'] = test_data['claim_vehicle_load'].fillna(500)

# Fuel codes 1.0 / 2.0 become labels; missing stays "MISSING".
test_data['claim_vehicle_fuel_type'] = test_data['claim_vehicle_fuel_type'].fillna('MISSING')
mask = test_data['claim_vehicle_fuel_type'] == 1.0
test_data.loc[mask, 'claim_vehicle_fuel_type'] = "FUEL A"
mask = test_data['claim_vehicle_fuel_type'] == 2.0
test_data.loc[mask, 'claim_vehicle_fuel_type'] = "FUEL B"

test_data['claim_vehicle_type'] = test_data['claim_vehicle_type'].fillna('MISSING')

test_data['claim_vehicle_power'] = test_data['claim_vehicle_power'].fillna(1000)

In [20]:
# policy variables (test set)
#
# Fills the policy-holder columns with the same placeholders as the training
# cell and attaches the frequency columns from the training-set count series
# (left merge; key -> `<key>_x`, count -> `<key>_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Each count series is renamed to its
# key at merge time so the merged column names do not depend on the pandas
# version.

policy_fills = {
    'policy_holder_postal_code': 0,
    'policy_holder_year_birth': 1800,
    'policy_holder_expert_id': "MISSING",
}
for column, placeholder in policy_fills.items():
    test_data[column] = test_data[column].fillna(placeholder)

# Merge order is kept (id, postal code, expert id): it fixes the order of the
# appended `_y` columns.
policy_counts = [
    ('policy_holder_id', policy_holder_id_count),
    ('policy_holder_postal_code', policy_holder_postal_code_counts),
    ('policy_holder_expert_id', policy_holder_expert_id_count),
]
for key, counts in policy_counts:
    test_data = test_data.merge(counts.rename(key).rename_axis(None), how='left',
                                left_on=key, right_index=True)

In [21]:
# driver variables (test set)
#
# Fills the driver columns with the same placeholders as the training cell and
# attaches the frequency columns from the training-set count series (left
# merge; key -> `<key>_x`, count -> `<key>_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Each count series is renamed to its
# key at merge time so the merged column names do not depend on the pandas
# version.

driver_fills = {
    'driver_postal_code': 0,
    'driver_year_birth': 1801,
    'driver_expert_id': "MISSING",
    'driver_vehicle_id': "MISSING",
}
for column, placeholder in driver_fills.items():
    test_data[column] = test_data[column].fillna(placeholder)

# Merge order is kept (id, postal code, expert id, vehicle id): it fixes the
# order of the appended `_y` columns.
driver_counts = [
    ('driver_id', driver_id_count),
    ('driver_postal_code', driver_postal_code_count),
    ('driver_expert_id', driver_expert_id_count),
    ('driver_vehicle_id', driver_vehicle_id_count),
]
for key, counts in driver_counts:
    test_data = test_data.merge(counts.rename(key).rename_axis(None), how='left',
                                left_on=key, right_index=True)

In [22]:
# third party 1st variables (test set)
#
# Fills the first third party's columns with the same placeholders as the
# training cell and attaches the frequency columns from the training-set count
# series (left merge; key -> `<key>_x`, count -> `<key>_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Each count series is renamed to its
# key at merge time so the merged column names do not depend on the pandas
# version.

third_party_1_fills = {
    'third_party_1_id': "MISSING",
    'third_party_1_postal_code': 0,
    'third_party_1_injured': "MISSING",
    'third_party_1_vehicle_type': "MISSING",
    'third_party_1_form': "MISSING",
    'third_party_1_year_birth': 1802,
    'third_party_1_country': "MISSING",
    'third_party_1_vehicle_id': "MISSING",
    'third_party_1_expert_id': "MISSING",
}
for column, placeholder in third_party_1_fills.items():
    test_data[column] = test_data[column].fillna(placeholder)

# Merge order is kept (id, postal code, vehicle id, expert id): it fixes the
# order of the appended `_y` columns.
third_party_1_counts = [
    ('third_party_1_id', third_party_1_id_count),
    ('third_party_1_postal_code', third_party_1_postal_code_count),
    ('third_party_1_vehicle_id', third_party_1_vehicle_id_count),
    ('third_party_1_expert_id', third_party_1_expert_id_count),
]
for key, counts in third_party_1_counts:
    test_data = test_data.merge(counts.rename(key).rename_axis(None), how='left',
                                left_on=key, right_index=True)

In [23]:
# third party rest of variables (test set)
#
# Every column describing the 2nd and 3rd third party is collapsed into a
# presence flag: 1 where a value was recorded, 0 where it was missing.
#
# The flags are built with `.notna().astype(int)` in one loop rather than
# eighteen copy-pasted mask/assign stanzas. This also gives each column an
# integer dtype here, so these columns stay out of the one-hot encoding
# without depending on a later dtype conversion.
# NOTE(review): the original appears to rely on the frame-wide `fillna(0)`
# further down to turn object columns holding 0/1 into numeric ones -- confirm.

third_party_fields = ('id', 'postal_code', 'injured', 'vehicle_type', 'form',
                      'year_birth', 'country', 'vehicle_id', 'expert_id')

for party in (2, 3):
    for field in third_party_fields:
        flag_col = f'third_party_{party}_{field}'
        # Assigning to an existing column keeps its position in the frame.
        test_data[flag_col] = test_data[flag_col].notna().astype(int)

In [24]:
# repair variables (test set)
#
# Fills the repair columns with the same placeholders as the training cell and
# attaches the frequency columns from the training-set count series (left
# merge; key -> `<key>_x`, count -> `<key>_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). Each count series is renamed to its
# key at merge time so the merged column names do not depend on the pandas
# version.

repair_fills = {
    'repair_id': "MISSING",
    'repair_postal_code': 0,
    'repair_form': "MISSING",
    'repair_year_birth': 1804,
    'repair_country': "MISSING",
}
for column, placeholder in repair_fills.items():
    test_data[column] = test_data[column].fillna(placeholder)

# Merge order is kept (id, postal code): it fixes the order of the appended
# `_y` columns.
repair_counts = [
    ('repair_id', repair_id_count),
    ('repair_postal_code', repair_postal_code_count),
]
for key, counts in repair_counts:
    test_data = test_data.merge(counts.rename(key).rename_axis(None), how='left',
                                left_on=key, right_index=True)

In [25]:
# final variables (test set)
#
# Parses the three YYYYMM policy dates (missing -> placeholder 180501), fills
# the missing premium / coverage amounts with fixed placeholders and attaches
# the coverage-type frequency from the training-set count series (key ->
# `policy_coverage_type_x`, count -> `policy_coverage_type_y`).
#
# Fills use plain assignment instead of chained `inplace=True` (deprecated,
# and a no-op under pandas Copy-on-Write). The count series is renamed to its
# key at merge time so the merged column names do not depend on the pandas
# version.

for date_col in ('policy_date_start', 'policy_date_next_expiry', 'policy_date_last_renewed'):
    # Cast through int so the string handed to the parser is "YYYYMM", not "YYYYMM.0".
    yyyymm = test_data[date_col].fillna(180501).astype(int).astype(str)
    test_data[date_col] = pd.to_datetime(yyyymm, format='%Y%m')

test_data['policy_premium_100'] = test_data['policy_premium_100'].fillna(200)

test_data['policy_coverage_1000'] = test_data['policy_coverage_1000'].fillna(300)

test_data['policy_coverage_type'] = test_data['policy_coverage_type'].fillna("MISSING")
test_data = test_data.merge(policy_coverage_type_count.rename('policy_coverage_type').rename_axis(None),
                            how='left', left_on='policy_coverage_type', right_index=True)

In [26]:
# Replace every remaining NaN in the test frame with 0. The left merges with
# the training-set count series leave NaN for keys that are absent from those
# series, so this gives such keys a count of 0.
# NOTE(review): a frame-wide fillna can also change column dtypes depending on
# the pandas version -- confirm the object-column check below still passes.
test_data.fillna(0, inplace=True)

In [27]:
# Remove the same columns that were excluded from the training covariates.

X_test = test_data.drop(dropcols, axis=1)

In [28]:
# Object-typed test covariates, for comparison with the training ones.
mask = X_test.dtypes.eq(object)
test_obj = X_test.columns[mask.values]

In [29]:
# Sanity check: element-wise comparison of the object-typed column names in
# the test and training covariates; every entry should be True.
test_obj == train_obj

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [30]:
# predictors
# One-hot encode the test covariates. The dummy columns are derived from the
# values present in the test data, so the result is compared against the
# training columns in the following cells before it is used.

encoded_predictors = pd.get_dummies(X_test)

In [31]:
# check
# The encoded test frame must have as many columns as the encoded training
# frame; display both shapes side by side.

pred_cols = encoded_predictors.columns 
train_cols.shape, pred_cols.shape

((112,), (112,))

In [32]:
# Count the positions at which the training and test column labels agree; the
# result should equal the total number of columns, i.e. same names in the
# same order.

(train_cols == pred_cols).sum()

112

In [33]:
# Class probabilities from the fitted forest for every test row.
predictions = rf.predict_proba(encoded_predictors)

In [34]:
# Wrap the probability array in a DataFrame indexed by claim id; the columns
# keep their default integer labels (0, 1, ...). Note that this rebinds
# `predictions` from the raw array to the DataFrame.
predictions = pd.DataFrame(predictions, index=test_data['claim_id'])
#predictions.head(20)

In [35]:
# Write column 1 of the probabilities, with the claim id index, to CSV.
# NOTE(review): column 1 is presumably the fraud ('Y') class -- confirm
# against `rf.classes_`.
predictions[1].to_csv('test_proba_8.csv', index=True)