## Import the source data

In [1]:
import pandas as pd
import numpy as np

In [2]:
PATH = '~/Documents/To transfer out/job applications/QBE/QBE_Data_Science_Challenge/'
fnames = ['smetradesmanliability.csv','data_dictionary.xlsx']

In [3]:
df = pd.read_csv(f'{PATH}{fnames[0]}')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Source System,Product,Underwriting Year,Effective Date,Expiry Date,Transaction Type,Public Liability Limit,Employers Liability Limit,Tools Sum Insured,...,Location,Public_Liability_Limit_5000000.2,Public_Liability_Limit_5000000.3,Professional_Indemnity_Limit_g,Risk_Postcode2,TotalEmployees,Claim Count,Claim Incurred,Capped Incurred (£50k),Capped Incurred (£100k)
0,0,Simply Business,Maltings Tradesman,2014,19/09/2014 01:00,18/09/2015 01:00,New Business,1000000,0,0.0,...,BRIGHTON,0,0,1,BN21 3,1,0,0.0,0.0,0.0
1,1,Simply Business,Maltings Tradesman,2014,08/10/2014 01:00,07/10/2015 01:00,New Business,5000000,0,0.0,...,BRISTOL,1,0,0,BS15 4,1,0,0.0,0.0,0.0
2,2,Simply Business,Maltings Tradesman,2014,18/09/2014 01:00,17/09/2015 01:00,New Business,5000000,10000000,4000.0,...,NORTHAMPTON,1,0,0,NN10 8,8,0,0.0,0.0,0.0
3,3,Simply Business,Maltings Tradesman,2014,03/10/2014 01:00,02/10/2015 01:00,New Business,2000000,0,2000.0,...,CAMBRIDGE,0,0,0,CB8 7,10,0,0.0,0.0,0.0
4,4,Simply Business,Maltings Tradesman,2014,20/09/2014 01:00,19/09/2015 01:00,New Business,1000000,0,0.0,...,NEWCASTLE UPON TYNE,0,0,1,NE5 1,1,0,0.0,0.0,0.0


In [5]:
df['Claim Count'].value_counts()

0    229689
1      3098
2       150
3        22
4         3
Name: Claim Count, dtype: int64

### make a few amendments to the dataframe

In [6]:
df.columns,df.shape

(Index(['Unnamed: 0', 'Source System', 'Product', 'Underwriting Year',
        'Effective Date', 'Expiry Date', 'Transaction Type',
        'Public Liability Limit', 'Employers Liability Limit',
        'Tools Sum Insured', 'Professional Indemnity Limit',
        'Contract Works Sum Insured', 'Hired in Plan Sum Insured',
        'Own Plant Sum Insured', 'Trade 1', 'Trade 2', 'Manual EE',
        'Clerical EE', 'Subcontractor EE', 'Match Type', 'Trade 1 Category',
        'Trade 2 Category', 'Trade 1 Risk Level', 'Trade 2 Risk Level',
        'Effective_Date2', 'CancellationEffectiveDate',
        'Total Gross Premium excl IPT', 'Commission Amount',
        'Net Premium to UW', 'Policy Count', 'Gross Premium Excl PI',
        'Gross PI Premium', 'DurationofPolicy', 'CombinedTradeRiskLevel',
        'Public_Liability_Limit_1000000', 'Public_Liability_Limit_1000000.1',
        'Public_Liability_Limit_2000000', 'Public_Liability_Limit_5000000',
        'Public_Liability_Limit_5000000.1', '

In [7]:
df.set_index('Unnamed: 0', inplace= True)

### look into the date types - this is a chunk of code worth re-running as we tidy the data up

In [8]:
data_types = df.dtypes

data_types.value_counts()

int64      44
object     13
float64    10
dtype: int64

### look into the object data types

In [9]:
objects = data_types[data_types == 'object'].index.values.tolist()

In [10]:
cardinality = [(a,df[a].value_counts().count()) for a in objects]
cardinality

[('Source System', 2),
 ('Product', 6),
 ('Effective Date', 3290),
 ('Expiry Date', 3438),
 ('Transaction Type', 3),
 ('Trade 1', 880),
 ('Trade 2', 759),
 ('Match Type', 8),
 ('Trade 1 Category', 16),
 ('Trade 2 Category', 17),
 ('Effective_Date2', 1813),
 ('Location', 121),
 ('Risk_Postcode2', 20857)]

In [11]:
df[objects].isnull().sum()

Source System           0
Product                 0
Effective Date          0
Expiry Date             0
Transaction Type        0
Trade 1                 0
Trade 2             25172
Match Type              0
Trade 1 Category        0
Trade 2 Category        0
Effective_Date2         0
Location              351
Risk_Postcode2          0
dtype: int64

In [12]:
df['Trade 2'].fillna('Unknown',inplace = True)
#also add this in for blanks
df['Trade 2'] = df['Trade 2'].apply(lambda x : 'Unknown' if x == ' ' else x)

In [13]:
df['Location'].fillna('Unknown',inplace = True)

In [14]:
df.drop(['Effective_Date2'],inplace= True,axis = 1)

In [15]:
data_types = df.dtypes

data_types.value_counts()

int64      44
object     12
float64    10
dtype: int64

### date fields

In [16]:
df['Effective Date'] = pd.to_datetime(df['Effective Date'])
df['Expiry Date'] = pd.to_datetime(df['Expiry Date'])

In [17]:
lamdafunc = lambda x : pd.Series([x['Effective Date'].weekofyear
                                  ,x['Effective Date'].dayofweek
                                  ,x['Effective Date'].is_month_start
                                  ,x['Effective Date'].is_month_end
                                  ,x['Effective Date'].quarter
                                  ,x['Effective Date'].is_quarter_start
                                  ,x['Effective Date'].is_quarter_end
                                  ,x['Effective Date'].month
                                  ,x['Effective Date'].day
                                 ])

In [18]:
df[['eff_woy','eff_dow','eff_month_start'
    ,'eff_month_end','eff_quarter','eff_qstart','eff_qend','eff_month','eff_day']] = df.apply(lamdafunc,axis = 1)

In [19]:
data_types = df.dtypes

data_types.value_counts()

int64             49
float64           10
object            10
bool               4
datetime64[ns]     2
dtype: int64

### integer data types

In [20]:
ints = data_types[data_types == 'int64'].index.values.tolist()

In [21]:
df[ints].isnull().sum()

Underwriting Year                   0
Public Liability Limit              0
Employers Liability Limit           0
Professional Indemnity Limit        0
Contract Works Sum Insured          0
Hired in Plan Sum Insured           0
Own Plant Sum Insured               0
Manual EE                           0
Clerical EE                         0
Subcontractor EE                    0
Trade 1 Risk Level                  0
Trade 2 Risk Level                  0
Policy Count                        0
DurationofPolicy                    0
CombinedTradeRiskLevel              0
Public_Liability_Limit_1000000      0
Public_Liability_Limit_1000000.1    0
Public_Liability_Limit_2000000      0
Public_Liability_Limit_5000000      0
Public_Liability_Limit_5000000.1    0
Public_Liability_Limit_1000000.2    0
Public_Liability_Limit_1000000.3    0
Employers_Liability_Limit_1000      0
Professional_Indemnity_Limit_5      0
Professional_Indemnity_Limit_5.1    0
Professional_Indemnity_Limit_1      0
Professional

In [22]:
data_types = df.dtypes

data_types.value_counts()

int64             49
float64           10
object            10
bool               4
datetime64[ns]     2
dtype: int64

### continuous variables

In [23]:
floats = data_types[data_types == 'float64'].index.values.tolist()

In [24]:
df[floats].head()

Unnamed: 0_level_0,Tools Sum Insured,CancellationEffectiveDate,Total Gross Premium excl IPT,Commission Amount,Net Premium to UW,Gross Premium Excl PI,Gross PI Premium,Claim Incurred,Capped Incurred (£50k),Capped Incurred (£100k)
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.0,,266.39,98.46,167.93,199.75,66.64,0.0,0.0,0.0
1,0.0,,91.53,33.83,57.7,91.53,0.0,0.0,0.0,0.0
2,4000.0,,844.32,312.06,532.26,844.32,0.0,0.0,0.0,0.0
3,2000.0,,377.9,139.67,238.23,377.9,0.0,0.0,0.0,0.0
4,0.0,,232.88,86.07,146.81,38.5,194.38,0.0,0.0,0.0


In [25]:
df[floats].isnull().sum()

Tools Sum Insured                    0
CancellationEffectiveDate       232962
Total Gross Premium excl IPT         0
Commission Amount                    0
Net Premium to UW                    0
Gross Premium Excl PI                0
Gross PI Premium                 35138
Claim Incurred                       0
Capped Incurred (£50k)               0
Capped Incurred (£100k)              0
dtype: int64

In [26]:
df.drop(['CancellationEffectiveDate'],inplace = True,axis = 1)
df.drop(['Capped Incurred (£50k)'],inplace = True,axis = 1)
df.drop(['Capped Incurred (£100k)'],inplace = True,axis = 1)

In [None]:
floats.remove('CancellationEffectiveDate')
floats.remove('Capped Incurred (£50k)')
floats.remove('Capped Incurred (£1000k)')

In [27]:
a = df[['Source System','Gross PI Premium','Gross Premium Excl PI','Total Gross Premium excl IPT']]
a[a['Source System'] != 'Simply Business']

df['Gross PI Premium'].fillna(df['Gross Premium Excl PI'],inplace = True)

In [28]:
data_types = df.dtypes

data_types.value_counts()

int64             49
object            10
float64            7
bool               4
datetime64[ns]     2
dtype: int64

### look at overall dataset

In [29]:
#sort the values so we can take the last X as training set

df.sort_values(by = ['Effective Date'],inplace = True)

#### quickly convert the boolean values into ints

In [30]:
bools = data_types[data_types == 'bool'].index.values.tolist()

df[bools] = df[bools].astype(np.int64)

In [31]:
data_types = df.dtypes

data_types.value_counts()

int64             53
object            10
float64            7
datetime64[ns]     2
dtype: int64

### back to overall datset

In [32]:
floats_f = data_types[data_types == 'float64'].index.values.tolist()
ints_f = data_types[data_types == 'int64'].index.values.tolist()
object_f = data_types[data_types == 'object'].index.values.tolist()

In [33]:
#treat the target by itself

ints_f.remove('Claim Count')
floats_f.remove('Claim Incurred')

In [34]:
df['Claim Count'] = df['Claim Count'].apply(lambda x : 0 if x == 0 else 1)

target = ['Claim Count']

In [36]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

le = LabelEncoder()
scaler = StandardScaler()

#scale the floats
scaled = scaler.fit_transform(df[floats_f])

#encode the labels
encoded  = np.array(df[object_f].apply(le.fit_transform))

In [37]:
full_df = np.hstack((encoded,scaled,np.array(df[target])))
full_df

array([[ 0.        ,  5.        ,  0.        , ..., -0.33205667,
        -0.06863205,  0.        ],
       [ 0.        ,  5.        ,  0.        , ..., -0.27071234,
         0.02443333,  0.        ],
       [ 0.        ,  5.        ,  0.        , ...,  1.13824618,
         2.16196155,  0.        ],
       ...,
       [ 1.        ,  2.        ,  0.        , ..., -0.3972364 ,
        -0.02681048,  0.        ],
       [ 1.        ,  1.        ,  2.        , ...,  0.52916118,
        -0.33939774,  0.        ],
       [ 1.        ,  1.        ,  2.        , ..., -0.4186143 ,
        -0.16864014,  0.        ]])

### split into training and test

In [38]:
#take the bottom 20% as your training set

tr = np.round(len(df) * 0.2,0).astype(int)

train = full_df[tr:]
test = full_df[:tr]

In [39]:
train.shape[0] + test.shape[0]

232962

In [40]:
val = np.round(len(train) * 0.2,0).astype(int)

train = train[val:]
val = train[:val]

In [41]:
train.shape[0] + val.shape[0]

186370

## Build the model

In [42]:
train.shape,val.shape

((149096, 17), (37274, 17))

In [43]:
import xgboost as xgb

dtrain = xgb.DMatrix(train[:,:-1],label = train[:,-1:])
dval = xgb.DMatrix(val[:,:-1],label = val[:,-1:])

In [64]:
param = {'max_depth':2
         , 'eta':0.25
         , 'gamma' : 0.5
         , 'silent':1
         , 'subsample' : 0.9
         , 'colsample_bytree' : 0.67
         , 'objective':'binary:logistic'
         , 'eval_metric' : 'error'}

watchlist = [(dval, 'eval'), (dtrain, 'train')]
num_round = 100

eta_list = [0.2] * 100

bst = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds = 10,learning_rates=eta_list)

[0]	eval-error:0.01328	train-error:0.013434
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[1]	eval-error:0.01328	train-error:0.013434
[2]	eval-error:0.01328	train-error:0.013434
[3]	eval-error:0.01328	train-error:0.013434
[4]	eval-error:0.01328	train-error:0.013434
[5]	eval-error:0.01328	train-error:0.013434
[6]	eval-error:0.01328	train-error:0.013434
[7]	eval-error:0.01328	train-error:0.013434
[8]	eval-error:0.01328	train-error:0.013434
[9]	eval-error:0.01328	train-error:0.013434
[10]	eval-error:0.01328	train-error:0.013434
Stopping. Best iteration:
[0]	eval-error:0.01328	train-error:0.013434



In [65]:
bst.save_model('100_rounds_v1')

bst2 = xgb.Booster(model_file='100_rounds_v1')

In [69]:
from sklearn.metrics import accuracy_score,confusion_matrix

print(confusion_matrix(val[:,-1:],bst2.predict(dval).astype(int)))

[[36779     0]
 [  495     0]]


## do some oversampling

In [60]:
from imblearn.over_sampling import SMOTE

In [70]:
train.shape,val.shape

((149096, 17), (37274, 17))

In [71]:
sm = SMOTE(random_state=12, ratio = 1.0)

In [75]:
train_res,train_target_res = sm.fit_sample(train[:,:-1],train[:,-1:])

  y = column_or_1d(y, warn=True)


In [81]:
pd.Series(train_target_res).value_counts()

1.0    147093
0.0    147093
dtype: int64

### revisit the model but with oversampled data

In [102]:
dtrain2 = xgb.DMatrix(train_res,label = train_target_res)

In [103]:
param = {'max_depth':2
         , 'eta':0.25
         , 'gamma' : 0.5
         , 'silent':1
         , 'subsample' : 0.9
         , 'colsample_bytree' : 0.67
         , 'objective':'binary:logistic'
         , 'eval_metric' : 'error'}

watchlist = [(dval, 'eval'), (dtrain, 'train')]
num_round = 200

eta_list = [0.5] * 200

bst = xgb.train(param, dtrain2, num_round, watchlist, early_stopping_rounds = 10,learning_rates=eta_list)

[0]	eval-error:0.293475	train-error:0.221819
Multiple eval metrics have been passed: 'train-error' will be used for early stopping.

Will train until train-error hasn't improved in 10 rounds.
[1]	eval-error:0.263025	train-error:0.205023
[2]	eval-error:0.284515	train-error:0.207046
[3]	eval-error:0.25004	train-error:0.17491
[4]	eval-error:0.223695	train-error:0.16304
[5]	eval-error:0.200757	train-error:0.143154
[6]	eval-error:0.157617	train-error:0.125954
[7]	eval-error:0.141788	train-error:0.116018
[8]	eval-error:0.165826	train-error:0.112007
[9]	eval-error:0.163143	train-error:0.110138
[10]	eval-error:0.110506	train-error:0.093104
[11]	eval-error:0.104818	train-error:0.090045
[12]	eval-error:0.104255	train-error:0.087373
[13]	eval-error:0.100553	train-error:0.083583
[14]	eval-error:0.101814	train-error:0.080187
[15]	eval-error:0.111847	train-error:0.077917
[16]	eval-error:0.093819	train-error:0.076251
[17]	eval-error:0.085743	train-error:0.071836
[18]	eval-error:0.086334	train-error:0

[175]	eval-error:0.014004	train-error:0.01376
[176]	eval-error:0.014004	train-error:0.013699
[177]	eval-error:0.014031	train-error:0.013627
[178]	eval-error:0.014004	train-error:0.013648
[179]	eval-error:0.014004	train-error:0.013546
[180]	eval-error:0.014004	train-error:0.013495
[181]	eval-error:0.013978	train-error:0.01342
[182]	eval-error:0.013978	train-error:0.013369
[183]	eval-error:0.013978	train-error:0.013349
[184]	eval-error:0.014004	train-error:0.013352
[185]	eval-error:0.013978	train-error:0.013311
[186]	eval-error:0.013978	train-error:0.013308
[187]	eval-error:0.013978	train-error:0.013308
[188]	eval-error:0.013978	train-error:0.013257
[189]	eval-error:0.013978	train-error:0.013233
[190]	eval-error:0.013978	train-error:0.013209
[191]	eval-error:0.013978	train-error:0.013199
[192]	eval-error:0.013951	train-error:0.013196
[193]	eval-error:0.013951	train-error:0.013158
[194]	eval-error:0.014004	train-error:0.013182
[195]	eval-error:0.014004	train-error:0.013182
[196]	eval-erro

In [104]:
bst.save_model('200_rounds_withOverSampling')

bst2 = xgb.Booster(model_file='200_rounds_withOverSampling')

In [105]:
from sklearn.metrics import accuracy_score,confusion_matrix

print(confusion_matrix(val[:,-1:],bst2.predict(dval).astype(int)))

[[36779     0]
 [  495     0]]


In [106]:
print(confusion_matrix(train_target_res,bst2.predict(dtrain2).astype(int)))

[[147093      0]
 [140012   7081]]


In [107]:
accuracy_score(train_target_res,bst2.predict(dtrain).astype(int))

0.5240698061770445

In [108]:
495/(36779+495)

0.01328003434029082