In [1]:
#使用5折交叉验证,增加scale_pos_weight参数
#Score: 0.87666
#交叉特征增加location_type，不使用G/L，#增加scale_pos_weight参数2.5
#Score: 0.87503
#增加incident_location_type、incident_location_street、policy_bind_date_year&auto_year_diff、incident_date_year&auto_year_diff、duration、出险情况2、出险车信息
#Score: 0.88316

In [2]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib # 注意这个也要import一次
import matplotlib.pyplot as plt 
import seaborn as sns
import pickle
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [3]:
#数据加载
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

In [4]:
# train_data.head(10).T
# train_data.isnull().sum()

In [5]:
datas = pd.concat((train_data, test_data))
datas.reset_index(drop=True, inplace=True)

## 数据探索

In [6]:
# datas.tail(5).T

In [7]:
datas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  insured_hobbies    

In [8]:
# datas.describe().T

In [9]:
# datas.isnull().sum()

In [10]:
# datas.drop(columns=['_c39'], inplace=True)

In [11]:
for col in datas.columns:
    print(col, datas[col].nunique())

months_as_customer 391
age 46
policy_number 1000
policy_bind_date 951
policy_state 3
policy_csl 3
policy_deductable 3
policy_annual_premium 991
umbrella_limit 11
insured_zip 995
insured_sex 2
insured_education_level 7
insured_occupation 14
insured_hobbies 20
insured_relationship 6
capital-gains 338
capital-loss 354
incident_date 60
incident_type 4
collision_type 4
incident_severity 4
authorities_contacted 5
incident_state 7
incident_city 7
incident_location 1000
incident_hour_of_the_day 24
number_of_vehicles_involved 4
property_damage 3
bodily_injuries 3
witnesses 4
police_report_available 3
total_claim_amount 763
injury_claim 638
property_claim 626
vehicle_claim 726
auto_make 14
auto_model 39
auto_year 21
fraud_reported 2
_c39 0


In [12]:
cat_columns = datas.select_dtypes(include='O').columns
numerical_columns = datas.select_dtypes(exclude='O').columns

In [13]:
cat_columns, numerical_columns

(Index(['policy_bind_date', 'policy_state', 'policy_csl', 'insured_sex',
        'insured_education_level', 'insured_occupation', 'insured_hobbies',
        'insured_relationship', 'incident_date', 'incident_type',
        'collision_type', 'incident_severity', 'authorities_contacted',
        'incident_state', 'incident_city', 'incident_location',
        'property_damage', 'police_report_available', 'auto_make',
        'auto_model'],
       dtype='object'),
 Index(['months_as_customer', 'age', 'policy_number', 'policy_deductable',
        'policy_annual_premium', 'umbrella_limit', 'insured_zip',
        'capital-gains', 'capital-loss', 'incident_hour_of_the_day',
        'number_of_vehicles_involved', 'bodily_injuries', 'witnesses',
        'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim',
        'auto_year', 'fraud_reported', '_c39'],
       dtype='object'))

In [14]:
col_name = []
unique_value = []
for col in cat_columns:
    # print(col, datas[col].nunique())
    col_name.append(col)
    unique_value.append(datas[col].nunique())

df_cat_col_unique = pd.DataFrame()
df_cat_col_unique['col_name'] = col_name
df_cat_col_unique['unique_value'] = unique_value
df_cat_col_unique = df_cat_col_unique.sort_values('unique_value', ascending= False)
# df_cat_col_unique

In [15]:
col_name = []
unique_value = []
for col in numerical_columns:
    # print(col, datas[col].nunique())
    col_name.append(col)
    unique_value.append(datas[col].nunique())

df_numerical_col_unique = pd.DataFrame()
df_numerical_col_unique['col_name'] = col_name
df_numerical_col_unique['unique_value'] = unique_value
df_numerical_col_unique = df_numerical_col_unique.sort_values('unique_value', ascending= False)
# df_numerical_col_unique

In [16]:
# df_cat_col_unique
df_numerical_col_unique

Unnamed: 0,col_name,unique_value
2,policy_number,1000
6,insured_zip,995
4,policy_annual_premium,991
13,total_claim_amount,763
16,vehicle_claim,726
14,injury_claim,638
15,property_claim,626
0,months_as_customer,391
8,capital-loss,354
7,capital-gains,338


## 特征工程

#### 标签编码 policy_bind_date、incident_date

In [17]:
datas['policy_bind_date'] = pd.to_datetime(datas['policy_bind_date'])
datas['incident_date'] = pd.to_datetime(datas['incident_date'])

In [18]:
datas['policy_bind_date_year'] = datas['policy_bind_date'].dt.year
datas['policy_bind_date_month'] = datas['policy_bind_date'].dt.month
datas['policy_bind_date_day'] = datas['policy_bind_date'].dt.day
datas['policy_bind_date_weekday'] = datas['policy_bind_date'].dt.weekday

datas['incident_date_year'] = datas['incident_date'].dt.year
datas['incident_date_month'] = datas['incident_date'].dt.month
datas['incident_date_day'] = datas['incident_date'].dt.day
datas['incident_date_weekday'] = datas['incident_date'].dt.weekday

In [19]:
# 查看最小、最大日期
# datas.policy_bind_date.min() #1990-01-08
# datas.policy_bind_date.max() #2015-02-22

# datas.incident_date.min() #2015-01-01
# datas.incident_date.max() #2015-03-01

In [20]:
base_date = datas.policy_bind_date.min()
#转换为diff
datas['policy_bind_date_diff'] = (datas['policy_bind_date'] - base_date).dt.days
datas['incident_date_diff'] = (datas['incident_date'] - base_date).dt.days
datas['incident_date&policy_bind_date_diff'] = datas['incident_date_diff'] - datas['policy_bind_date_diff']

In [21]:
datas.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
months_as_customer,187,243,24,215,85,310,297,108,46,286,...,359,160,80,334,282,31,297,251,54,155
age,37,44,33,42,30,48,48,29,41,41,...,47,38,27,47,43,36,47,39,35,34
policy_number,125591,967713,649082,519312,190588,670142,751612,237418,315041,507545,...,828890,497929,193442,156694,982678,679370,272330,315631,445195,914815
policy_bind_date,2013-08-08 00:00:00,1997-12-25 00:00:00,1996-01-19 00:00:00,2008-10-28 00:00:00,2001-12-09 00:00:00,1999-08-06 00:00:00,2009-06-22 00:00:00,2007-12-04 00:00:00,2010-11-02 00:00:00,1998-12-07 00:00:00,...,1993-10-20 00:00:00,2009-09-19 00:00:00,1996-08-05 00:00:00,2001-05-24 00:00:00,2006-07-19 00:00:00,1999-08-15 00:00:00,2009-11-29 00:00:00,1999-04-09 00:00:00,2010-09-27 00:00:00,1990-09-27 00:00:00
policy_state,IN,IL,IL,OH,OH,IN,IN,IN,OH,IL,...,OH,OH,IL,IL,OH,IL,IN,IN,IN,IN
policy_csl,500/1000,250/500,500/1000,500/1000,100/300,100/300,250/500,500/1000,100/300,250/500,...,100/300,250/500,100/300,500/1000,250/500,500/1000,250/500,500/1000,100/300,100/300
policy_deductable,1000,500,1000,500,1000,500,1000,1000,2000,1000,...,2000,500,1000,500,500,2000,500,2000,500,500
policy_annual_premium,1412.06,809.11,1922.84,1848.81,796.35,1516.34,1464.73,1337.92,998.19,1298.85,...,1367.68,1733.56,1474.17,1238.89,1452.27,1318.24,1616.65,1231.98,1261.28,1706.79
umbrella_limit,5000000,0,0,0,0,0,3000000,0,0,6000000,...,0,0,0,0,0,9000000,7000000,0,0,0
insured_zip,450947,600208,431277,435489,614166,474167,443861,441536,611556,435967,...,613247,441425,440327,600561,611996,601748,456363,612908,453265,462479


#### 构造交叉业务特征

In [22]:
datas['injury_claim_pct'] = datas['injury_claim']/datas['total_claim_amount']
datas['property_claim_pct'] = datas['property_claim']/datas['total_claim_amount']
datas['vehicle_claim_pct'] = datas['vehicle_claim']/datas['total_claim_amount']

In [23]:
datas['is_injury_claim'] = datas['injury_claim'].apply(lambda x: '1' if x >0 else '0')
datas['is_property_claim'] = datas['property_claim'].apply(lambda x: '1' if x >0 else '0')
datas['is_vehicle_claim'] = datas['vehicle_claim'].apply(lambda x: '1' if x >0 else '0')

In [24]:
datas['incident_type_&_is_injury_claim'] = datas['incident_type'] + '_' + datas['is_injury_claim']
datas['incident_type_&_is_property_claim'] = datas['incident_type'] + '_' + datas['is_property_claim']
datas['incident_type_&_is_vehicle_claim'] = datas['incident_type'] + '_' + datas['is_vehicle_claim']

datas['collision_type_&_is_injury_claim'] = datas['collision_type'] + '_' + datas['is_injury_claim']
datas['collision_type_&_is_property_claim'] = datas['collision_type'] + '_' + datas['is_property_claim']
datas['collision_type_&_is_vehicle_claim'] = datas['collision_type'] + '_' + datas['is_vehicle_claim']

In [25]:
datas['umbrella_limit_2_total_claim_amount'] = datas['umbrella_limit']/datas['total_claim_amount']

In [26]:
# datas['G/L'] = datas['capital-gains'] + datas['capital-loss']

In [27]:
datas['incident_location_type'] = datas['incident_location'].apply(lambda x: x.split()[-1])
datas['incident_location_street'] = datas['incident_location'].apply(lambda x: ' '.join(x.split()[-2:]))

In [28]:
#购车年份与投保年份之差
datas['policy_bind_date_year&auto_year_diff'] = datas['policy_bind_date_year'] - datas['auto_year']

#购车年份与出险年份之差
datas['incident_date_year&auto_year_diff'] = datas['incident_date_year'] - datas['auto_year']

#成为客户的时长与购报后出险时长之差
datas['duration'] = round((datas['incident_date'] - datas['policy_bind_date']).dt.days/30) - datas['months_as_customer']

In [29]:
#出险情况2
datas['incident_type&collision_type'] = (datas['incident_type'] + "_" + datas['collision_type'])

datas['incident_type&incident_severity'] = (datas['incident_type'] + "_" + datas['incident_severity'])
datas['collision_type&incident_severity'] = (datas['collision_type'] + "_" + datas['incident_severity'])

datas['incident_type&bodily_injuries'] = (datas['incident_type'] + "_" + datas['bodily_injuries'].apply(lambda x : str(x)))
datas['collision_type&bodily_injuries'] = (datas['collision_type'] + "_" + datas['bodily_injuries'].apply(lambda x : str(x)))

datas['incident_type&property_damage'] = (datas['incident_type'] + "_" + datas['property_damage'])
datas['collision_type&property_damage'] = (datas['collision_type'] + "_" + datas['property_damage'])

datas['incident_type&collision_type&incident_severity&bodily_injuries&property_damage'] = (datas['incident_type'] + "_" + datas['collision_type'] + "_" + datas['incident_severity'] 
 + "_" + datas['bodily_injuries'].apply(lambda x : str(x)) + "_" + datas['property_damage'])

In [30]:
#出险车信息
datas['auto_make&auto_model'] = datas['auto_make'] + "_" + datas['auto_model']

In [31]:
# datas[['incident_type', 'collision_type', 'policy_csl', 'policy_annual_premium', 'umbrella_limit', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'capital-gains', 'capital-loss']][datas['fraud_reported'] > 0]

In [32]:
#检查单个特征
datas.incident_location.nunique()
# datas.incident_location.value_counts()

1000

In [33]:
datas.drop(['policy_bind_date', 'incident_date', '_c39', 'incident_location', 'policy_number', 'insured_zip', 'auto_make', 'auto_model'], axis= 1, inplace=True)

In [34]:
# from sklearn.preprocessing import LabelEncoder

In [35]:
cat_columns = datas.select_dtypes(include= 'O').columns

In [36]:
# for col in cat_columns:
#     le = LabelEncoder()
#     datas[col] = le.fit_transform(datas[col])

# datas[cat_columns]

In [37]:
# 数据集切分
train = datas[datas['fraud_reported'].notnull()]
test = datas[datas['fraud_reported'].isnull()]

In [38]:
train_X = train.drop(columns=['fraud_reported'])
train_y = train['fraud_reported']
test_X = test.drop(columns=['fraud_reported'])

## 使用五折交叉验证的catboost 训练

In [39]:
from sklearn.model_selection import StratifiedKFold, KFold
import catboost as cb
from catboost import CatBoostClassifier, cv, Pool

In [40]:
# ### 标记分类变量
# categorical_fea = ['initialListStatus', 'employmentTitle', 'applicationType', 
#                    'title', 'policyCode', 'purpose', 'regionCode', 'postCode',
#                    'verificationStatus', 'homeOwnership']

categorical_fea = list(cat_columns)

In [41]:
# for i in data_X.columns:
#     if i in categorical_fea:
#         data_X[i] = data_X[i].astype('str')

In [42]:
# for i in data_X_testA.columns:
#     if i in categorical_fea:
#         data_X_testA[i] = data_X_testA[i].astype('str')

In [43]:
clfs = []
answers = []
mean_score = 0
cv_scores = []
NFOLD = 5
seed = 2022
CB_INFO_PATH = './catboost_info'

In [44]:
kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=seed)

In [45]:
from catboost.utils import get_gpu_device_count
print('I see %i GPU devices' % get_gpu_device_count())

I see 0 GPU devices


In [46]:
cv_cat_model = cb.CatBoostClassifier(loss_function='Logloss', eval_metric='AUC', 
                                     iterations=20000, #max_leaves=2**6-1, 
                                     depth=6, learning_rate=0.005,
                                     random_state=2022, od_type="Iter",
#                                      subsample=0.8, colsample_bylevel=0.8, 
                                     min_data_in_leaf=3, l2_leaf_reg=0.5,
                                     scale_pos_weight = 2.5,
#                                      use_best_model=True, 
                                     metric_period=100)

In [47]:
for fold, (train_index, val_index) in enumerate(kf.split(train_X, train_y)):
    X_train_fold, X_val_fold = train_X.iloc[train_index], train_X.iloc[val_index]
    y_train_fold, y_val_fold = train_y.iloc[train_index], train_y.iloc[val_index]
    
    print("fold:", fold)
    clf = cv_cat_model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold),
                           verbose=100, 
                           cat_features=categorical_fea,
                           )
    clfs.append(clf)

    pred_val_fold = clfs[fold].predict(X_val_fold, prediction_type='Probability',
                                       ntree_end = clfs[fold].get_best_iteration())[:,-1]
    
    print('cat验证的auc:{}'.format(roc_auc_score(y_val_fold, pred_val_fold)))
    mean_score += roc_auc_score(y_val_fold, pred_val_fold) / NFOLD
    cv_scores.append(roc_auc_score(y_val_fold, pred_val_fold))

    pred = clfs[fold].predict(test_X, prediction_type='Probability',
                              ntree_end = clfs[fold].get_best_iteration())[:,-1]
    answers.append(pred)

fold: 0




0:	test: 0.8440171	best: 0.8440171 (0)	total: 118ms	remaining: 39m 15s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8696581197
bestIteration = 14

Shrink model to first 15 iterations.
cat验证的auc:0.8600427350427351
fold: 1
0:	test: 0.8579060	best: 0.8579060 (0)	total: 70.7ms	remaining: 23m 32s




Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9017094017
bestIteration = 16

Shrink model to first 17 iterations.
cat验证的auc:0.8952991452991452
fold: 2
0:	test: 0.8360043	best: 0.8360043 (0)	total: 60.9ms	remaining: 20m 17s




Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9033119658
bestIteration = 1

Shrink model to first 2 iterations.
cat验证的auc:0.8360042735042734
fold: 3




0:	test: 0.8301282	best: 0.8301282 (0)	total: 60.8ms	remaining: 20m 15s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8739316239
bestIteration = 11

Shrink model to first 12 iterations.
cat验证的auc:0.8418803418803419
fold: 4
0:	test: 0.7467949	best: 0.7467949 (0)	total: 62.3ms	remaining: 20m 45s




Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8028846154
bestIteration = 16

Shrink model to first 17 iterations.
cat验证的auc:0.7911324786324786
fold: 5




0:	test: 0.8493590	best: 0.8493590 (0)	total: 61.8ms	remaining: 20m 35s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8830128205
bestIteration = 3

Shrink model to first 4 iterations.
cat验证的auc:0.8541666666666667
fold: 6




0:	test: 0.8712607	best: 0.8712607 (0)	total: 71ms	remaining: 23m 39s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.936965812
bestIteration = 35

Shrink model to first 36 iterations.
cat验证的auc:0.9358974358974359
fold: 7




0:	test: 0.8878205	best: 0.8878205 (0)	total: 62.7ms	remaining: 20m 53s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9075854701
bestIteration = 1

Shrink model to first 2 iterations.
cat验证的auc:0.8878205128205129
fold: 8




0:	test: 0.9583333	best: 0.9583333 (0)	total: 61.9ms	remaining: 20m 37s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9780982906
bestIteration = 4

Shrink model to first 5 iterations.
cat验证的auc:0.9759615384615384
fold: 9




0:	test: 0.8818369	best: 0.8818369 (0)	total: 63.7ms	remaining: 21m 14s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9035087719
bestIteration = 4

Shrink model to first 5 iterations.
cat验证的auc:0.8864809081527347


In [48]:
print("cat_scotrainre_list:{}".format(cv_scores))
print("cat_score_mean:{}".format(np.mean(cv_scores)))
print("cat_score_std:{}".format(np.std(cv_scores)))

cat_scotrainre_list:[0.8600427350427351, 0.8952991452991452, 0.8360042735042734, 0.8418803418803419, 0.7911324786324786, 0.8541666666666667, 0.9358974358974359, 0.8878205128205129, 0.9759615384615384, 0.8864809081527347]
cat_score_mean:0.8764686036357862
cat_score_std:0.04989737589681912


In [49]:
#最终加权平均的预测结果
cat_pre=sum(answers)/NFOLD
cat_pre

array([0.47626881, 0.51579695, 0.47443731, 0.47606637, 0.47487242,
       0.47672481, 0.4756046 , 0.47455183, 0.47559588, 0.47425319,
       0.47536411, 0.4734496 , 0.51590927, 0.47624116, 0.47322468,
       0.51534503, 0.47525071, 0.47551028, 0.52007971, 0.47482024,
       0.47490324, 0.47659505, 0.51264022, 0.47553237, 0.4741492 ,
       0.51625511, 0.47535279, 0.47389808, 0.52609363, 0.4746964 ,
       0.51658198, 0.47505882, 0.51618675, 0.47637964, 0.47340828,
       0.47688113, 0.51605373, 0.47406961, 0.4764513 , 0.47406192,
       0.51691598, 0.47652283, 0.47494282, 0.47524334, 0.47638281,
       0.47597233, 0.47477261, 0.47255475, 0.51661579, 0.47313118,
       0.47658721, 0.47568509, 0.47405941, 0.51554653, 0.51533016,
       0.47649938, 0.47573806, 0.51619042, 0.47343009, 0.51670999,
       0.4759902 , 0.47596603, 0.47447317, 0.51645159, 0.47476022,
       0.47523434, 0.52407754, 0.51590036, 0.47481613, 0.4748374 ,
       0.47574859, 0.47541095, 0.5158206 , 0.52521463, 0.47396

In [50]:
submission = pd.read_csv('./data/sampleSubmission.csv')
submission['fraud_reported'] = cat_pre
submission.to_csv('./submission/submission_KF_catboost.csv', index=None)