In [1]:
# Added cross (interaction) features
# Score: 0.89048
# Used 5-fold cross-validation and added the scale_pos_weight parameter
# Score: 0.89292

In [2]:
import os
import pickle
import warnings

import lightgbm as lgb
import matplotlib  # note: matplotlib itself also has to be imported once
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, mean_squared_error, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

%matplotlib inline

warnings.filterwarnings("ignore")

In [3]:
# Load the raw train / test CSV files
train_path = './fraud-detection-in-insurance-claims/train.csv'
test_path = './fraud-detection-in-insurance-claims/test.csv'
train_data, test_data = pd.read_csv(train_path), pd.read_csv(test_path)

In [4]:
# Stack train and test rows into one frame with a fresh 0..n-1 row index,
# so feature engineering is applied to both in one pass.
datas = pd.concat((train_data, test_data), ignore_index=True)

## 数据探索

In [5]:
# Split the column names by dtype: object-typed vs. everything else.
object_frame = datas.select_dtypes(include=['O'])
non_object_frame = datas.select_dtypes(exclude=['O'])
cat_columns, numerical_columns = object_frame.columns, non_object_frame.columns

In [6]:
# Count the distinct values held by each object-typed column.
col_name = list(cat_columns)
unique_value = [datas[name].nunique() for name in col_name]

# One row per column, highest cardinality first.
df_cat_col_unique = pd.DataFrame({
    'col_name': col_name,
    'unique_value': unique_value,
}).sort_values('unique_value', ascending=False)
# df_cat_col_unique

In [7]:
col_name = []
unique_value = []
for col in numerical_columns:
    # print(col, datas[col].nunique())
    col_name.append(col)
    unique_value.append(datas[col].nunique())

df_numerical_col_unique = pd.DataFrame()
df_numerical_col_unique['col_name'] = col_name
df_numerical_col_unique['unique_value'] = unique_value
df_numerical_col_unique = df_numerical_col_unique.sort_values('unique_value', ascending= False)
# df_numerical_col_unique

In [8]:
# Display the per-column distinct-value counts of the object-typed columns
# (sorted in descending order of unique_value when the frame was built).
df_cat_col_unique
# df_numerical_col_unique

Unnamed: 0,col_name,unique_value
6,incident_location,1000
16,policy_bind_date,951
5,incident_date,60
2,auto_model,39
11,insured_hobbies,20
12,insured_occupation,14
1,auto_make,14
10,insured_education_level,7
4,incident_city,7
8,incident_state,7


## 特征工程

#### 日期特征提取 policy_bind_date、incident_date

In [9]:
# Parse both date columns into datetime values so the .dt accessor can be used.
for date_col in ('policy_bind_date', 'incident_date'):
    datas[date_col] = pd.to_datetime(datas[date_col])

In [10]:
# Break each date column into year / month / day / weekday features,
# stored as '<column>_<part>'.
for prefix in ('policy_bind_date', 'incident_date'):
    accessor = datas[prefix].dt
    for part in ('year', 'month', 'day', 'weekday'):
        datas[f'{prefix}_{part}'] = getattr(accessor, part)

In [11]:
# Inspect the minimum / maximum dates
# datas.policy_bind_date.min() #1990-01-08
# datas.policy_bind_date.max() #2015-02-22

# datas.incident_date.min() #2015-01-01
# datas.incident_date.max() #2015-03-01

In [12]:
# Express both dates as whole-day offsets from the earliest policy bind date.
base_date = datas['policy_bind_date'].min()
datas['policy_bind_date_diff'] = datas['policy_bind_date'].sub(base_date).dt.days
datas['incident_date_diff'] = datas['incident_date'].sub(base_date).dt.days
# Days elapsed between binding the policy and the incident.
datas['incident_date&policy_bind_date_diff'] = datas['incident_date_diff'].sub(datas['policy_bind_date_diff'])

#### 构造交叉业务特征

In [13]:
# Share of the total claim amount attributed to each claim component.
for claim_col in ('injury_claim', 'property_claim', 'vehicle_claim'):
    datas[f'{claim_col}_pct'] = datas[claim_col].div(datas['total_claim_amount'])

In [14]:
# String flag per claim component: '1' when the amount is positive, else '0'.
flag_labels = {True: '1', False: '0'}
for claim_col in ('injury_claim', 'property_claim', 'vehicle_claim'):
    datas[f'is_{claim_col}'] = datas[claim_col].gt(0).map(flag_labels)

In [15]:
# Cross features: concatenate each incident / collision type with each
# claim flag as '<type value>_<flag value>'.
for type_col in ('incident_type', 'collision_type'):
    for flag_col in ('is_injury_claim', 'is_property_claim', 'is_vehicle_claim'):
        datas[f'{type_col}_&_{flag_col}'] = datas[type_col] + '_' + datas[flag_col]

In [16]:
# Ratio of the umbrella limit to the total claim amount.
datas['umbrella_limit_2_total_claim_amount'] = datas['umbrella_limit'].div(datas['total_claim_amount'])

In [17]:
# Ad-hoc inspection (kept commented out): claim-related columns for rows where fraud_reported > 0.
# datas[['incident_type', 'collision_type', 'policy_csl', 'policy_annual_premium', 'umbrella_limit', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'capital-gains', 'capital-loss']][datas['fraud_reported'] > 0]

In [18]:
# Inspect a single feature
# datas.incident_city.nunique()
# datas['incident_type'].value_counts()

In [19]:
# Remove the raw date columns (already expanded into features above) together
# with the other columns that are not used as model inputs.
unused_columns = ['policy_bind_date', 'incident_date', '_c39', 'incident_location', 'policy_number', 'insured_zip']
datas = datas.drop(columns=unused_columns)

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
# Re-collect the object-typed columns now that new features were added and
# some columns were dropped.
object_only = datas.select_dtypes(include=['O'])
cat_columns = object_only.columns

In [22]:
# Replace every object-typed column with integer codes, using an
# independent LabelEncoder per column.
for col in cat_columns:
    le = LabelEncoder().fit(datas[col])
    datas[col] = le.transform(datas[col])

datas[cat_columns]

Unnamed: 0,authorities_contacted,auto_make,auto_model,collision_type,incident_city,incident_severity,incident_state,incident_type,insured_education_level,insured_hobbies,...,property_damage,is_injury_claim,is_property_claim,is_vehicle_claim,incident_type_&_is_injury_claim,incident_type_&_is_property_claim,incident_type_&_is_vehicle_claim,collision_type_&_is_injury_claim,collision_type_&_is_property_claim,collision_type_&_is_vehicle_claim
0,0,9,26,3,5,2,0,2,5,15,...,0,1,1,0,5,5,2,7,7,3
1,3,6,10,3,6,1,0,0,3,14,...,0,1,1,0,1,1,0,7,7,3
2,4,7,36,3,3,2,6,2,2,16,...,0,1,1,0,5,5,2,7,7,3
3,1,11,21,1,3,0,6,0,3,18,...,2,1,1,0,1,1,0,3,3,1
4,1,5,14,2,3,2,4,0,4,18,...,2,1,1,0,1,1,0,5,5,2
5,4,10,2,2,6,0,4,0,3,17,...,2,1,1,0,1,1,0,5,5,2
6,1,12,18,3,0,2,1,0,6,9,...,0,1,1,0,1,1,0,7,7,3
7,0,11,21,3,1,1,6,0,6,3,...,1,1,1,0,1,1,0,7,7,3
8,0,4,30,2,2,2,4,0,4,18,...,1,1,1,0,1,1,0,5,5,2
9,0,10,1,1,1,0,1,0,2,4,...,0,1,1,0,1,1,0,3,3,1


In [23]:
# Split the combined frame back apart: rows with a label are training data,
# rows without one are the test set.
has_label = datas['fraud_reported'].notnull()
train, test = datas[has_label], datas[~has_label]

In [24]:
# Separate the feature matrices from the target column.
target = 'fraud_reported'
train_X, test_X = (part.drop(columns=[target]) for part in (train, test))
train_y = train[target]

## 使用 LightGBM 训练

In [None]:
import lightgbm as lgb

In [None]:
# Baseline LightGBM classifier (single model, no cross-validation).
baseline_params = {
    'num_leaves': 2**5 - 1,
    'reg_alpha': 0.25,
    'reg_lambda': 0.25,
    'objective': 'binary',
    'max_depth': -1,
    'learning_rate': 0.005,
    'min_child_samples': 3,
    'random_state': 2022,
    'n_estimators': 2000,
    'subsample': 1,
    'colsample_bytree': 1,
}
model_lgb = lgb.LGBMClassifier(**baseline_params)

In [None]:
# Fit the baseline model on the full labelled training set.
model_lgb.fit(X=train_X, y=train_y)

In [None]:
# Keep only the second probability column (index 1) of predict_proba.
class_probabilities = model_lgb.predict_proba(test_X)
y_pred = class_probabilities[:, 1]

In [None]:
# Display the predicted probabilities for the test rows.
y_pred

In [None]:
# Tall figure so the 30 most important features fit on the plot.
fig = plt.figure(figsize=(6, 18))
ax = fig.add_subplot(1, 1, 1)
lgb.plot_importance(model_lgb, ax=ax, max_num_features=30)

In [None]:
# Fill the sample-submission template with the baseline predictions.
submission = pd.read_csv('./fraud-detection-in-insurance-claims/sampleSubmission.csv')
submission['fraud_reported'] = y_pred
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./submission', exist_ok=True)
submission.to_csv('./submission/submission.csv', index=False)

## 使用五折交叉验证的LightGBM

In [25]:
from sklearn.model_selection import StratifiedKFold, KFold
import lightgbm as lgb

In [43]:
# Parameters for lgb.train in the cross-validated run. The number of boosting
# rounds is passed to lgb.train directly, hence n_estimators stays disabled.
params_lgb = dict(
    num_leaves=2**5 - 1,
    reg_alpha=0.25,
    reg_lambda=0.25,
    objective='binary',
    max_depth=-1,
    learning_rate=0.005,
    min_child_samples=3,
    random_state=2022,
    # n_estimators=2000,
    subsample=1,
    colsample_bytree=1,
    scale_pos_weight=3,
)

In [44]:
# Number of cross-validation folds (passed to StratifiedKFold as n_splits).
NFOLD = 5

In [45]:
# Accumulator for the fold-averaged test predictions (one slot per test row).
y_pred = np.zeros(test_X.shape[0])

# Shuffled stratified splitter with a fixed seed.
kf = StratifiedKFold(n_splits=NFOLD, shuffle=True, random_state=2022)

In [46]:
# Train one booster per fold and average the per-fold test predictions.
for fold, (train_index, val_index) in enumerate(kf.split(train_X, train_y)):
    # kf.split yields positional indices, so features AND labels are sliced
    # with .iloc. Plain train_y[train_index] is a label lookup and is only
    # correct while the Series index happens to be exactly 0..n-1.
    X_fold_train, X_fold_val = train_X.iloc[train_index, :], train_X.iloc[val_index, :]
    y_fold_train, y_fold_val = train_y.iloc[train_index], train_y.iloc[val_index]
    train_set = lgb.Dataset(X_fold_train, y_fold_train)
    val_set = lgb.Dataset(X_fold_val, y_fold_val, reference=train_set)

    model_lgb = lgb.train(params_lgb, train_set, num_boost_round=2000, #early_stopping_rounds=50,
                      valid_sets = val_set, verbose_eval=100)

    # NOTE(review): early stopping is disabled above, so best_iteration is
    # presumably unset and the prediction uses all boosting rounds — confirm
    # this is intended.
    y_pred += model_lgb.predict(test_X, num_iteration=model_lgb.best_iteration)/kf.n_splits

# y_pred = [1 if y > 0.5 else 0 for y in y_pred]
# rmse = metrics.accuracy_score(y_pred,y_test)
# print(rmse)

[LightGBM] [Info] Number of positive: 144, number of negative: 415
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2226
[LightGBM] [Info] Number of data points in the train set: 559, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.257603 -> initscore=-1.058465
[LightGBM] [Info] Start training from score -1.058465
[100]	valid_0's binary_logloss: 0.47781
[200]	valid_0's binary_logloss: 0.471643
[300]	valid_0's binary_logloss: 0.487122
[400]	valid_0's binary_logloss: 0.518514
[500]	valid_0's binary_logloss: 0.545755
[600]	valid_0's binary_logloss: 0.572504
[700]	valid_0's binary_logloss: 0.599596
[800]	valid_0's binary_logloss: 0.626956
[900]	valid_0's binary_logloss: 0.65012
[1000]	valid_0's binary_logloss: 0.666518
[1100]	valid_0's binary_logloss: 0.68066
[1200]	valid_0's binary_logloss: 0.692544
[1300]	valid_0's binary_logloss: 0.707959
[1400]	valid_0's binary_logloss: 0.723149
[1500]	valid_0's binary_logloss: 0.73367

[LightGBM] [Info] Number of positive: 145, number of negative: 415
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2228
[LightGBM] [Info] Number of data points in the train set: 560, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258929 -> initscore=-1.051545
[LightGBM] [Info] Start training from score -1.051545
[100]	valid_0's binary_logloss: 0.411199
[200]	valid_0's binary_logloss: 0.35438
[300]	valid_0's binary_logloss: 0.328783
[400]	valid_0's binary_logloss: 0.311909
[500]	valid_0's binary_logloss: 0.306959
[600]	valid_0's binary_logloss: 0.306109
[700]	valid_0's binary_logloss: 0.307024
[800]	valid_0's binary_logloss: 0.306935
[900]	valid_0's binary_logloss: 0.309005
[1000]	valid_0's binary_logloss: 0.309893
[1100]	valid_0's binary_logloss: 0.313134
[1200]	valid_0's binary_logloss: 0.317474
[1300]	valid_0's binary_logloss: 0.320159
[1400]	valid_0's binary_logloss: 0.323265
[1500]	valid_0's binary_logloss: 0.326

[1900]	valid_0's binary_logloss: 0.49217
[2000]	valid_0's binary_logloss: 0.497399
[LightGBM] [Info] Number of positive: 145, number of negative: 415
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2223
[LightGBM] [Info] Number of data points in the train set: 560, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258929 -> initscore=-1.051545
[LightGBM] [Info] Start training from score -1.051545
[100]	valid_0's binary_logloss: 0.422431
[200]	valid_0's binary_logloss: 0.368003
[300]	valid_0's binary_logloss: 0.340331
[400]	valid_0's binary_logloss: 0.326933
[500]	valid_0's binary_logloss: 0.3343
[600]	valid_0's binary_logloss: 0.342886
[700]	valid_0's binary_logloss: 0.348744
[800]	valid_0's binary_logloss: 0.351957
[900]	valid_0's binary_logloss: 0.356668
[1000]	valid_0's binary_logloss: 0.3617
[1100]	valid_0's binary_logloss: 0.367324
[1200]	valid_0's binary_logloss: 0.372812
[1300]	valid_0's binary_logloss: 0.378291


[600]	valid_0's binary_logloss: 0.436487
[700]	valid_0's binary_logloss: 0.449244
[800]	valid_0's binary_logloss: 0.45803
[900]	valid_0's binary_logloss: 0.466748
[1000]	valid_0's binary_logloss: 0.475611
[1100]	valid_0's binary_logloss: 0.482627
[1200]	valid_0's binary_logloss: 0.490035
[1300]	valid_0's binary_logloss: 0.493799
[1400]	valid_0's binary_logloss: 0.499312
[1500]	valid_0's binary_logloss: 0.505021
[1600]	valid_0's binary_logloss: 0.509381
[1700]	valid_0's binary_logloss: 0.512558
[1800]	valid_0's binary_logloss: 0.515665
[1900]	valid_0's binary_logloss: 0.52263
[2000]	valid_0's binary_logloss: 0.53115


In [47]:
# Fill the sample-submission template with the fold-averaged predictions.
submission = pd.read_csv('./fraud-detection-in-insurance-claims/sampleSubmission.csv')
submission['fraud_reported'] = y_pred
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('./submission', exist_ok=True)
submission.to_csv('./submission/submission.csv', index=False)