In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import catboost as cb
from catboost import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
import time
import os

In [2]:
# Read the pickled object that is handed to pandas as the `dtype` argument for
# every extracted CSV in this notebook.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a data_type.pkl that comes from a trusted source.
with open('./data_type.pkl', mode='rb') as dtype_file:
    data_type = pickle.load(dtype_file)

print('loading data...')

# Parse the training split with the stored dtypes rather than inferred ones.
train = pd.read_csv('./train_data_extracted.csv', dtype=data_type)

# Fixed 10 second pause after loading; the notebook does not state why.
time.sleep(10)

loading data...


In [3]:
# Remaining extracted splits, parsed with the same dtype mapping as `train`.
val = pd.read_csv('./val_data_extracted.csv', dtype=data_type)
public_test = pd.read_csv('./public_data_extracted.csv', dtype=data_type)
private_test_1 = pd.read_csv('./private1_data_extracted.csv', dtype=data_type)

# Answer file for the public split: keep only the label column, indexed by 'txkey'.
public_ans = pd.read_csv('./dataset_2nd/public.csv')[['txkey', 'label']].set_index('txkey')

# Look the labels up using public_test's '交易序號' values, so `gt` follows the
# row order of public_test. `.loc` raises KeyError if a key is missing from
# the answer file.
public_keys = public_test.set_index('交易序號').index
gt = public_ans.loc[public_keys, 'label']

In [4]:
# All four loaded splits receive the same missing-value treatment.
tables = [train, val, public_test, private_test_1]

# fillna(..., inplace=True) mutates the frames themselves, so after this cell
# `train`, `val`, `public_test` and `private_test_1` have every NaN replaced by 0.
for table in tables:
    table.fillna(value=0, inplace=True)

In [5]:
# Candidate categorical feature columns (passed to CatBoost as cat_features).
# Some of these are also listed in drop_col and are filtered out below.
cat_col = [
    '顧客ID',
    '交易卡號',
    '授權日期',
    '授權週數',
    '授權週日',
    '時段',
    '授權小時',
    '授權分鐘',
    '授權秒',
    '交易類別',
    '交易型態',
    '特店代號',
    '收單行代碼',
    '商戶類別代碼',
    '分期期數',
    '消費地國別',
    '消費城市',
    '狀態碼',
    '支付型態',
    '消費地幣別',
    '是否符合網路消費習慣',
    '是否符合國內外消費習慣',
    '授權週日_時段',
    '交易類別_交易型態',
    '新消費者',
]

# Columns excluded from the model input. '盜刷註記' is the column the later
# cells pass to cb.Pool as the label, so it must not appear among the features.
drop_col = [
    '盜刷註記',
    '交易序號',
    'loading_cycle',
    '授權日期',
    '授權週數',
    '卡號在網路交易註記的比例',
    '授權秒',
    '高金額',
    '新消費者',
    '國內消費比例',
    '是否符合網路消費習慣',
    '消費頻率_週期',
]

# Membership set for the two filters below; drop_col itself stays a list.
dropped = frozenset(drop_col)

# Keep only categorical columns that survive the drop list (order preserved).
cat_col = [name for name in cat_col if name not in dropped]

# Model input columns: everything in `train` that is not dropped, in frame order.
selected_col = [name for name in train.columns if name not in dropped]

In [6]:
print('creating dataset ...')

# CatBoost pools for the train / validation splits: selected feature columns,
# '盜刷註記' as the label, and the filtered categorical column names.
train_dataset = cb.Pool(
    data=train[selected_col],
    label=train['盜刷註記'],
    cat_features=cat_col,
)
val_dataset = cb.Pool(
    data=val[selected_col],
    label=val['盜刷註記'],
    cat_features=cat_col,
)

# Fixed 10 second pause after building the pools; the notebook does not state why.
time.sleep(10)
print('training model without public testset ...')

creating dataset ...
training model without public testset ...


In [7]:
# Hyper-parameters for the model trained on `train` only and evaluated on `val`.
model_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1, 220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
model = cb.CatBoostClassifier(**model_params)

# Fit on the training pool; the validation pool is what the "test:" column of
# the training log reports on.
model.fit(train_dataset, eval_set=val_dataset)
model.save_model('base_model_no_val.cbm')

0:	learn: 0.5782539	test: 0.4668833	best: 0.4668833 (0)	total: 1.12s	remaining: 9m 20s
1:	learn: 0.6758160	test: 0.6134949	best: 0.6134949 (1)	total: 2.35s	remaining: 9m 45s
2:	learn: 0.7084199	test: 0.6707339	best: 0.6707339 (2)	total: 3.78s	remaining: 10m 26s
3:	learn: 0.7186715	test: 0.6821933	best: 0.6821933 (3)	total: 5.5s	remaining: 11m 22s
4:	learn: 0.7274121	test: 0.6961058	best: 0.6961058 (4)	total: 6.9s	remaining: 11m 23s
5:	learn: 0.7377350	test: 0.7037736	best: 0.7037736 (5)	total: 8.23s	remaining: 11m 18s
6:	learn: 0.7521767	test: 0.7130036	best: 0.7130036 (6)	total: 9.53s	remaining: 11m 11s
7:	learn: 0.7649793	test: 0.7215777	best: 0.7215777 (7)	total: 10.9s	remaining: 11m 8s
8:	learn: 0.7728598	test: 0.7263875	best: 0.7263875 (8)	total: 11.8s	remaining: 10m 42s
9:	learn: 0.7817819	test: 0.7336743	best: 0.7336743 (9)	total: 12.8s	remaining: 10m 26s
10:	learn: 0.7919419	test: 0.7452944	best: 0.7452944 (10)	total: 13.7s	remaining: 10m 10s
11:	learn: 0.7974623	test: 0.748490

In [8]:
# Rebind train_dataset to a pool over train + val stacked row-wise. Selecting
# the feature columns before concatenating yields the same frame as selecting
# afterwards, without first materialising the dropped columns.
trainval_features = pd.concat([train[selected_col], val[selected_col]])
trainval_labels = pd.concat([train['盜刷註記'], val['盜刷註記']])
train_dataset = cb.Pool(
    data=trainval_features,
    label=trainval_labels,
    cat_features=cat_col,
)

# Public test pool, labelled with the answers looked up into `gt`.
test_dataset = cb.Pool(
    data=public_test[selected_col],
    label=gt,
    cat_features=cat_col,
)

In [9]:
# Same hyper-parameters as the first model; this run fits on the current
# train_dataset (train + val in this notebook) and evaluates on the public test pool.
model_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1, 220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
model = cb.CatBoostClassifier(**model_params)

# NOTE(review): the eval_set here is the public test set. If CatBoost's
# best-iteration selection (use_best_model) is active by default when an
# eval_set is supplied, the saved model is chosen using test labels - confirm
# that this is intended.
model.fit(train_dataset, eval_set=test_dataset)
model.save_model('base_model_with_val.cbm')

0:	learn: 0.5779750	test: 0.5314247	best: 0.5314247 (0)	total: 531ms	remaining: 4m 24s
1:	learn: 0.6717038	test: 0.6076459	best: 0.6076459 (1)	total: 1.76s	remaining: 7m 19s
2:	learn: 0.7049200	test: 0.6283452	best: 0.6283452 (2)	total: 3.18s	remaining: 8m 46s
3:	learn: 0.7295812	test: 0.6500574	best: 0.6500574 (3)	total: 4.55s	remaining: 9m 24s
4:	learn: 0.7394722	test: 0.6569536	best: 0.6569536 (4)	total: 5.88s	remaining: 9m 42s
5:	learn: 0.7544386	test: 0.6727798	best: 0.6727798 (5)	total: 7.07s	remaining: 9m 42s
6:	learn: 0.7641568	test: 0.6815221	best: 0.6815221 (6)	total: 8.16s	remaining: 9m 35s
7:	learn: 0.7787936	test: 0.6977237	best: 0.6977237 (7)	total: 9.2s	remaining: 9m 25s
8:	learn: 0.7853878	test: 0.7052933	best: 0.7052933 (8)	total: 10.1s	remaining: 9m 9s
9:	learn: 0.7908720	test: 0.7098399	best: 0.7098399 (9)	total: 11s	remaining: 8m 58s
10:	learn: 0.7999500	test: 0.7158926	best: 0.7158926 (10)	total: 11.8s	remaining: 8m 43s
11:	learn: 0.8037139	test: 0.7199854	best: 0.

In [10]:
print('training final model with public testset ...')

# Final training pool: train + val + public test stacked row-wise, with `gt`
# supplying the public-test labels. Selecting the feature columns before
# concatenating yields the same frame as selecting afterwards.
all_features = pd.concat(
    [train[selected_col], val[selected_col], public_test[selected_col]]
)
all_labels = pd.concat([train['盜刷註記'], val['盜刷註記'], gt])
train_dataset = cb.Pool(
    data=all_features,
    label=all_labels,
    cat_features=cat_col,
)

# Fixed 10 second pause before training; the notebook does not state why.
time.sleep(10)

# Same hyper-parameters as the earlier models.
final_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1, 220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
final_model = cb.CatBoostClassifier(**final_params)

# No eval_set: every labelled row is used for fitting, so the log shows only
# the "learn:" metric.
final_model.fit(train_dataset)
final_model.save_model('base_model_with_public.cbm')

training final model with public testset ...
0:	learn: 0.5657693	total: 1.92s	remaining: 15m 56s
1:	learn: 0.6519595	total: 3.94s	remaining: 16m 22s
2:	learn: 0.6903769	total: 5.61s	remaining: 15m 28s
3:	learn: 0.7149757	total: 7.17s	remaining: 14m 49s
4:	learn: 0.7301050	total: 8.73s	remaining: 14m 24s
5:	learn: 0.7451070	total: 10.1s	remaining: 13m 52s
6:	learn: 0.7530682	total: 11.6s	remaining: 13m 35s
7:	learn: 0.7663658	total: 12.9s	remaining: 13m 12s
8:	learn: 0.7757875	total: 14s	remaining: 12m 45s
9:	learn: 0.7821338	total: 15.1s	remaining: 12m 19s
10:	learn: 0.7890938	total: 16.3s	remaining: 12m 2s
11:	learn: 0.7992234	total: 17.3s	remaining: 11m 44s
12:	learn: 0.8058009	total: 18.3s	remaining: 11m 24s
13:	learn: 0.8141424	total: 19.1s	remaining: 11m 2s
14:	learn: 0.8196273	total: 20s	remaining: 10m 48s
15:	learn: 0.8269964	total: 21s	remaining: 10m 35s
16:	learn: 0.8322339	total: 22.1s	remaining: 10m 28s
17:	learn: 0.8368701	total: 23.3s	remaining: 10m 23s
18:	learn: 0.842264