In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import catboost as cb
from catboost import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
import time
import os

In [2]:
# dtype specification loaded from disk and passed as `dtype=` to read_csv.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load a data_type.pkl that comes from a trusted source.
with open('./data_type.pkl', 'rb') as dtype_file:
    data_type = pickle.load(dtype_file)

print('loading data...')
train = pd.read_csv('./train_data_extracted.csv', dtype=data_type)

# NOTE(review): the reason for this 10 s pause is not evident from the
# notebook — confirm it is still needed.
time.sleep(10)

loading data...


In [3]:
# Remaining extracted tables, parsed with the same dtype specification as `train`.
val, public_test, private_test_1 = (
    pd.read_csv(csv_path, dtype=data_type)
    for csv_path in (
        './val_data_extracted.csv',
        './public_data_extracted.csv',
        './private1_data_extracted.csv',
    )
)

# Answer file for the public split, reduced to key + label and indexed by txkey.
public_ans = (
    pd.read_csv('./dataset_2nd/public.csv')[['txkey','label']]
    .set_index('txkey')
)

# Labels looked up by each public_test row's 交易序號, in public_test row order.
gt = public_ans.loc[public_test.set_index('交易序號').index,'label']

In [4]:
tables = [train,val,public_test,private_test_1]

for table in tables:
    # Both steps mutate the frame in place, so `train`, `val`, ... see the
    # changes without being rebound.
    table.fillna(0,inplace=True)

    # cumcount numbers the rows of each (card, authorisation date) group
    # 0, 1, 2, ... in row order: how many transactions the card already has
    # on that date within this table.
    same_card_same_day = table.groupby(['交易卡號','授權日期'])['盜刷註記']
    table['當天交易次數'] = same_card_same_day.cumcount()

In [5]:
from sklearn.ensemble import IsolationForest

# Columns withheld from the isolation forest (label, keys and a set of
# derived/categorical columns); every other column of `val` is a feature.
drop_col = [
    '盜刷註記', '交易序號',
    '顧客ID', '交易卡號',
    'num_授權日期', 'num_授權週數', 'num_授權週日',
    '特店代號', '收單行代碼', '時段', '授權週日_時段', '交易類別_交易型態', '新消費者',
    '特店代號_3碼', '收單行代碼_3碼',
]
selected_col = val.columns[~val.columns.isin(drop_col)].tolist()

# Fit on the training table only; the fixed random_state keeps the forest reproducible.
clf = IsolationForest(random_state=0)
clf.fit(train[selected_col])

tables = [train,val,public_test,private_test_1]

# Store the forest's prediction for every row of every table as a new
# 'isolation' column.
for table in tables:
    isolation_flag = clf.predict(table[selected_col])
    table['isolation'] = isolation_flag

In [6]:
from Preprocess.df_formatting_and_extract import *
# Stack every split into one frame so the history-based features below can
# look across split boundaries. ignore_index is not set, so each frame keeps
# its own row labels.
all_data = pd.concat([train,val,public_test,private_test_1])
del train,public_test,private_test_1
# `del` alone frees nothing while the `tables` list and the `table` loop
# variable from the earlier cells still reference the same frames; rebind
# them so the pre-concat copies can actually be garbage-collected.
# (`val` stays bound, exactly as before.)
tables = table = None
# create_date_list comes from the Preprocess star-import; it receives the
# sorted unique authorisation dates. The cell that follows indexes and
# iterates the result as a sequence of date groups.
date_list = create_date_list(sorted(all_data['授權日期'].unique()))

In [7]:
# For each date group after the first, give every row the card's mean number
# of transactions per active day, computed only from dates in earlier groups
# (no look-ahead). Rows in the first group, and cards with no history, are
# filled with 1 afterwards.
previous_dates = date_list[0].copy()  # copy: the list is extended in place below
for i in tqdm(range(1,len(date_list)),desc='每日平均刷卡次數'):
    # Build each date mask once per iteration instead of twice.
    history_mask = all_data['授權日期'].isin(previous_dates)
    current_mask = all_data['授權日期'].isin(date_list[i])

    # rows per (card, day) -> mean over days, per card
    mapping = (
        all_data[history_mask]
        .groupby(['交易卡號','授權日期'])['盜刷註記'].count()
        .groupby(level=0).mean()
        .to_dict()
    )
    all_data.loc[current_mask,'每日平均刷卡次數'] = all_data.loc[current_mask,'交易卡號'].map(mapping)
    previous_dates+=date_list[i]

# Assign the result instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form warns on pandas 2.x and does not modify
# the frame under Copy-on-Write / pandas 3.
all_data['每日平均刷卡次數'] = all_data['每日平均刷卡次數'].fillna(1)
# Ratio of the running same-day count to the card's historical daily mean.
all_data['當天交易次數超越個人平均'] = all_data['當天交易次數']/all_data['每日平均刷卡次數']


weekly_刷卡_count: 100%|██████████| 15/15 [02:29<00:00,  9.97s/it]


In [8]:
def matching(x,mapping):
    """Return the gap between a row's date and the card's mapped date.

    Args:
        x: Row-like object supporting ``x['交易卡號']`` and ``x['授權日期']``.
        mapping: Dict from card number to a date value.

    Returns:
        ``x['授權日期'] - mapping[card]`` when the card is a key of
        ``mapping``; ``0`` otherwise.
    """
    card = x['交易卡號']
    if card in mapping:
        return x['授權日期'] - mapping[card]
    return 0
# Days since the card's most recent earlier transaction day, filled in one
# day at a time using only strictly earlier days (no look-ahead).
# A card with no earlier transaction gets 0.
# NOTE(review): rows on day 0 are never assigned and therefore stay NaN,
# unlike first-time cards on later days (0) — confirm this is intended.
previous_dates = [0]
for i in tqdm(range(1,all_data['授權日期'].max()+1)):
    # Latest earlier day on which each card appeared.
    mapping = all_data[all_data['授權日期'].isin(previous_dates)].groupby(['交易卡號',])['授權日期'].max().to_dict()

    today = all_data['授權日期'].isin([i])
    # Vectorised replacement for the row-wise apply(matching, axis=1): map
    # each card to its last-seen day (NaN when unseen), subtract from the
    # current day, and turn "unseen" into 0. Same values, without a Python
    # call per row.
    last_seen = all_data.loc[today,'交易卡號'].map(mapping).astype(float)
    all_data.loc[today,'距離上次刷卡天數'] = (i - last_seen).fillna(0)

    previous_dates+=[i]


100%|██████████| 64/64 [08:00<00:00,  7.50s/it]


In [9]:
# Re-split the enriched frame by authorisation day:
#   0-51 train, 52-55 validation, 56-59 public test, 60-64 private test.
train = all_data[all_data['授權日期'].isin(range(0,52))]
val = all_data[all_data['授權日期'].isin(range(52,56))]
public_test = all_data[all_data['授權日期'].isin(range(56,60))]
private_test_1 = all_data[all_data['授權日期'].isin(range(60,65))]
del all_data

# `gt` was built against the row order of the public_test frame loaded from
# CSV. public_test is now a date-based slice of the concatenated frame, so
# look the labels up again by transaction key: a positional mismatch would
# otherwise go unnoticed, whereas a missing key raises here.
gt = public_ans.loc[pd.Index(public_test['交易序號']),'label']

In [10]:
# Candidate categorical features (filtered against drop_col below).
cat_col = [
    '顧客ID', '交易卡號',
    '授權日期', '授權週數',
    '授權週日', '時段',
    '授權小時', '授權分鐘', '授權秒',
    '交易類別', '交易型態', '特店代號', '收單行代碼',
    '商戶類別代碼', '分期期數', '消費地國別', '消費城市', '狀態碼', '支付型態', '消費地幣別',
    '是否符合網路消費習慣', '是否符合國內外消費習慣',
    '授權週日_時段', '交易類別_交易型態',
    '新消費者',
]

# Columns excluded from the model input: the label, the transaction key and
# a set of features that are left out.
drop_col = [
    '盜刷註記', '交易序號', 'loading_cycle',
    '授權日期', '授權週數',
    '卡號在網路交易註記的比例',
    '授權秒',
    '高金額',
    '新消費者',
    '國內消費比例',
    '是否符合網路消費習慣',
    '消費頻率_週期',
]

# Keep only categorical names that survive the drop list, in their original order.
cat_col = list(filter(lambda name: name not in drop_col, cat_col))
# Model inputs: every column of `train` that is not dropped, in column order.
selected_col = train.columns[~train.columns.isin(drop_col)].tolist()

In [11]:
# Training pool: train + validation rows, with their own 盜刷註記 labels.
train_dataset = cb.Pool(
    data=pd.concat([train,val])[selected_col],
    label=pd.concat([train['盜刷註記'],val['盜刷註記']]),
    cat_features=cat_col,
)
# Evaluation pool: the public split, labelled from the answer file (`gt`).
test_dataset = cb.Pool(
    data=public_test[selected_col],
    label=gt,
    cat_features=cat_col,
)

In [14]:
# Hyper-parameters for the model trained on train + validation.
model_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1,220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
model = cb.CatBoostClassifier(**model_params)

# F1 on the public-split pool is reported for every iteration.
# NOTE(review): when an eval_set is given CatBoost's default use_best_model
# keeps the best-on-eval iteration — confirm that selecting on the public
# split is intended.
model.fit(train_dataset,eval_set=test_dataset)
model.save_model('best_model_with_val.cbm')

0:	learn: 0.5911775	test: 0.5299490	best: 0.5299490 (0)	total: 699ms	remaining: 5m 48s
1:	learn: 0.6594740	test: 0.5932958	best: 0.5932958 (1)	total: 1.83s	remaining: 7m 35s
2:	learn: 0.6885030	test: 0.6236259	best: 0.6236259 (2)	total: 3.11s	remaining: 8m 35s
3:	learn: 0.7131001	test: 0.6518567	best: 0.6518567 (3)	total: 4.44s	remaining: 9m 10s
4:	learn: 0.7320068	test: 0.6716767	best: 0.6716767 (4)	total: 5.81s	remaining: 9m 35s
5:	learn: 0.7480429	test: 0.6869759	best: 0.6869759 (5)	total: 6.97s	remaining: 9m 34s
6:	learn: 0.7605201	test: 0.6916307	best: 0.6916307 (6)	total: 8.21s	remaining: 9m 38s
7:	learn: 0.7702936	test: 0.6991104	best: 0.6991104 (7)	total: 9.22s	remaining: 9m 27s
8:	learn: 0.7796441	test: 0.7088877	best: 0.7088877 (8)	total: 10.2s	remaining: 9m 16s
9:	learn: 0.7917379	test: 0.7185305	best: 0.7185305 (9)	total: 11.3s	remaining: 9m 14s
10:	learn: 0.8029972	test: 0.7262143	best: 0.7262143 (10)	total: 12.2s	remaining: 9m 3s
11:	learn: 0.8117817	test: 0.7302752	best:

In [15]:
print('training final model with public testset ...')

# Final training pool: train + validation + public rows. The public rows take
# their labels from `gt`, the others from their own 盜刷註記 column.
train_dataset = cb.Pool(
    data=pd.concat([train,val,public_test])[selected_col],
    label=pd.concat([train['盜刷註記'],val['盜刷註記'],gt]),
    cat_features=cat_col,
)
# NOTE(review): the reason for this 10 s pause is not evident from the
# notebook — confirm it is still needed.
time.sleep(10)

# Same hyper-parameters as the train + validation model.
final_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1,220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
final_model = cb.CatBoostClassifier(**final_params)

# No eval_set here: the model trains for the full iteration budget.
final_model.fit(train_dataset)
final_model.save_model('best_model_with_public.cbm')

training final model with public testset ...
0:	learn: 0.5837867	total: 735ms	remaining: 6m 6s
1:	learn: 0.6623498	total: 1.94s	remaining: 8m 3s
2:	learn: 0.6928045	total: 3.31s	remaining: 9m 8s
3:	learn: 0.7110178	total: 4.66s	remaining: 9m 37s
4:	learn: 0.7322240	total: 6.18s	remaining: 10m 11s
5:	learn: 0.7447824	total: 7.46s	remaining: 10m 14s
6:	learn: 0.7498489	total: 8.73s	remaining: 10m 14s
7:	learn: 0.7607828	total: 10s	remaining: 10m 17s
8:	learn: 0.7694387	total: 11.2s	remaining: 10m 12s
9:	learn: 0.7769552	total: 12.3s	remaining: 10m
10:	learn: 0.7847177	total: 13.3s	remaining: 9m 52s
11:	learn: 0.7963742	total: 14.4s	remaining: 9m 44s
12:	learn: 0.8060800	total: 15.3s	remaining: 9m 33s
13:	learn: 0.8145112	total: 16.1s	remaining: 9m 18s
14:	learn: 0.8210893	total: 16.8s	remaining: 9m 2s
15:	learn: 0.8281810	total: 17.6s	remaining: 8m 52s
16:	learn: 0.8342605	total: 18.5s	remaining: 8m 46s
17:	learn: 0.8411757	total: 19.3s	remaining: 8m 37s
18:	learn: 0.8473707	total: 20.2s