In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import catboost as cb
from catboost import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
import time
import os

In [2]:
with open('./data_type.pkl','rb') as f:
    data_type = pickle.load(f)
print('loading data...')
train = pd.read_csv('./train_data_extracted.csv',dtype=data_type)
time.sleep(10)

loading data...


In [3]:
val = pd.read_csv('./val_data_extracted.csv',dtype=data_type)
public_test = pd.read_csv('./public_data_extracted.csv',dtype=data_type)
private_test_1 = pd.read_csv('./private1_data_extracted.csv',dtype=data_type)
public_ans = pd.read_csv('./dataset_2nd/public.csv')
public_ans = public_ans[['txkey','label']]
public_ans.set_index('txkey',inplace=True)
gt = public_ans.loc[public_test.set_index('交易序號').index,'label']

In [4]:
tables = [train,val,public_test,private_test_1]

for table in tables:
    table.fillna(0,inplace=True)

In [5]:
from sklearn.ensemble import IsolationForest

drop_col = ['盜刷註記','交易序號',
            '顧客ID', '交易卡號',
            'num_授權日期', 'num_授權週數', 'num_授權週日',
            '特店代號','收單行代碼','時段','授權週日_時段','交易類別_交易型態','新消費者',
            '特店代號_3碼','收單行代碼_3碼'
            ]
selected_col = [col for col in val.columns if col not in drop_col]

clf = IsolationForest(random_state=0).fit(train[selected_col])

tables = [train,val,public_test,private_test_1]

for table in tables:
    table['isolation'] = clf.predict(table[selected_col])

In [6]:
tables = [train,val,public_test,private_test_1]

for table in tables:
    table['當天交易次數'] = table.groupby(['交易卡號','授權日期'])['盜刷註記'].cumcount()
    table['零元交易'] = (table['轉換後交易金額']==0).astype('int8')
    table['連續0元交易'] = table[table['轉換後交易金額']==0].groupby(['交易卡號','授權日期']).cumcount()+1

In [7]:
from Preprocess.df_formatting_and_extract import *
all_data = pd.concat([train,val,public_test,private_test_1])
del train,public_test,private_test_1
date_list = create_date_list(sorted(all_data['授權日期'].unique()))

In [8]:
previous_dates = date_list[0].copy()
for i in tqdm(range(1,len(date_list)),desc='零元比例'):
    mapping = all_data[all_data['授權日期'].isin(previous_dates)].groupby('交易卡號')['零元交易'].mean().to_dict()
    all_data.loc[all_data['授權日期'].isin(date_list[i]),'零元比例']=all_data[all_data['授權日期'].isin(date_list[i])]['交易卡號'].map(mapping)
    previous_dates+=date_list[1]

零元比例:   0%|          | 0/15 [00:00<?, ?it/s]

零元比例: 100%|██████████| 15/15 [00:39<00:00,  2.62s/it]


In [9]:
train = all_data[all_data['授權日期'].isin(range(0,52))]
val = all_data[all_data['授權日期'].isin(range(52,56))]
public_test = all_data[all_data['授權日期'].isin(range(56,60))]
private_test_1 = all_data[all_data['授權日期'].isin(range(60,65))]
del all_data

In [10]:
cat_col = ['顧客ID', '交易卡號',
        '授權日期','授權週數','4_day_cycle','4_day_count',
        '授權週日','時段',
        '授權小時','授權分鐘','授權秒', 
        '交易類別', '交易型態', '特店代號', '收單行代碼',
        '商戶類別代碼', '分期期數', '消費地國別', '消費城市', '狀態碼', '支付型態', '消費地幣別',
        '是否符合網路消費習慣','是否符合國內外消費習慣',
        '授權週日_時段','交易類別_交易型態',
        '新消費者',
            ]

drop_col = ['盜刷註記','交易序號',
            '授權日期','授權週數','4_day_cycle','4_day_count',
            '個人消費金額中位數倍率',
            '個人平均消費金額倍率',
            '卡號在網路交易註記的比例',
            # '授權分鐘',
            '授權秒',
            '是否符合網路消費習慣',
            '高金額',
            '新消費者',
            '國內消費比例',
            ]

cat_col = [col for col in cat_col if col not in drop_col]
selected_col = [col for col in train.columns if col not in drop_col]

print('creating dataset ...')
train_dataset = cb.Pool(pd.concat([train,val])[selected_col], pd.concat([train['盜刷註記'],val['盜刷註記']]),cat_features=cat_col) 
test_dataset = cb.Pool(public_test[selected_col], gt,cat_features=cat_col) 
time.sleep(10)
print('training model without public testset ...')

creating dataset ...
training model without public testset ...


In [11]:
model = cb.CatBoostClassifier(iterations=500,learning_rate=0.2,depth=12,loss_function='Logloss',
                              grow_policy='Depthwise',
                              one_hot_max_size=10,
                              class_weights=[1,220],
                              task_type='GPU',
                              random_seed=5,
                              l2_leaf_reg=7,
                              eval_metric=metrics.F1(use_weights=False)
                              ) #class weight 220 best
model.fit(train_dataset,eval_set=test_dataset)
model.save_model('zero_focus_model_with_val.cbm')

0:	learn: 0.5163492	test: 0.4767008	best: 0.4767008 (0)	total: 447ms	remaining: 3m 43s
1:	learn: 0.5803277	test: 0.5361342	best: 0.5361342 (1)	total: 861ms	remaining: 3m 34s
2:	learn: 0.6019724	test: 0.5737379	best: 0.5737379 (2)	total: 1.2s	remaining: 3m 18s
3:	learn: 0.6237788	test: 0.5834900	best: 0.5834900 (3)	total: 1.6s	remaining: 3m 18s
4:	learn: 0.6401444	test: 0.6041635	best: 0.6041635 (4)	total: 1.96s	remaining: 3m 14s
5:	learn: 0.6499105	test: 0.6153142	best: 0.6153142 (5)	total: 2.33s	remaining: 3m 12s
6:	learn: 0.6572028	test: 0.6177689	best: 0.6177689 (6)	total: 2.69s	remaining: 3m 9s
7:	learn: 0.6599241	test: 0.6246312	best: 0.6246312 (7)	total: 3.08s	remaining: 3m 9s
8:	learn: 0.6682061	test: 0.6331230	best: 0.6331230 (8)	total: 3.43s	remaining: 3m 7s
9:	learn: 0.6730465	test: 0.6352420	best: 0.6352420 (9)	total: 3.76s	remaining: 3m 4s
10:	learn: 0.6796986	test: 0.6358747	best: 0.6358747 (10)	total: 4.1s	remaining: 3m 2s
11:	learn: 0.6844719	test: 0.6350711	best: 0.6358

In [13]:
print('training final model with public testset ...')
train_dataset = cb.Pool(pd.concat([train,val,public_test])[selected_col], pd.concat([train['盜刷註記'],val['盜刷註記'],gt]),cat_features=cat_col) 
time.sleep(10)
final_model = cb.CatBoostClassifier(iterations=500,learning_rate=0.2,depth=12,loss_function='Logloss',
                              grow_policy='Depthwise',
                              one_hot_max_size=10,
                              class_weights=[1,220],
                              task_type='GPU',
                              random_seed=5,
                              l2_leaf_reg=7,
                              eval_metric=metrics.F1(use_weights=False)
                              ) #class weight 220 best
final_model.fit(train_dataset)
final_model.save_model('zero_focus_model_with_public.cbm')

training final model with public testset ...
0:	learn: 0.5170145	total: 439ms	remaining: 3m 39s
1:	learn: 0.5661084	total: 805ms	remaining: 3m 20s
2:	learn: 0.6032405	total: 1.16s	remaining: 3m 12s
3:	learn: 0.6284373	total: 1.53s	remaining: 3m 9s
4:	learn: 0.6321904	total: 1.91s	remaining: 3m 8s
5:	learn: 0.6429025	total: 2.29s	remaining: 3m 8s
6:	learn: 0.6528407	total: 2.7s	remaining: 3m 10s
7:	learn: 0.6572501	total: 3.08s	remaining: 3m 9s
8:	learn: 0.6639639	total: 3.4s	remaining: 3m 5s
9:	learn: 0.6748410	total: 3.77s	remaining: 3m 4s
10:	learn: 0.6852789	total: 4.15s	remaining: 3m 4s
11:	learn: 0.6928983	total: 4.52s	remaining: 3m 3s
12:	learn: 0.7015104	total: 4.93s	remaining: 3m 4s
13:	learn: 0.7083273	total: 5.27s	remaining: 3m 2s
14:	learn: 0.7210128	total: 5.65s	remaining: 3m 2s
15:	learn: 0.7269377	total: 5.97s	remaining: 3m
16:	learn: 0.7331336	total: 6.34s	remaining: 3m
17:	learn: 0.7382507	total: 6.69s	remaining: 2m 59s
18:	learn: 0.7441065	total: 7s	remaining: 2m 57s
1