In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import catboost as cb
from catboost import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
import time
import os

In [2]:
# The dtype spec handed to read_csv is stored as a pickled object.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a data_type.pkl that comes from a trusted source.
with open('./data_type.pkl', 'rb') as dtype_file:
    data_type = pickle.load(dtype_file)

print('loading data...')
train = pd.read_csv('./train_data_extracted.csv', dtype=data_type)

# NOTE(review): the reason for this 10-second pause is not visible in the
# notebook -- confirm its purpose before removing it.
time.sleep(10)

loading data...


In [3]:
# Remaining extracted splits, parsed with the same dtype spec as the train file.
val = pd.read_csv('./val_data_extracted.csv', dtype=data_type)
public_test = pd.read_csv('./public_data_extracted.csv', dtype=data_type)
private_test_1 = pd.read_csv('./private1_data_extracted.csv', dtype=data_type)

# Public-set answers: keep only the key and label columns, indexed by 'txkey'
# so labels can be looked up by key.
public_ans = pd.read_csv('./dataset_2nd/public.csv')[['txkey', 'label']].set_index('txkey')

# One label per public_test row, looked up through that row's '交易序號' value,
# in public_test row order.
public_txkeys = public_test.set_index('交易序號').index
gt = public_ans.loc[public_txkeys, 'label']

In [4]:
# All four splits share one list so the same preprocessing can be applied to
# each of them (the list is reused by the following cell).
tables = [train, val, public_test, private_test_1]

# Replace every missing value with 0, modifying each frame in place so the
# train / val / public_test / private_test_1 names see the change.
for frame in tables:
    frame.fillna(0, inplace=True)

In [5]:
# Both counters below are computed within (交易卡號, 授權日期) groups.
card_day_keys = ['交易卡號', '授權日期']

for table in tables:
    # 0-based running position of each row inside its (card, date) group.
    table['當天交易次數'] = table.groupby(card_day_keys)['盜刷註記'].cumcount()

    # 1-based running count taken over zero-amount rows only. Rows whose
    # '轉換後交易金額' is non-zero are absent from the right-hand Series, so the
    # index-aligned assignment leaves them as NaN in the new column.
    zero_amount_rows = table[table['轉換後交易金額'] == 0]
    table['連續0元交易'] = zero_amount_rows.groupby(card_day_keys).cumcount() + 1

In [6]:
# NOTE(review): wildcard import left as-is so every name it provides stays
# available; within this cell only create_date_list is used from it.
from Preprocess.df_formatting_and_extract import *

# Stack the four splits into one frame so per-card history can be computed
# across split boundaries.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Each split
# was read without an index column, so without this flag their row labels
# (0, 1, 2, ...) repeat in all_data and any label-aligned assignment on it
# becomes ambiguous.
all_data = pd.concat([train, val, public_test, private_test_1], ignore_index=True)

# Drop the per-split frames that were just copied into all_data (val is left
# bound, exactly as before).
del train, public_test, private_test_1

# create_date_list comes from the wildcard import above; it receives the sorted
# distinct '授權日期' values.
date_list = create_date_list(sorted(all_data['授權日期'].unique()))

In [7]:
# Expanding-window feature: rows whose date falls in date_list[i] receive their
# card's mean per-day count computed only from dates in earlier entries of
# date_list. Rows in date_list[0], and cards with no earlier rows, get no value
# here and stay NaN.
previous_dates = date_list[0].copy()
for i in tqdm(range(1, len(date_list)), desc='weekly_刷卡_count'):
    # Per (card, date): number of non-null '盜刷註記' values among history rows.
    history = all_data[all_data['授權日期'].isin(previous_dates)]
    per_card_day = history.groupby(['交易卡號', '授權日期'])['盜刷註記'].count()
    # Per card: mean of those per-day counts, as a plain dict for Series.map.
    mapping = per_card_day.groupby(level=0).mean().to_dict()

    # Rows belonging to the current entry of date_list; the mask is built once
    # and used for both the lookup and the assignment target.
    in_current_block = all_data['授權日期'].isin(date_list[i])
    all_data.loc[in_current_block, '每日平均刷卡次數'] = (
        all_data.loc[in_current_block, '交易卡號'].map(mapping)
    )

    # The dates just processed become history for the next iteration.
    previous_dates += date_list[i]


weekly_刷卡_count: 100%|██████████| 15/15 [02:31<00:00, 10.09s/it]


In [8]:
# Rows without a mapped average (NaN in '每日平均刷卡次數') default to 1.
# The filled column is assigned back rather than calling
# all_data[col].fillna(..., inplace=True): that form operates on a temporary
# Series, triggers a chained-assignment warning in recent pandas, and leaves
# the frame unchanged when Copy-on-Write is enabled.
all_data['每日平均刷卡次數'] = all_data['每日平均刷卡次數'].fillna(1)

# Ratio of the running same-day count to the card's average daily count.
all_data['當天交易次數超越個人平均'] = all_data['當天交易次數'] / all_data['每日平均刷卡次數']

In [9]:
# Cut the enriched frame back into the four splits by '授權日期' value.
auth_date = all_data['授權日期']
train = all_data[auth_date.isin(range(0, 52))]            # dates 0..51
val = all_data[auth_date.isin(range(52, 56))]             # dates 52..55
public_test = all_data[auth_date.isin(range(56, 60))]     # dates 56..59
private_test_1 = all_data[auth_date.isin(range(60, 65))]  # dates 60..64

# The combined frame (and the helper column reference) are no longer needed.
del auth_date
del all_data

In [10]:
# Candidate categorical feature columns (filtered against drop_col below).
cat_col = [
    '顧客ID',
    '交易卡號',
    '授權日期',
    '授權週數',
    '授權週日',
    '時段',
    '授權小時',
    '授權分鐘',
    '授權秒',
    '交易類別',
    '交易型態',
    '特店代號',
    '收單行代碼',
    '商戶類別代碼',
    '分期期數',
    '消費地國別',
    '消費城市',
    '狀態碼',
    '支付型態',
    '消費地幣別',
    '是否符合網路消費習慣',
    '是否符合國內外消費習慣',
    '授權週日_時段',
    '交易類別_交易型態',
    '新消費者',
]

# Columns kept out of the model inputs (includes the label column '盜刷註記').
drop_col = [
    '盜刷註記',
    '交易序號',
    'loading_cycle',
    '授權日期',
    '授權週數',
    '卡號在網路交易註記的比例',
    '授權秒',
    '高金額',
    '新消費者',
    '國內消費比例',
    '是否符合網路消費習慣',
    # '消費頻率_週期'
]

# Set used only for membership tests; both result lists keep their source order.
excluded = set(drop_col)
cat_col = [name for name in cat_col if name not in excluded]
selected_col = [name for name in train.columns if name not in excluded]

In [11]:
# Training pool: train and val rows stacked, restricted to the selected
# feature columns, with the '盜刷註記' column as label.
train_dataset = cb.Pool(
    data=pd.concat([train[selected_col], val[selected_col]]),
    label=pd.concat([train['盜刷註記'], val['盜刷註記']]),
    cat_features=cat_col,
)

# Evaluation pool: public_test features with the looked-up public labels.
# NOTE(review): gt is passed positionally as the label vector, so this assumes
# public_test still has the same rows in the same order as when gt was built
# -- confirm.
test_dataset = cb.Pool(
    data=public_test[selected_col],
    label=gt,
    cat_features=cat_col,
)

# NOTE(review): purpose of this 10-second pause is not evident here -- confirm.
time.sleep(10)

In [13]:
# Hyper-parameters for the classifier evaluated against the public test pool.
model_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1, 220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
model = cb.CatBoostClassifier(**model_params)

# Fit on the training pool while reporting the eval metric on test_dataset.
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(train_dataset, eval_set=test_dataset)

0:	learn: 0.5793235	test: 0.4941898	best: 0.4941898 (0)	total: 664ms	remaining: 5m 31s
1:	learn: 0.6621540	test: 0.5980138	best: 0.5980138 (1)	total: 1.81s	remaining: 7m 29s
2:	learn: 0.6936752	test: 0.6185663	best: 0.6185663 (2)	total: 3.08s	remaining: 8m 30s
3:	learn: 0.7151079	test: 0.6504251	best: 0.6504251 (3)	total: 4.58s	remaining: 9m 27s
4:	learn: 0.7310081	test: 0.6603491	best: 0.6603491 (4)	total: 5.91s	remaining: 9m 44s
5:	learn: 0.7440684	test: 0.6719702	best: 0.6719702 (5)	total: 7.25s	remaining: 9m 57s
6:	learn: 0.7580126	test: 0.6827420	best: 0.6827420 (6)	total: 8.43s	remaining: 9m 53s
7:	learn: 0.7705681	test: 0.6964129	best: 0.6964129 (7)	total: 9.49s	remaining: 9m 43s
8:	learn: 0.7811267	test: 0.7073474	best: 0.7073474 (8)	total: 10.5s	remaining: 9m 31s
9:	learn: 0.7906523	test: 0.7147234	best: 0.7147234 (9)	total: 11.5s	remaining: 9m 23s
10:	learn: 0.8000600	test: 0.7219086	best: 0.7219086 (10)	total: 12.4s	remaining: 9m 13s
11:	learn: 0.8076409	test: 0.7249817	best

<catboost.core.CatBoostClassifier at 0x2c32e437940>

In [14]:
# Write the fitted `model` (trained on the train+val pool) to a file in the
# current working directory.
model.save_model('transaction_focus_model_with_val.cbm')

In [15]:
print('training final model with public testset ...')

# Rebuild the training pool from train + val + public_test. The public rows
# take their labels from gt; the other two splits use their '盜刷註記' column.
# This rebinds train_dataset, replacing the earlier train+val pool.
train_dataset = cb.Pool(
    data=pd.concat([train[selected_col], val[selected_col], public_test[selected_col]]),
    label=pd.concat([train['盜刷註記'], val['盜刷註記'], gt]),
    cat_features=cat_col,
)

# NOTE(review): purpose of this 10-second pause is not evident here -- confirm.
time.sleep(10)

# Same hyper-parameters as the model evaluated on the public set.
final_params = dict(
    iterations=500,
    learning_rate=0.2,
    depth=16,
    loss_function='Logloss',
    grow_policy='Depthwise',
    one_hot_max_size=10,
    class_weights=[1, 220],  # class weight 220 best
    task_type='GPU',
    random_seed=5,
    l2_leaf_reg=7,
    eval_metric=metrics.F1(use_weights=False),
)
final_model = cb.CatBoostClassifier(**final_params)

# No eval_set here: every labelled row is used for fitting.
final_model.fit(train_dataset)
final_model.save_model('transaction_focus_model_with_public.cbm')

training final model with public testset ...
0:	learn: 0.5755451	total: 651ms	remaining: 5m 24s
1:	learn: 0.6539425	total: 1.88s	remaining: 7m 47s
2:	learn: 0.6963734	total: 3.25s	remaining: 8m 58s
3:	learn: 0.7108713	total: 4.55s	remaining: 9m 23s
4:	learn: 0.7281313	total: 5.95s	remaining: 9m 49s
5:	learn: 0.7394491	total: 7.19s	remaining: 9m 51s
6:	learn: 0.7548301	total: 8.32s	remaining: 9m 46s
7:	learn: 0.7673791	total: 9.36s	remaining: 9m 35s
8:	learn: 0.7761896	total: 10.4s	remaining: 9m 29s
9:	learn: 0.7864954	total: 11.5s	remaining: 9m 22s
10:	learn: 0.7942951	total: 12.6s	remaining: 9m 21s
11:	learn: 0.8019103	total: 13.4s	remaining: 9m 6s
12:	learn: 0.8084633	total: 14.1s	remaining: 8m 49s
13:	learn: 0.8143269	total: 14.9s	remaining: 8m 35s
14:	learn: 0.8201000	total: 15.5s	remaining: 8m 21s
15:	learn: 0.8265626	total: 16.3s	remaining: 8m 13s
16:	learn: 0.8312348	total: 17.1s	remaining: 8m 5s
17:	learn: 0.8367547	total: 17.9s	remaining: 7m 59s
18:	learn: 0.8437882	total: 18.