In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import time
import os
import catboost as cb
from catboost import metrics
from Preprocess.df_formatting_and_extract import *
pd.set_option('display.max_columns', None)


In [2]:
# Load the two inputs of this notebook:
#   customer - table that is later joined to the transactions on 交易卡號
#   new_data - the transaction rows that get feature-engineered below
customer, new_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./prep/customer_label.csv', './datasets/private_2_processed.csv')
)

In [3]:
# Rename the customer table's columns positionally. The list must match the
# CSV's column order and count exactly (pandas raises on a length mismatch).
# English glosses are translations of the labels only.
customer.columns = [
    '交易卡號',          # card number
    '交易週期數',        # number of transaction cycles
    '交易總次數',        # total transaction count
    '第一次交易週期',    # first transaction cycle
    '交易天數',          # number of transaction days
    '第一次交易日期',    # first transaction date
    '最後一次交易日期',  # last transaction date
    '盜刷總數',          # total fraud count
    '消費總額',          # total spend
    '網路消費次數',      # online purchase count
    '國內消費次數',      # domestic purchase count
    '盜刷比例',          # fraud ratio
    '每日平均刷卡次數',  # average card swipes per day
    '每週期平均交易數',  # average transactions per cycle
    '可交易天數',        # available transaction days
    '可交易週期數',      # available transaction cycles
    '消費金額平均數',    # mean spend amount
    '國內消費比例',      # domestic spend ratio
    '消費頻率_週期',     # spend frequency per cycle
    '消費頻率_天',       # spend frequency per day
    '國內消費behavior',  # domestic-spend behaviour
    '網路交易behavior',  # online-transaction behaviour
    '零元比例',          # zero-amount ratio
]

In [4]:
# --- Normalise the raw transaction table -----------------------------------
# column_rename / extract_time / reformat come from the star import of
# Preprocess.df_formatting_and_extract; their exact behaviour is not visible
# in this notebook.
new_data.columns = column_rename(new_data)
new_data = extract_time(new_data)
# Chronological order (ties broken by card number), fresh 0..n-1 index, and
# -1 as the sentinel for every missing value before reformat() runs.
new_data = (
    new_data
    .sort_values(['授權日期','授權小時','授權分鐘','授權秒','交易卡號'])
    .reset_index(drop=True)
    .fillna(-1)
)
new_data = reformat(new_data)

# --- Simple per-row features -----------------------------------------------
new_data['高金額'] = (new_data['轉換後交易金額'] > 1000).astype('int8')  # 1 when amount > 1000
new_data['授權週數'] = new_data['授權日期'] // 7  # 7-day block index of the authorisation day
new_data['授權週日'] = new_data['授權日期'] % 7   # position of the day inside its 7-day block

new_data['loading_cycle'] = 16  # manual, the next one is 16

# Day-part label from the authorisation hour. np.select takes the FIRST
# matching condition, so listing the later ranges first reproduces the
# previous last-write-wins result on the shared boundaries:
#   6-12 -> 早上, 13-17 -> 下午, 18-23 -> 晚上, anything else -> 凌晨.
# Building the column in one assignment also avoids the chained
# `df[col].fillna(..., inplace=True)` call, which is a silent no-op under
# pandas Copy-on-Write.
hour = new_data['授權小時']
new_data['時段'] = np.select(
    [hour.between(18, 23), hour.between(13, 18), hour.between(6, 13)],
    ['晚上', '下午', '早上'],
    default='凌晨',
)

new_data['是否為國外消費'] = new_data['消費地國別'] != 0  # True when the country code is not 0
new_data['授權週日_時段'] = new_data['授權週日'].astype('str') + "_" + new_data['時段']
new_data['交易類別_交易型態'] = new_data['交易類別'].astype('str') + "_" + new_data['交易型態'].astype('str')

# Duplicate the date-derived columns under a num_ prefix.
for col in ['授權日期','授權週數','授權週日']:
    new_data[f"num_{col}"] = new_data[col]

# --- Per-card, per-day running counters ------------------------------------
# 0-based position of each transaction within its (card, day) group.
new_data['當天交易次數'] = new_data.groupby(['交易卡號','授權日期'])['轉換後交易金額'].cumcount()
# 1-based running count of zero-amount transactions within (card, day);
# rows with a non-zero amount get 0. Note this is a cumulative count, not a
# strictly "consecutive" one, despite the column name.
zero_amount_rank = (
    new_data[new_data['轉換後交易金額'] == 0]
    .groupby(['交易卡號','授權日期'])
    .cumcount() + 1
)
new_data['連續0元交易'] = zero_amount_rank.reindex(new_data.index).fillna(0)
new_data['零元交易'] = (new_data['轉換後交易金額'] == 0).astype('int8')


In [5]:
# For each categorical column, look up a precomputed value stored in
# ./prep/<col>消費總比例.csv (first CSV column = key, second = value).
# Keys that are absent from the table get 0.
for col in ['商戶類別代碼','消費城市','收單行代碼','特店代號']:
    share_table = pd.read_csv(f'./prep/{col}消費總比例.csv', index_col=0)
    share_by_value = share_table.iloc[:, 0].to_dict()
    # Assign the filled result back instead of calling fillna(inplace=True) on
    # the column selection, which does not update the frame under pandas
    # Copy-on-Write.
    new_data[f'{col}消費總比例'] = new_data[col].map(share_by_value).fillna(0)

In [6]:
# For each column, look up a precomputed per-(card, value) number from
# ./prep/卡號_<col>_mapping.pkl; pairs missing from the mapping get 0.
for col in tqdm(['時段','網路交易註記','商戶類別代碼','消費城市','收單行代碼','特店代號']):
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # mapping files produced by this project's own preprocessing.
    with open(f'./prep/卡號_{col}_mapping.pkl','rb') as f:
        mapping = pickle.load(f)
    # Iterate the two key columns directly rather than DataFrame.apply(axis=1),
    # which materialises a full row Series per transaction just to read two
    # fields. The lookups performed are the same.
    card_value_pairs = zip(new_data['交易卡號'], new_data[col])
    new_data[f'卡號在{col}的比例'] = [mapping.get(pair, 0) for pair in card_value_pairs]
    

100%|████████████████████████████████████████████████████████████████████████████| 6/6 [00:45<00:00,  7.58s/it]


In [7]:
# Cards present in the customer table.
card_list = set(customer['交易卡號'].unique())
# True for transactions whose card does not appear in the customer table.
# Vectorised membership test instead of a per-row Python lambda.
new_data['新消費者'] = ~new_data['交易卡號'].isin(card_list)


In [8]:
# Customer-level columns to attach to every transaction (join key first).
keep_col = [
    '交易卡號',
    '國內消費比例',
    '消費金額平均數',
    '盜刷比例',
    '消費頻率_週期',
    '消費頻率_天',
    '每日平均刷卡次數',
    '每週期平均交易數',
    '國內消費behavior',
    '網路交易behavior',
    '零元比例',
    '最後一次交易日期',
]
# Left join: cards absent from `customer` keep their rows and receive NaN in
# the attached columns.
new_data = new_data.merge(customer[keep_col], how='left', on='交易卡號')

In [9]:
# Defaults for customer statistics that are missing after the left join
# (cards not found in `customer`). Filling 消費金額平均數 / 每日平均刷卡次數
# with the customer-table median was tried as an alternative; constants are
# used instead.
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# column selection does not update the frame under pandas Copy-on-Write.
for col, fill_value in {
    '消費金額平均數': 0,
    '消費頻率_天': 1,
    '每日平均刷卡次數': 0,
    '零元比例': 0,
}.items():
    new_data[col] = new_data[col].fillna(fill_value)

In [10]:
# Ratio of this transaction's amount to the card's mean amount.
# NOTE(review): the column name reads "daily transaction count exceeds personal
# average", but the value is an amount ratio; the name is kept as-is because
# it is a runtime key. A zero mean yields inf (or NaN for 0/0).
new_data['當天交易次數超越個人平均'] = new_data['轉換後交易金額'].div(new_data['消費金額平均數'])

In [11]:
# Remaining defaults for rows that still have NaN after the customer join and
# the ratio computation. Each filled column is assigned back explicitly,
# because fillna(inplace=True) on a column selection does not update the frame
# under pandas Copy-on-Write.
for col in ['國內消費比例', '消費頻率_週期']:
    new_data[col] = new_data[col].fillna(1)

for col in ['盜刷比例', '每週期平均交易數', '當天交易次數超越個人平均']:
    new_data[col] = new_data[col].fillna(0)

# -1 marks "no known behaviour" and is tested for explicitly further down.
for col in ['國內消費behavior', '網路交易behavior']:
    new_data[col] = new_data[col].fillna(-1)

In [12]:
# Habit-match flags: 1 when the transaction's observed value equals the card's
# recorded behaviour value, 0 when it differs, -1 when the behaviour is the
# -1 sentinel (unknown).
# NOTE(review): the second pair compares 是否為國外消費 ("is foreign") with
# 國內消費behavior ("domestic behaviour") - confirm the two share the same
# polarity, otherwise this flag is inverted.
for flag_col, observed_col, habit_col in [
    ('是否符合網路消費習慣', '網路交易註記', '網路交易behavior'),
    ('是否符合國內外消費習慣', '是否為國外消費', '國內消費behavior'),
]:
    matches_habit = new_data[observed_col] == new_data[habit_col]
    new_data[flag_col] = matches_habit.astype('int8')
    new_data.loc[new_data[habit_col] == -1, flag_col] = -1

In [13]:
# Sanity check: list the columns that still contain missing values.
# The recorded output shows only 最後一次交易日期, which is never filled above.
new_data.columns[new_data.isnull().any()]

Index(['最後一次交易日期'], dtype='object')

In [14]:
# Load the feature list and the fitted model used for the `isolation` feature.
# NOTE(security): pickle.load can execute arbitrary code; only load artefacts
# produced by this project's own training step.
with open('./prep/columns_for_iso.pkl', 'rb') as columns_file:
    selected_col = pickle.load(columns_file)
with open('./prep/isolation_tree.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [15]:
# Expose the amount / personal-mean-amount ratio under a name that describes
# it ("multiple of personal average spend"); the original column is kept too.
amount_ratio = new_data['當天交易次數超越個人平均']
new_data['個人平均消費金額倍率'] = amount_ratio

In [16]:
# Replace +inf (e.g. a positive amount divided by a zero mean) with 0 across
# the whole frame.
# NOTE(review): -inf is left untouched; confirm amounts can never be negative.
new_data = new_data.replace(np.inf, 0)

In [17]:
# Score every transaction with the unpickled model on its stored feature list.
# Presumably a scikit-learn IsolationForest (judging by the file name), whose
# predict() labels rows 1 / -1 - TODO confirm.
isolation_inputs = new_data[selected_col]
new_data['isolation'] = clf.predict(isolation_inputs)

In [18]:
# Persist the engineered table without the index column.
# NOTE(review): this writes to the current working directory, while the input
# of the same file name was read from ./datasets/ - confirm that is intended.
new_data.to_csv(path_or_buf='private_2_processed.csv', index=False)