In [127]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import time
import os
from Preprocess.df_formatting_and_extract import *
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [128]:
# Read the three raw CSV files, in the same order as the names on the left.
train, public_test, private_1 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset_1st/training.csv',
        './dataset_2nd/public.csv',
        './dataset_2nd/private_1_processed.csv',
    )
)

In [129]:
# Stack the three frames row-wise. Each frame keeps its own row labels
# because ignore_index is left at its default.
source_frames = (train, public_test, private_1)
all_data = pd.concat(source_frames, axis=0)

In [130]:
# Apply the project's column renaming to both frames.
# (column_rename / extract_time / reformat come from the wildcard import of
# Preprocess.df_formatting_and_extract; their exact behaviour is not visible here.)
train.columns = column_rename(train)
all_data.columns = column_rename(all_data)

# Order rows by day, hour, minute, second, then card number; renumber the index
# from 0; replace every missing value with the sentinel -1; finally run reformat.
chronological_keys = ['授權日期', '授權小時', '授權分鐘', '授權秒', '交易卡號']
all_data = (
    extract_time(all_data)
    .sort_values(chronological_keys)
    .reset_index(drop=True)
    .fillna(-1)
    .pipe(reformat)
)

In [131]:
# Cut-off = 80th percentile of the transaction amount over training rows whose
# label equals 0.
non_fraud_amounts = train.loc[train['盜刷註記'] == 0, '轉換後交易金額']
high_amount_cutoff = non_fraud_amounts.quantile(q=.80)

# 1 when the transaction amount is strictly above the cut-off, else 0.
all_data['高金額'] = (all_data['轉換後交易金額'] > high_amount_cutoff).astype('int8')

# The raw frames are no longer referenced below; drop the names.
del train, public_test, private_1

In [132]:
# Week index and position inside the week, both derived from the day number.
all_data['授權週數'] = all_data['授權日期'] // 7
all_data['授權週日'] = all_data['授權日期'] % 7

# Time-of-day bucket from the authorisation hour:
#   6 <= h < 13   -> '早上'
#   13 <= h < 18  -> '下午'
#   18 <= h <= 23 -> '晚上'
#   anything else -> '凌晨'
# The ranges are written as disjoint intervals so the result does not depend on
# assignment order. The whole column is assigned in one statement rather than
# via a chained `all_data['時段'].fillna(..., inplace=True)`, which pandas 2.x
# warns about and which leaves the frame unchanged under Copy-on-Write.
auth_hour = all_data['授權小時']
all_data['時段'] = np.select(
    [
        (auth_hour >= 6) & (auth_hour < 13),
        (auth_hour >= 13) & (auth_hour < 18),
        (auth_hour >= 18) & (auth_hour <= 23),
    ],
    ['早上', '下午', '晚上'],
    default='凌晨',
)

# 1 when the country code differs from 0, else 0.
all_data['是否為國外消費'] = (all_data['消費地國別'] != 0).astype('int8')

In [133]:
# String keys built by joining two columns with an underscore.
weekday_text = all_data['授權週日'].astype(str)
all_data['授權週日_時段'] = weekday_text + "_" + all_data['時段']

txn_category_text = all_data['交易類別'].astype(str)
txn_type_text = all_data['交易型態'].astype(str)
all_data['交易類別_交易型態'] = txn_category_text + "_" + txn_type_text

# 1 when the transaction amount is exactly zero, else 0.
all_data['零元交易'] = all_data['轉換後交易金額'].eq(0).astype('int8')

In [None]:
# Make sure the output folder exists. exist_ok=True makes this a single call
# that is safe to re-run, with no separate existence check.
os.makedirs('prep', exist_ok=True)

In [134]:
# For each listed column, write the share of rows taken by every distinct value
# (value_counts with normalize=True) to its own CSV under ./prep.
share_columns = ['商戶類別代碼','消費城市','收單行代碼','特店代號']
for col in share_columns:
    value_share = all_data[col].value_counts(normalize=True)
    value_share.to_frame().to_csv(f'./prep/{col}消費總比例.csv')


In [135]:
# Distinct authorisation days in ascending order, handed to create_date_list.
# (create_date_list comes from the wildcard project import; presumably it groups
# the days into cycles - confirm against the helper.)
distinct_auth_days = sorted(all_data['授權日期'].unique())
date_list = create_date_list(distinct_auth_days)

In [136]:
# Tag each row with the position of the date_list entry that contains its day.
for cycle_no, cycle_days in enumerate(date_list):
    in_this_cycle = all_data['授權日期'].isin(cycle_days)
    all_data.loc[in_this_cycle, 'loading_cycle'] = cycle_no

# Store as a compact integer column.
all_data['loading_cycle'] = all_data['loading_cycle'].astype('int8')

In [137]:
# For every listed column, compute per card the share of each value
# (normalised value counts), convert the result to a dict, and pickle it under ./prep.
transactions_by_card = all_data.groupby('交易卡號')
distribution_columns = ['時段','網路交易註記','商戶類別代碼','消費城市','收單行代碼','特店代號']
for col in tqdm(distribution_columns):
    ratio_by_card = transactions_by_card[col].value_counts(normalize=True)
    with open(f'./prep/卡號_{col}_mapping.pkl','wb') as fh:
        pickle.dump(ratio_by_card.to_dict(), fh)

100%|██████████| 6/6 [01:06<00:00, 11.01s/it]


In [138]:
# Per-card summary table: one row per card number (the groupby key becomes the index).
by_card = all_data.groupby('交易卡號')

# Number of distinct loading cycles, number of rows, and earliest cycle per card.
card_df = by_card['loading_cycle'].agg(['nunique','count','min'])
card_df.columns = ['交易週期數','交易總次數','第一次交易週期']

# NOTE(review): the int8 casts assume day numbers stay within the int8 range - confirm.
card_df['交易天數'] = by_card['授權日期'].nunique().astype('int8')
card_df['第一次交易日期'] = by_card['授權日期'].min().astype('int8')
card_df['最後一次交易日期'] = by_card['授權日期'].max().astype('int8')

# Count only rows whose label equals 1. Missing values in all_data were replaced
# by -1 earlier, so summing the raw label column would let any unlabelled row
# subtract from the fraud count.
# NOTE(review): presumably the later-period files carry unlabelled rows - confirm.
is_fraud = all_data['盜刷註記'].eq(1)
card_df['盜刷總數'] = is_fraud.groupby(all_data['交易卡號']).sum().astype('int16')

card_df['消費總額'] = by_card['轉換後交易金額'].sum().astype('int32')
card_df['零元比例'] = by_card['零元交易'].mean()
card_df['網路消費次數'] = by_card['網路交易註記'].sum().astype('int16')

# Rows with country code 0, counted per card. Cards with no such row are absent
# from the grouped result, so align to card_df's index and fill the gaps with 0.
# This is an explicit assignment, not a chained `fillna(..., inplace=True)`,
# which does not update the frame under pandas Copy-on-Write.
domestic_counts = (
    all_data[all_data['消費地國別']==0]
    .groupby('交易卡號')['轉換後交易金額']
    .count()
    .astype('int16')
)
card_df['國內消費次數'] = domestic_counts.reindex(card_df.index).fillna(0)


In [139]:
# Ratios and frequencies derived from the per-card totals.
txn_total = card_df['交易總次數']
first_day = card_df['第一次交易日期']
first_cycle = card_df['第一次交易週期']

card_df['盜刷比例'] = card_df['盜刷總數'] / txn_total
card_df['每天平均交易數'] = txn_total / card_df['交易天數']
card_df['每週期平均交易數'] = txn_total / card_df['交易週期數']

# Span from a card's first day / cycle up to the latest "first" value seen
# across all cards, inclusive.
card_df['可交易天數'] = first_day.max() - first_day + 1
card_df['可交易週期數'] = first_cycle.max() - first_cycle + 1

card_df['消費金額平均數'] = card_df['消費總額'] / txn_total
card_df['國內消費比例'] = card_df['國內消費次數'] / txn_total
card_df['消費頻率_週期'] = card_df['交易週期數'] / card_df['可交易週期數']
card_df['消費頻率_天'] = card_df['交易天數'] / card_df['可交易天數']

# Domestic behaviour label: '0' above 0.7, '1' below 0.3, otherwise '-1'.
domestic_ratio = card_df['國內消費比例']
card_df['國內消費behavior'] = '-1'
card_df.loc[domestic_ratio > 0.7, '國內消費behavior'] = '0'
card_df.loc[domestic_ratio < 0.3, '國內消費behavior'] = '1'

# Online behaviour label: '1' above 0.7, '0' below 0.3, otherwise '-1'.
online_ratio = card_df['網路消費次數'] / txn_total
card_df['網路交易behavior'] = '-1'
card_df.loc[online_ratio > 0.7, '網路交易behavior'] = '1'
card_df.loc[online_ratio < 0.3, '網路交易behavior'] = '0'


In [147]:
# Persist the per-card table; the index is written as the first CSV column.
customer_label_path = './prep/customer_label.csv'
card_df.to_csv(customer_label_path)

In [None]:
from sklearn.ensemble import IsolationForest

# Load the extracted training features and replace missing values with 0.
train = pd.read_csv('./train_data_extracted.csv').fillna(0)

# Columns left out of the model input: the label, identifiers, and the listed
# date / categorical columns.
drop_col = ['盜刷註記','交易序號',
            '顧客ID', '交易卡號',
            'num_授權日期', 'num_授權週數', 'num_授權週日',
            '特店代號','收單行代碼','時段','授權週日_時段','交易類別_交易型態','新消費者',
            ]
selected_col = [col for col in train.columns if col not in drop_col]

# Fixed random_state so the fitted forest is reproducible.
clf = IsolationForest(random_state=0).fit(train[selected_col])

# Write next to the other artefacts in ./prep. The original wrote to ../prep,
# which does not match the folder this notebook creates and uses elsewhere.
os.makedirs('./prep', exist_ok=True)
with open('./prep/isolation_tree.pkl','wb') as f:
    pickle.dump(clf,f)
# Save the column list as well, so the same columns in the same order can be
# selected when the model is loaded.
with open('./prep/columns_for_iso.pkl','wb') as f:
    pickle.dump(selected_col,f)