In [29]:
import os
import json
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
sample_path = './data/sample_50k'

chid_file = 'sample_50k_chid.txt'
cdtx_file = 'sample_50k_cdtx.json'
cust_f_file = 'sample_50k_cust_f.json'
chid_dict_file = 'sample_50k_chid_idx_map.npy'

In [3]:
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=np.str)
chid_dict = np.load(os.path.join(sample_path, chid_dict_file), allow_pickle=True).item()

chid_array.shape, len(chid_dict)

((50000,), 50000)

In [4]:
# load 消費檔, json -> dataframe
t0 = time()
with open(os.path.join(sample_path, cdtx_file)) as f:
    cdtx_dict = json.load(f)
    
cdtx_rows = np.array(list(map(lambda x:list(x.values()), cdtx_dict.values())))
cdtx_cols = list(cdtx_dict.get('0').keys())    
df_cdtx = pd.DataFrame(data=cdtx_rows, columns=cdtx_cols)

df_cdtx.csmdt = df_cdtx.csmdt.apply(lambda x: x[:-len('T00:00:00.000Z')])
df_cdtx.sort_values(by=['csmdt', 'chid'], inplace=True, ignore_index=True)
df_cdtx.objam = df_cdtx.objam.astype(np.int64)
print(time() - t0)

df_cdtx.shape, df_cdtx.chid.nunique()

93.65322995185852


((5991848, 17), 50000)

In [5]:
# load 特徵檔, json -> dataframe
t0 = time()
with open(os.path.join(sample_path, cust_f_file)) as f:
    cust_f_dict = json.load(f)
    
cust_f_rows = np.array(list(map(lambda x:list(x.values()), cust_f_dict.values())))
cust_f_cols = list(cust_f_dict.get('0').keys())    
df_cust_f = pd.DataFrame(data=cust_f_rows, columns=cust_f_cols)

df_cust_f.data_dt = df_cust_f.data_dt.apply(lambda x: x[:-len('T00:00:00.000Z')])
df_cust_f.sort_values(by=['data_dt', 'chid'], inplace=True, ignore_index=True)
df_cust_f.drop_duplicates(ignore_index=True, inplace=True)

print(time() - t0)

df_cust_f.shape, df_cust_f.chid.nunique()

63.684427976608276


((1183871, 31), 50000)

In [6]:
# add month column, chid convert to index
df_cdtx.chid = df_cdtx.chid.map(chid_dict)+1
df_cdtx['month'] = df_cdtx.csmdt.apply(lambda x: x[:-3]+'-01')
print(df_cdtx.chid.nunique())

df_cdtx.head(2)

50000


Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,ecfg,etymd,stonc_tag,stonc_label,stonm_label,stonc_6_label,stonc_10_label,month
0,N,0,34,2018-01-01,0,5411,19945,KAOHSIUNG,5,,N,7,R2,650646,3564350.0,53003,296048,2018-01-01
1,N,0,34,2018-01-01,0,5811,1302,KAOHSIUNG,5,,N,5,F3,963675,3725180.0,119238,529490,2018-01-01


In [7]:
# add month column, chid convert to index
df_cust_f.chid = df_cust_f.chid.map(chid_dict)+1
print(df_cust_f.chid.nunique())

df_cust_f.head(2)

50000


Unnamed: 0,chid,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
0,1,392533.0,30.0,144.0,2018-01-01,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,99bc9b817c9067f33b85455a28efaba660e6aa1231be75...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
1,2,302357.0,0.0,15.0,2018-01-01,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,f0243f3eded4a032b3ad01b034d5d6a96b92d8ba6f7344...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0


In [8]:
## 取得整個月的 objam 
cdtx_group = df_cdtx[['chid', 'month', 'objam']].groupby(['chid', 'month'])

cdtx_sum = cdtx_group.sum() # 總金額
cdtx_mean = cdtx_group.mean() # 平均金額
cdtx_count = cdtx_group.count() # 消費次數

df_cdtx_objam = pd.DataFrame(list(map(list, cdtx_sum.index)), columns=['chid', 'data_dt'])
df_cdtx_objam['objam_sum'] = cdtx_sum.values[:, 0]
df_cdtx_objam['objam_mean'] = cdtx_mean.values[:, 0]
df_cdtx_objam['trans_count'] = cdtx_count.values[:, 0] # 交易次數

cdtx_group = df_cdtx[['chid', 'month', 'stonc_6_label']].drop_duplicates().groupby(['chid', 'month'])
cdtx_count = cdtx_group.count()
df_cdtx_objam['shop_count'] = cdtx_count.values[:, 0] # 一個月內消費店家種類個數

df_cdtx_objam.shape

(875908, 6)

In [9]:
list_chid = sorted(df_cdtx.chid.unique())
list_month = sorted(df_cdtx.month.unique())[:]

df_full_y_sum = pd.DataFrame({
    'chid': list_chid*len(list_month),
}).sort_values(by='chid', ignore_index=True)
df_full_y_sum['data_dt'] = list_month*len(list_chid)

df_full_y_sum.shape

(1250000, 2)

In [10]:
## join objam
df_full_y_sum = df_full_y_sum.merge(df_cdtx_objam, 
                                    how='left', 
                                    left_on=['chid', 'data_dt'], 
                                    right_on=['chid', 'data_dt']).fillna(0)

df_full_y_sum.shape

(1250000, 6)

In [11]:
## join feature
df_full_y_sum = df_full_y_sum.merge(df_cust_f, 
                                    how='inner', 
                                    left_on=['chid', 'data_dt'], 
                                    right_on=['chid', 'data_dt']).fillna(0)
df_full_y_sum.shape

(1183871, 35)

In [12]:
# 本月 前1、2月 平均金額 
df_full_y_sum.insert(6, 'objam_mean_M3', 0)
for chid in tqdm(sorted(df_full_y_sum.chid.unique())):
    mask = df_full_y_sum.chid == chid
    
    temp = (df_full_y_sum.loc[mask, 'objam_sum'] + 
            df_full_y_sum.loc[mask, 'objam_sum'].shift(1).fillna(0) + 
            df_full_y_sum.loc[mask, 'objam_sum'].shift(2).fillna(0)) // 3   
    
    df_full_y_sum.loc[mask, 'objam_mean_M3'] = temp

100%|██████████| 50000/50000 [06:40<00:00, 124.94it/s]


In [None]:
df_full_y_sum.data_dt = df_full_y_sum.data_dt.astype(np.datetime64)

df_cdtx.csmdt = df_cdtx.csmdt.astype(np.datetime64)
df_cdtx.sort_values(by=['chid', 'csmdt'], ignore_index=True, inplace=True)
df_cdtx['timestamp_0'] = (df_cdtx.csmdt - df_cdtx.csmdt.shift()).apply(lambda x: x.days).fillna(0) # 距離上次消費的天數
df_cdtx['timestamp_1'] = (df_cdtx.csmdt - np.datetime64('2018-01-01')).apply(lambda x: x.days).fillna(0) # 距離2018-01-01的天數

In [None]:
# drop error row
mask_list = []
chid_pre = -1 

for i, chid in tqdm(enumerate(df_cdtx.chid.values)):
    if chid != chid_pre:
        chid_pre = chid
        mask_list.append(i)
        
df_cdtx.loc[mask_list, 'timestamp_0'] = 0

In [None]:
# time series columns
category_cols = ['chid', 'bnsfg', 'iterm', 'mcc', 'scity', 'stonc_tag', 'stonc_label', 'stonm_label', 
                 'stonc_6_label', 'stonc_10_label']
numeric_cols = ['bnspt', 'timestamp_0', 'timestamp_1', 'objam']

df_input = df_cdtx[category_cols + numeric_cols].copy()

In [16]:
# time series.category_cols convert to index
df_input.loc[:, category_cols[1:]] = df_input.loc[:, category_cols[1:]].astype(np.str)

mapper = {col: {value: index+1 for index, value in enumerate(sorted(df_input[col].unique()))} 
          for col in category_cols[1:]}

df_input[category_cols[1:]] = df_input[category_cols[1:]].apply(lambda x: x.map(mapper[x.name]))

print(df_input.shape)
df_input.head(2)

(5991848, 14)


Unnamed: 0,chid,bnsfg,iterm,mcc,scity,stonc_tag,stonc_label,stonm_label,stonc_6_label,stonc_10_label,bnspt,timestamp_0,timestamp_1,objam
0,1,1,1,276,10784,38,84934,109698,58742,25124,0,0.0,19,21241
1,1,1,1,276,10766,38,115998,115922,61758,29054,0,33.0,52,639


In [17]:
for feat in mapper:
    print(feat, len(mapper[feat]))

bnsfg 2
iterm 15
mcc 507
scity 11074
stonc_tag 49
stonc_label 202387
stonm_label 212342
stonc_6_label 78560
stonc_10_label 128075


In [18]:
# user feature columns / each month
feat_category_cols = ['chid', 'masts', 'educd', 'trdtp', 'poscd']
feat_numeric_cols = ['slam', 'first_mob', 'constant_change', 'sum_l2_ind', 'sum_u2_ind', 'constant_l2_ind', 'constant_u4_ind', 
                     'growth_rate', 'monotone_down', 'monotone_up']

df_feat_input = df_cust_f[feat_category_cols + feat_numeric_cols + ['data_dt']].copy()
df_feat_input.data_dt = df_feat_input.data_dt.astype(np.datetime64)

print(df_feat_input.shape)
df_feat_input.tail()

(1183871, 16)


Unnamed: 0,chid,masts,educd,trdtp,poscd,slam,first_mob,constant_change,sum_l2_ind,sum_u2_ind,constant_l2_ind,constant_u4_ind,growth_rate,monotone_down,monotone_up,data_dt
1183866,49996,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,4e55935aee7e67e2892b3a22b4d5a21ab77ed239947e58...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,962000.0,36.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,3.0,2019-12-01
1183867,49997,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,27cda3aac16cb3ddbad97b0b4e698a602a8a2176331227...,5c5e9c182ca75b1747c92227c6b1e8d2eeb1f4d0e205ae...,592000.0,77.0,0.0,2.0,0.0,0.0,0.0,1.0,6.0,0.0,2019-12-01
1183868,49998,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,4e55935aee7e67e2892b3a22b4d5a21ab77ed239947e58...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,814000.0,225.0,0.0,0.0,4.0,0.0,0.0,1.0,2.0,0.0,2019-12-01
1183869,49999,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,27cda3aac16cb3ddbad97b0b4e698a602a8a2176331227...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,1154400.0,54.0,0.0,6.0,0.0,6.0,0.0,1.0,1.0,0.0,2019-12-01
1183870,50000,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,f0243f3eded4a032b3ad01b034d5d6a96b92d8ba6f7344...,f17309a7a79032026d499f38ac354432e400e1e072b8a8...,1250600.0,60.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2019-12-01


In [19]:
# user feature.category_cols convert to index
df_feat_input.loc[:, feat_category_cols[1:]] = df_feat_input.loc[:, feat_category_cols[1:]].astype(np.str)

feat_mapper = {col: {value: index+1 for index, value in enumerate(sorted(df_feat_input[col].unique()))} 
               for col in feat_category_cols[1:]}

df_feat_input[feat_category_cols[1:]] = df_feat_input[feat_category_cols[1:]].apply(lambda x: x.map(feat_mapper[x.name]))

print(df_feat_input.shape)
df_feat_input.head(2)

(1183871, 16)


Unnamed: 0,chid,masts,educd,trdtp,poscd,slam,first_mob,constant_change,sum_l2_ind,sum_u2_ind,constant_l2_ind,constant_u4_ind,growth_rate,monotone_down,monotone_up,data_dt
0,1,1,5,18,6,577200.0,144.0,0.0,1.0,0.0,0.0,3.0,1.0,2.0,0.0,2018-01-01
1,2,3,5,25,5,288600.0,15.0,0.0,0.0,2.0,0.0,4.0,1.0,2.0,0.0,2018-01-01


In [20]:
for feat in feat_mapper:
    print(feat, len(feat_mapper[feat]))

masts 3
educd 6
trdtp 27
poscd 9


In [21]:
y_cols = ['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count', 'objam_mean_M3']
df_y = df_full_y_sum[y_cols].copy().reset_index(drop=True)

print(df_y.shape)
df_y.tail()

(1183871, 7)


Unnamed: 0,chid,data_dt,objam_sum,objam_mean,trans_count,shop_count,objam_mean_M3
1183866,50000,2019-08-01,28526.0,4075.142857,7.0,5.0,27699.0
1183867,50000,2019-09-01,17955.0,2992.5,6.0,4.0,20412.0
1183868,50000,2019-10-01,33220.0,4745.714286,7.0,6.0,26567.0
1183869,50000,2019-11-01,21384.0,3054.857143,7.0,5.0,24186.0
1183870,50000,2019-12-01,12977.0,2595.4,5.0,4.0,22527.0


In [22]:
def data_split(df_x, df_f, df_y, window_size, test_size=2):
    df_x = df_x.copy()
    df_f = df_f.copy()
    df_y = df_y.copy()
    
    df_f['timestamp'] = (df_f.data_dt - np.datetime64('2018-01-01')).apply(lambda x: x.days).fillna(0)
    df_y['timestamp'] = (df_y.data_dt - np.datetime64('2018-01-01')).apply(lambda x: x.days).fillna(0)
    
    x_train, x_test, f_train, f_test, y_train, y_test = [], [], [], [], [], []
            
    for i in tqdm(sorted(df_y.chid.unique())):
        data_x = df_x[df_x.chid == i].reset_index(drop=True)
        data_f = df_f[df_f.chid == i].reset_index(drop=True)
        data_y = df_y[df_y.chid == i].reset_index(drop=True)
        
        last = data_y.shape[0] - 1
        ts_list = sorted(data_y.timestamp.unique())
        
        for j, (ts_f, ts_y) in enumerate(zip(ts_list[:-1], ts_list[1:])):
            data_x_ws = data_x[data_x.timestamp_1 < ts_y][-window_size:].copy()
            data_x_ws.timestamp_1 = ts_y - data_x_ws.timestamp_1
            data_x_ws = data_x_ws.values

            if data_x_ws.shape[0] < window_size:
                tmp = np.zeros((window_size, data_x.shape[1]))
                if data_x_ws.shape[0] > 0:
                    tmp[-data_x_ws.shape[0]:] = data_x_ws
                data_x_ws = tmp

            if j < last - test_size:
                x_train.append(data_x_ws)
                f_train.append(data_f[data_f.timestamp == ts_f].values[0, :-1])
                y_train.append(data_y.values[j+1, :-1])
            elif j < last:
                x_test.append(data_x_ws)
                f_test.append(data_f[data_f.timestamp == ts_f].values[0, :-1])
                y_test.append(data_y.values[j+1, :-1])
            else:
                break        

    x_train, x_test = np.array(x_train), np.array(x_test)
    f_train, f_test = np.array(f_train), np.array(f_test)
    y_train, y_test = np.array(y_train), np.array(y_test)
    
    return x_train, x_test, f_train, f_test, y_train, y_test

In [23]:
# full_data，全資料

# input month: train -> 2018[1, 2, ..., 12]+2019[1, 2, ..., 9], test -> 2019[10, 11]
x_train, x_test, f_train, f_test, y_train, y_test = data_split(df_input, df_feat_input, df_y, 
                                                               window_size=120, test_size=2)

100%|██████████| 50000/50000 [43:45<00:00, 19.04it/s]  


In [24]:
x_train.shape, x_test.shape, f_train.shape, f_test.shape, y_train.shape, y_test.shape

((1033871, 120, 14),
 (100000, 120, 14),
 (1033871, 16),
 (100000, 16),
 (1033871, 7),
 (100000, 7))

In [25]:
y_columns = list(df_y)
y_columns[-1] = 'objam_mean_M3_diff'

y_train[:, -1] = y_train[:, 2] - y_train[:, -1]
y_test[:, -1] = y_test[:, 2] - y_test[:, -1]

print(y_columns)

['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count', 'objam_mean_M3_diff']


In [26]:
np.save(os.path.join(sample_path, 'RNN', 'x_train'), x_train)
np.save(os.path.join(sample_path, 'RNN', 'x_test'), x_test)
np.save(os.path.join(sample_path, 'RNN', 'f_train'), f_train)
np.save(os.path.join(sample_path, 'RNN', 'f_test'), f_test)
np.save(os.path.join(sample_path, 'RNN', 'y_train'), y_train)
np.save(os.path.join(sample_path, 'RNN', 'y_test'), y_test)

In [27]:
np.save(os.path.join(sample_path, 'RNN', 'feature_map'), mapper)
np.save(os.path.join(sample_path, 'RNN', 'cust_feature_map'), feat_mapper)

In [28]:
columns = {
    'x_columns': list(df_input), 
    'f_columns': list(df_feat_input), 
    'y_columns': y_columns, 
}
np.save(os.path.join(sample_path, 'RNN', 'columns'), columns)
print(columns)

{'x_columns': ['chid', 'bnsfg', 'iterm', 'mcc', 'scity', 'stonc_tag', 'stonc_label', 'stonm_label', 'stonc_6_label', 'stonc_10_label', 'bnspt', 'timestamp_0', 'timestamp_1', 'objam'], 'f_columns': ['chid', 'masts', 'educd', 'trdtp', 'poscd', 'slam', 'first_mob', 'constant_change', 'sum_l2_ind', 'sum_u2_ind', 'constant_l2_ind', 'constant_u4_ind', 'growth_rate', 'monotone_down', 'monotone_up', 'data_dt'], 'y_columns': ['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count', 'objam_mean_M3_diff']}
