In [1]:
import os
import json
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Directory that holds the sampled input files (and, later, the saved outputs).
sample_path = './data/sample_50k'

# Input file names, joined with `sample_path` when the files are read.
chid_file, chid_dict_file = 'sample_50k_chid.txt', 'sample_50k_chid_idx_map.npy'
cdtx_file, cust_f_file = 'sample_50k_cdtx.json', 'sample_50k_cust_f.json'

In [3]:
# Read the customer ids as strings.
# `np.str` was only an alias of the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so the builtin is used directly.
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=str)

# The .npy file stores a pickled dict wrapped in a 0-d object array, hence `.item()`.
# Presumably it maps a raw chid string to its integer index (it is passed to
# `Series.map` on the chid columns below) - verify against the file's producer.
# NOTE(review): allow_pickle=True can execute arbitrary code while unpickling;
# only load files from a trusted source.
chid_dict = np.load(os.path.join(sample_path, chid_dict_file), allow_pickle=True).item()

chid_array.shape, len(chid_dict)

OSError: ./data/sample_50k/sample_50k_chid.txt not found.

In [4]:
# Load the transaction file: JSON -> DataFrame.
t0 = time()
cdtx_path = os.path.join(sample_path, cdtx_file)
with open(cdtx_path) as f:
    cdtx_dict = json.load(f)

# One row per record; the column names come from the record stored under key '0',
# so every record is expected to list its fields in that same order.
cdtx_rows = np.array([list(record.values()) for record in cdtx_dict.values()])
cdtx_cols = list(cdtx_dict.get('0').keys())
df_cdtx = pd.DataFrame(data=cdtx_rows, columns=cdtx_cols)

# Cut the trailing 'T00:00:00.000Z'-sized suffix off every csmdt value.
df_cdtx['csmdt'] = df_cdtx['csmdt'].apply(lambda stamp: stamp[:-len('T00:00:00.000Z')])
df_cdtx = df_cdtx.sort_values(by=['csmdt', 'chid'], ignore_index=True)
df_cdtx['objam'] = df_cdtx['objam'].astype(np.int64)
print(time() - t0)

df_cdtx.shape, df_cdtx.chid.nunique()

94.79894924163818


((5991848, 17), 50000)

In [5]:
# Load the customer feature file: JSON -> DataFrame.
t0 = time()
cust_f_path = os.path.join(sample_path, cust_f_file)
with open(cust_f_path) as f:
    cust_f_dict = json.load(f)

# One row per record; the column names come from the record stored under key '0',
# so every record is expected to list its fields in that same order.
cust_f_rows = np.array([list(record.values()) for record in cust_f_dict.values()])
cust_f_cols = list(cust_f_dict.get('0').keys())
df_cust_f = pd.DataFrame(data=cust_f_rows, columns=cust_f_cols)

# Cut the trailing 'T00:00:00.000Z'-sized suffix off every data_dt value,
# then order the rows and remove exact duplicate rows.
df_cust_f['data_dt'] = df_cust_f['data_dt'].apply(lambda stamp: stamp[:-len('T00:00:00.000Z')])
df_cust_f = (
    df_cust_f
    .sort_values(by=['data_dt', 'chid'], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)

print(time() - t0)

df_cust_f.shape, df_cust_f.chid.nunique()

58.011794090270996


((1183871, 31), 50000)

In [6]:
# Preview the first five feature rows.
df_cust_f.head(5)

Unnamed: 0,chid,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
0,++2ooB282PrzxCER3DfKVA==,392533.0,30.0,144.0,2018-01-01,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,99bc9b817c9067f33b85455a28efaba660e6aa1231be75...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
1,++9w1UgockPycf2IG6QqeQ==,302357.0,0.0,15.0,2018-01-01,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,f0243f3eded4a032b3ad01b034d5d6a96b92d8ba6f7344...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
2,++GnbfoVuQ14bYMTfFu5Iw==,259052.0,0.0,184.0,2018-01-01,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,1ed13fb8d916b8ba4aca01f3ab6eca7a819b92e912c7a8...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,99bc9b817c9067f33b85455a28efaba660e6aa1231be75...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,++JbCrg7w3D/R9imzfxilg==,1747384.0,0.0,153.0,2018-01-01,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,f0243f3eded4a032b3ad01b034d5d6a96b92d8ba6f7344...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,++Vm1gVqBD+aJKBg71hRWA==,402915.0,0.0,51.0,2018-01-01,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,9ea8636e8661c94a50a8f80087137df3df35659c2aca17...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,53f490495a0ec0ee6aaf9c03252e5989b3624d312c7b72...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Replace each chid by the value chid_dict maps it to, and add a `month` column.
# NOTE: this overwrites df_cdtx.chid, so the cell is meant to run once per kernel.
df_cdtx['chid'] = df_cdtx['chid'].map(chid_dict)
# month = csmdt without its last three characters, followed by '-01'.
df_cdtx['month'] = df_cdtx['csmdt'].apply(lambda day: day[:-3]+'-01')
print(df_cdtx.chid.nunique())

df_cdtx.head(2)

50000


Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,ecfg,etymd,stonc_tag,stonc_label,stonm_label,stonc_6_label,stonc_10_label,month
0,N,0,33,2018-01-01,0,5411,19945,KAOHSIUNG,5,,N,7,R2,650646,3564350.0,53003,296048,2018-01-01
1,N,0,33,2018-01-01,0,5811,1302,KAOHSIUNG,5,,N,5,F3,963675,3725180.0,119238,529490,2018-01-01


In [8]:
# Replace each chid in the feature table by the value chid_dict maps it to.
# (No month column is added here; the table already carries data_dt.)
# NOTE: this overwrites df_cust_f.chid, so the cell is meant to run once per kernel.
df_cust_f['chid'] = df_cust_f['chid'].map(chid_dict)
print(df_cust_f.chid.nunique())

df_cust_f.head(2)

50000


Unnamed: 0,chid,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
0,0,392533.0,30.0,144.0,2018-01-01,5a7095e93ba2ee05247bba34dd4509a51e2798d33a1c7c...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,99bc9b817c9067f33b85455a28efaba660e6aa1231be75...,9484866f7342a8442f275ab5cf94c4be10c4474bb036ca...,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
1,1,302357.0,0.0,15.0,2018-01-01,ea9d465d7361343a138603660b263e4f9fdb7bc04c5843...,a2822b0ed756338d1ff31695467786db874a02189bdbcf...,7fcfcd907e0d490a37d0e7df45db65bc6bf009c3d90f66...,f0243f3eded4a032b3ad01b034d5d6a96b92d8ba6f7344...,78c2e56448aad92ca71e8501f14fdc6f4a177ee7773dc4...,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0


In [9]:
## Monthly objam statistics per (chid, month)
monthly_keys = ['chid', 'month']
cdtx_group = df_cdtx[monthly_keys + ['objam']].groupby(monthly_keys)

cdtx_sum = cdtx_group.sum()      # total amount
cdtx_mean = cdtx_group.mean()    # average amount
cdtx_count = cdtx_group.count()  # number of transactions

# The group keys become the first two columns; `month` is exposed as `data_dt`.
df_cdtx_objam = pd.DataFrame([list(key) for key in cdtx_sum.index],
                             columns=['chid', 'data_dt'])
df_cdtx_objam['objam_sum'] = cdtx_sum['objam'].values
df_cdtx_objam['objam_mean'] = cdtx_mean['objam'].values
df_cdtx_objam['trans_count'] = cdtx_count['objam'].values  # transaction count

# Count the distinct stonc_6_label values per (chid, month): de-duplicate first,
# then count the remaining rows of each group.
cdtx_group = (
    df_cdtx[monthly_keys + ['stonc_6_label']]
    .drop_duplicates()
    .groupby(monthly_keys)
)
cdtx_count = cdtx_group.count()
df_cdtx_objam['shop_count'] = cdtx_count['stonc_6_label'].values

df_cdtx_objam.shape

(875908, 6)

In [10]:
## Encode MCC values as consecutive integers (0, 1, ...) in sorted order.
# NOTE: this overwrites df_cdtx.mcc, so the cell is meant to run once per kernel.
mcc_values = sorted(df_cdtx.mcc.unique())
mcc_mapper = dict(zip(mcc_values, range(len(mcc_values))))
df_cdtx['mcc'] = df_cdtx['mcc'].map(mcc_mapper)

In [11]:
## Monthly objam total per customer and MCC

list_month = sorted(df_cdtx.month.unique())
n_user = df_cdtx.chid.nunique()
n_mcc = df_cdtx.mcc.nunique()

# Sum only `objam`. A bare `groupby(...).sum()` sums every column: older pandas
# silently dropped the non-numeric ones, but pandas 2.x (numeric_only=False)
# keeps/raises on them, which breaks the scalar `objam > 0` logic.
# Grouping once over (month, chid, mcc) also avoids re-filtering the whole
# transaction table for every month.
mcc_objam_sum = df_cdtx.groupby(['month', 'chid', 'mcc'])['objam'].sum()

# Output layout: one block of n_user rows per month, in list_month order;
# inside a block the row is the chid and the column is the mcc index.
# Assumes chid values are the integers 0..n_user-1 - TODO confirm against chid_dict.
# Preallocating avoids holding a list of per-month arrays plus their concatenation.
cust_shop_objam = np.zeros([len(list_month) * n_user, n_mcc])

for i, month in enumerate(tqdm(list_month)):
    month_sum = mcc_objam_sum.loc[month]
    rows = month_sum.index.get_level_values('chid').to_numpy()
    cols = month_sum.index.get_level_values('mcc').to_numpy()

    # `x` is a view on this month's block, so the assignment fills cust_shop_objam.
    x = cust_shop_objam[i * n_user:(i + 1) * n_user]
    # Non-positive monthly totals are stored as 0.
    x[rows, cols] = np.clip(month_sum.to_numpy(), 0, None)

100%|██████████| 25/25 [00:27<00:00,  1.09s/it]


In [12]:
# Key columns matching the row layout of cust_shop_objam: for each month in
# list_month order, one row per chid in ascending order.
df_mcc_sum = pd.DataFrame({
    'chid': np.tile(sorted(df_cdtx.chid.unique()), len(list_month)),
    'data_dt': np.repeat(list_month, n_user),
})

In [13]:
mcc_cols = ['mcc_' + str(i) for i in range(n_mcc)]

# Attach all MCC columns in one concat. Assigning a 2-D array to a list of new
# labels inserts the columns one by one, which fragments the frame (pandas
# emits a PerformanceWarning once many columns have been inserted that way).
# Rebuilding from the two key columns also keeps this cell safe to re-run.
df_mcc_sum = pd.concat(
    [df_mcc_sum[['chid', 'data_dt']],
     pd.DataFrame(cust_shop_objam, columns=mcc_cols, index=df_mcc_sum.index)],
    axis=1,
)

df_mcc_sum.shape

(1250000, 509)

In [14]:
list_chid = sorted(df_cdtx.chid.unique())
list_month = sorted(df_cdtx.month.unique())

# Full chid x month grid, ordered by chid and then by month:
# every chid is repeated once per month, and the month list cycles under it.
df_full_y_sum = pd.DataFrame({
    'chid': np.repeat(list_chid, len(list_month)),
})
df_full_y_sum['data_dt'] = list_month*len(list_chid)

df_full_y_sum.shape

(1250000, 2)

In [15]:
## Attach the monthly objam statistics.
# Left join keeps every grid row; rows without a match get 0 in the new columns.
df_full_y_sum = pd.merge(
    df_full_y_sum,
    df_cdtx_objam,
    how='left',
    on=['chid', 'data_dt'],
)
df_full_y_sum = df_full_y_sum.fillna(0)

df_full_y_sum.shape

(1250000, 6)

In [16]:
## Attach the customer features.
# Inner join: only (chid, data_dt) pairs present in df_cust_f survive;
# any missing value left in the result is replaced by 0.
df_full_y_sum = pd.merge(
    df_full_y_sum,
    df_cust_f,
    how='inner',
    on=['chid', 'data_dt'],
)
df_full_y_sum = df_full_y_sum.fillna(0)
df_full_y_sum.shape

(1183871, 35)

In [17]:
## Attach the per-MCC monthly objam columns.
# Inner join on (chid, data_dt); any missing value left in the result becomes 0.
df_full_y_sum = pd.merge(
    df_full_y_sum,
    df_mcc_sum,
    how='inner',
    on=['chid', 'data_dt'],
)
df_full_y_sum = df_full_y_sum.fillna(0)
df_full_y_sum.shape

(1183871, 542)

In [18]:
category_cols = ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Everything that is neither categorical nor a date column counts as numeric;
# the original column order of df_full_y_sum is kept.
excluded_cols = set(category_cols) | {'data_ym', 'data_dt'}
numeric_cols = [col for col in df_full_y_sum.columns if col not in excluded_cols]

len(category_cols), len(numeric_cols)

(7, 534)

In [19]:
# Model input: categorical columns first, then numeric ones, data_dt last.
df_input = df_full_y_sum[category_cols + numeric_cols + ['data_dt']].copy()

# chid (category_cols[0]) is already an integer index; the remaining
# categorical columns are cast to strings so their values sort consistently.
# The builtin `str` replaces `np.str`, an alias deprecated in NumPy 1.20 and
# removed in NumPy 1.24. Plain column replacement is used instead of
# `.loc[:, cols] = ...`, which tries to write into the existing columns in place.
encoded_cols = category_cols[1:]
df_input[encoded_cols] = df_input[encoded_cols].astype(str)

# Per column: value -> position of that value among the column's sorted uniques.
mapper = {col: {value: index for index, value in enumerate(sorted(df_input[col].unique()))}
          for col in encoded_cols}

df_input[encoded_cols] = df_input[encoded_cols].apply(lambda x: x.map(mapper[x.name]))

print(df_input.shape)
df_input.head(2)

(1183871, 542)


Unnamed: 0,chid,masts,educd,naty,trdtp,poscd,cuorg,objam_sum,objam_mean,trans_count,...,mcc_498,mcc_499,mcc_500,mcc_501,mcc_502,mcc_503,mcc_504,mcc_505,mcc_506,data_dt
0,0,0,4,1,17,5,6,21241.0,21241.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01
1,0,0,4,1,17,5,6,639.0,639.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-02-01


In [20]:
# Target table: the two key columns plus the four monthly spending statistics.
y_cols = ['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count']
df_y = df_full_y_sum.loc[:, y_cols].copy()
df_y = df_y.reset_index(drop=True)

print(df_y.shape)
df_y.tail()

(1183871, 6)


Unnamed: 0,chid,data_dt,objam_sum,objam_mean,trans_count,shop_count
1183866,49999,2019-08-01,28526.0,4075.142857,7.0,5.0
1183867,49999,2019-09-01,17955.0,2992.5,6.0,4.0
1183868,49999,2019-10-01,33220.0,4745.714286,7.0,6.0
1183869,49999,2019-11-01,21384.0,3054.857143,7.0,5.0
1183870,49999,2019-12-01,12977.0,2595.4,5.0,4.0


In [21]:
# Both tables should report the same number of distinct customers.
df_input['chid'].nunique(), df_y['chid'].nunique()

(50000, 50000)

In [22]:
def _first_row_by_date(dates):
    """Map each distinct date to the position of its first occurrence in `dates`."""
    first_row = {}
    for pos, dt in enumerate(dates):
        first_row.setdefault(dt, pos)
    return first_row


def data_split(df_x, df_y, test_size=2):
    """Build next-period (features, target) pairs per customer and split them by time.

    For every customer (`chid`) the distinct `data_dt` values of `df_y` are
    sorted, and each consecutive pair (dt_x, dt_y) yields one sample:
    the customer's `df_x` row at dt_x (without its last column) as features and
    the customer's `df_y` row at dt_y (all columns) as target.  The last
    `test_size` pairs of each customer go to the test set, the earlier ones to
    the training set.  Customers are processed in ascending `chid` order.

    Both rows are looked up by date (first matching row wins).  Picking the
    target by row position instead would silently pair features with the wrong
    month whenever `df_y` is not sorted by date or repeats a date.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Needs `chid` and `data_dt` columns.  The LAST column is dropped from
        every feature row, so callers are expected to put `data_dt` last.
    df_y : pandas.DataFrame
        Needs `chid` and `data_dt` columns.
    test_size : int, default 2
        Number of trailing pairs per customer reserved for the test set.

    Returns
    -------
    x_train, x_test, y_train, y_test : numpy.ndarray
        Stacked feature / target rows (an empty 1-d array when a split is empty).

    Raises
    ------
    IndexError
        If `df_x` has no row for a (chid, data_dt) that is needed as features.
    """
    x_train, x_test, y_train, y_test = [], [], [], []

    # Row positions of every customer in df_x, computed in a single pass.
    # This replaces a boolean-mask scan of the whole frame per customer.
    x_positions = df_x.groupby('chid', sort=False).indices

    for chid, data_y in tqdm(df_y.groupby('chid', sort=True)):
        y_row_of = _first_row_by_date(data_y['data_dt'].values)
        dt_list = sorted(y_row_of)
        n_pairs = len(dt_list) - 1
        if n_pairs < 1:
            continue  # a single period gives no (current, next) pair

        rows = x_positions.get(chid)
        if rows is None:
            raise IndexError(f"df_x has no rows for chid={chid!r}")
        # Sort the positions so "first matching row" follows df_x's row order.
        data_x = df_x.iloc[np.sort(rows)]

        x_values = data_x.values
        y_values = data_y.values
        x_row_of = _first_row_by_date(data_x['data_dt'].values)

        first_test = n_pairs - test_size  # pairs with j >= first_test are test pairs
        for j, (dt_x, dt_y) in enumerate(zip(dt_list[:-1], dt_list[1:])):
            if dt_x not in x_row_of:
                raise IndexError(
                    f"df_x has no row for chid={chid!r}, data_dt={dt_x!r}")

            features = x_values[x_row_of[dt_x], :-1]  # drop the trailing column
            target = y_values[y_row_of[dt_y], :]

            if j < first_test:
                x_train.append(features)
                y_train.append(target)
            else:
                x_test.append(features)
                y_test.append(target)

    x_train, x_test = np.array(x_train), np.array(x_test)
    y_train, y_test = np.array(y_train), np.array(y_test)

    return x_train, x_test, y_train, y_test

In [23]:
# Full data set.
# Input months: train -> 2018[1, 2, ..., 12] + 2019[1, 2, ..., 9], test -> 2019[10, 11]
x_train, x_test, y_train, y_test = data_split(
    df_x=df_input,
    df_y=df_y,
    test_size=2,
)

100%|██████████| 50000/50000 [13:31<00:00, 61.63it/s]


In [24]:
# Shapes of the four splits, in the order x_train, x_test, y_train, y_test.
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((1033871, 541), (100000, 541), (1033871, 6), (100000, 6))

In [25]:
# Save the four splits as <sample_path>/Normal/<name>.npy.
# np.save does not create missing directories, so make sure the target exists;
# exist_ok=True keeps the cell re-runnable.
output_dir = os.path.join(sample_path, 'Normal')
os.makedirs(output_dir, exist_ok=True)

np.save(os.path.join(output_dir, 'x_train'), x_train)
np.save(os.path.join(output_dir, 'x_test'), x_test)
np.save(os.path.join(output_dir, 'y_train'), y_train)
np.save(os.path.join(output_dir, 'y_test'), y_test)

In [26]:
# Store the per-column value -> index dictionaries used to encode the categorical columns.
feature_map_path = os.path.join(sample_path, 'Normal', 'feature_map')
np.save(feature_map_path, mapper)

In [27]:
# Record the column names that belong to the saved arrays.
# NOTE(review): x_columns lists every df_input column, including the trailing
# 'data_dt', while data_split drops the last column of each x row - so the x
# arrays have one column fewer than this list. Confirm how consumers use it.
x_columns = list(df_input.columns)
columns = {'x_columns': x_columns, 'y_columns': y_cols}

columns_path = os.path.join(sample_path, 'Normal', 'columns')
np.save(columns_path, columns)
print(columns)

{'x_columns': ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count', 'monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'mcc_0', 'mcc_1', 'mcc_2', 'mcc_3', 'mcc_4', 'mcc_5', 'mcc_6', 'mcc_7', 'mcc_8', 'mcc_9', 'mcc_10', 'mcc_11', 'mcc_12', 'mcc_13', 'mcc_14', 'mcc_15', 'mcc_16', 'mcc_17', 'mcc_18', 'mcc_19', 'mcc_20', 'mcc_21', 'mcc_22', 'mcc_23', 'mcc_24', 'mcc_25', 'mcc_26', 'mcc_27', 'mcc_28', 'mcc_29', 'mcc_30', 'mcc_31', 'mcc_32', 'mcc_33', 'mcc_34', 'mcc_35', 'mcc_36', 'mcc_37', 'mcc_38', 'mcc_39', 'mcc_40', 'mcc_41', 'mcc_42', 'mcc_43', 'mcc_44', 'mcc_45', 'mcc_46', 'mcc_47', 'mcc_48', 'mcc_49', 'mcc_50', 'mcc_51', 'mcc_52', 'mcc