In [1]:
import os
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
sample_path = './data/sample_50k'

chid_file = 'sample_chid.txt'
chid_dict_file = 'sample_idx_map.npy'
cdtx_file = 'sample_zip_if_cca_cdtx0001_hist.csv'
cust_f_file = 'sample_zip_if_cca_cust_f.csv'
y_file = 'sample_zip_if_cca_y.csv'

In [3]:
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=np.str)
chid_dict = np.load(os.path.join(sample_path, chid_dict_file), allow_pickle=True).item()
df_cdtx = pd.read_csv(os.path.join(sample_path, cdtx_file)) # 交易記錄檔
df_cust_f = pd.read_csv(os.path.join(sample_path, cust_f_file)) # user feature
df_cust_f.drop_duplicates(ignore_index=True, inplace=True)
df_y = pd.read_csv(os.path.join(sample_path, y_file)) #  預測目標

print(chid_array.shape, len(chid_dict), df_cdtx.shape, df_cust_f.shape, df_y.shape)

(50000,) 50000 (6654938, 10) (1176172, 32) (2889085, 4)


In [4]:
df_cdtx = df_cdtx[df_cdtx.chid.isin(chid_array)].copy()
df_cust_f = df_cust_f[df_cust_f.chid.isin(chid_array)].copy()
df_y = df_y[df_y.chid.isin(chid_array)].copy()

In [5]:
df_cdtx.chid = df_cdtx.chid.map(chid_dict)+1
df_cust_f.chid = df_cust_f.chid.map(chid_dict)+1
df_y.chid = df_y.chid.map(chid_dict)+1

print(df_cdtx.chid.nunique(), df_cust_f.chid.nunique(), df_y.chid.nunique())
df_cust_f.groupby('chid').count().sort_values(by='data_ym').head()

50000 50000 50000


Unnamed: 0_level_0,data_ym,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
chid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15476,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49573,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
23965,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49831,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
41575,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11


In [6]:
df_cdtx['month'] = df_cdtx.csmdt.apply(lambda x: x[:-3]+'-01')
df_cdtx.head(2)

Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,month
0,N,0,8055,2018-01-01,0,5411,151,TAOYUAN,5,,2018-01-01
1,N,0,8055,2018-01-01,0,5411,146,TAOYUAN,5,,2018-01-01


In [7]:
## 填滿後12個月

list_chid = sorted(df_y.chid.unique())
list_month = sorted(df_y.data_dt.unique())[12:]

df_full_y_sum = pd.DataFrame({
    'chid': list_chid*len(list_month),
}).sort_values(by='chid', ignore_index=True)
df_full_y_sum['data_dt'] = list_month*len(list_chid)

df_full_y_sum.shape

(600000, 2)

In [8]:
## 取得整個月的 objam 
temp_cdtx = df_cdtx.groupby(['chid', 'month']).sum()
df_cdtx_objam = pd.DataFrame(list(map(list, temp_cdtx.index)), columns=['chid', 'data_dt'])
df_cdtx_objam['objam'] = temp_cdtx.objam.values

### 取得整個月的 y
#temp_y = df_y.groupby(['chid', 'data_dt']).sum()
#df_y_sum = pd.DataFrame(list(map(list, temp_y.index)), columns=['chid', 'data_dt'])
#df_y_sum['y'] = temp_y['y'].values

In [9]:
## join objam & y 

df_full_y_sum = df_full_y_sum.merge(df_cdtx_objam, 
                                    how='left', 
                                    left_on=['chid', 'data_dt'], 
                                    right_on=['chid', 'data_dt']).fillna(0)

#df_full_y_sum = df_full_y_sum.merge(df_y_sum, 
#                                    how='left', 
#                                    left_on=['chid', 'data_dt'], 
#                                    right_on=['chid', 'data_dt']).fillna(0)

df_full_y_sum.data_dt = df_full_y_sum.data_dt.astype(np.datetime64)

df_full_y_sum.shape

(600000, 3)

In [10]:
df_cdtx.csmdt = df_cdtx.csmdt.astype(np.datetime64)
df_cdtx.sort_values(by=['chid', 'csmdt'], ignore_index=True, inplace=True)
df_cdtx['timestamp_0'] = (df_cdtx.csmdt - df_cdtx.csmdt.shift()).apply(lambda x: x.days).fillna(0)
df_cdtx['timestamp_1'] = (df_cdtx.csmdt - np.datetime64('2018-01-01')).apply(lambda x: x.days).fillna(0)

In [11]:
mask_list = []
chid_pre = -1 

for i, chid in tqdm(enumerate(df_cdtx.chid.values)):
    if chid != chid_pre:
        chid_pre = chid
        mask_list.append(i)
        
df_cdtx.loc[mask_list, 'timestamp_0'] = 0

6654938it [00:01, 3391576.39it/s]


In [12]:
category_cols = ['chid', 'bnsfg', 'iterm', 'mcc', 'scity', 'tcode']
numeric_cols = ['bnspt', 'timestamp_0', 'timestamp_1', 'objam']

df_input = df_cdtx[category_cols + numeric_cols].copy()

In [13]:
df_input.loc[:, category_cols[1:]] = df_input.loc[:, category_cols[1:]].astype(np.str)

mapper = {col: {value: index+1 for index, value in enumerate(sorted(df_input[col].unique()))} 
          for col in category_cols[1:]}

df_input[category_cols[1:]] = df_input[category_cols[1:]].apply(lambda x: x.map(mapper[x.name]))

print(df_input.shape)
df_input.head(2)

(6654938, 10)


Unnamed: 0,chid,bnsfg,iterm,mcc,scity,tcode,bnspt,timestamp_0,timestamp_1,objam
0,1,1,1,281,9834,3,0,0.0,24,5528
1,1,1,1,319,9833,3,0,6.0,30,5200


In [14]:
for feat in mapper:
    print(feat, len(mapper[feat]))

bnsfg 2
iterm 17
mcc 502
scity 11242
tcode 6


In [15]:
def data_split(df_x, df_y, window_size, test_size=0.166):
    df_x = df_x.copy()
    df_y = df_y.copy()
    df_y['timestamp'] = (df_y.data_dt - np.datetime64('2018-01-01')).apply(lambda x: x.days).fillna(0)    
    
    x_train, x_test, y_train, y_test = [], [], [], []
            
    for i in tqdm(sorted(df_y.chid.unique())):
        data_x = df_x[df_x.chid == i]
        data_y = df_y[df_y.chid == i]
        
        last = data_y.shape[0] - 1
        test_num = round(data_y.shape[0]*test_size)           
        
        for j, ts in enumerate(sorted(data_y.timestamp.unique()[1:])):
            data_x_ws = data_x[data_x.timestamp_1 < ts][-window_size:].copy()
            data_x_ws.timestamp_1 = ts - data_x_ws.timestamp_1
            data_x_ws = data_x_ws.values

            if data_x_ws.shape[0] < window_size:
                tmp = np.zeros((window_size, data_x.shape[1]))
                tmp[-data_x_ws.shape[0]:] = data_x_ws
                data_x_ws = tmp

            if j < last - test_num:
                x_train.append(data_x_ws)
                y_train.append(data_y.objam.values[[j+1]])
            elif j < last:
                x_test.append(data_x_ws)
                y_test.append(data_y.objam.values[[j+1]])
            else:
                break        

    x_train, x_test, y_train, y_test = np.array(x_train), np.array(x_test), np.array(y_train), np.array(y_test)
    
    return x_train, x_test, y_train, y_test

In [16]:
x_train, x_test, y_train, y_test = data_split(df_input, df_full_y_sum, window_size=20, test_size=0.166)

100%|██████████| 50000/50000 [12:53<00:00, 64.67it/s]


In [17]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((450000, 20, 10), (100000, 20, 10), (450000, 1), (100000, 1))

In [19]:
np.save(os.path.join(sample_path, 'ETRNN', 'x_train'), x_train)
np.save(os.path.join(sample_path, 'ETRNN', 'x_test'), x_test)
np.save(os.path.join(sample_path, 'ETRNN', 'y_train'), y_train)
np.save(os.path.join(sample_path, 'ETRNN', 'y_test'), y_test)

In [20]:
np.save(os.path.join(sample_path, 'ETRNN', 'feature_map'), mapper)