# Question: 
- chid_dict_file 和 chid_array 沒有對應起來 

In [1]:
import os
import json
import numpy as np
import pandas as pd
from time import time
from tqdm import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Root folder that holds the sample input files.
sample_path = '../data'
# NOTE(review): `specific_path` is not referenced anywhere else in the visible code.
specific_path = './data/sample_50k'

chid_file = os.path.join(sample_path, 'sample_chid.txt') # originally: 'sample_50k_chid.txt'
cdtx_file = os.path.join(sample_path, 'sample_zip_if_cca_cdtx0001_hist.csv') # transaction history (CSV)
cust_f_file = os.path.join(sample_path, 'sample_zip_if_cca_cust_f.csv') # originally: 'sample_50k_cust_f.json'
# NOTE(review): the original note says this file lives in ./data/sample_50k, yet the
# path is built from `sample_path` - confirm which folder holds the mapping that
# actually matches `chid_file`.
chid_dict_file = os.path.join(sample_path, 'sample_idx_map.npy')

# load 使用者 id 轉換檔

In [3]:
# Load the customer-id list and the {customer id -> row index} mapping.
# `np.str` was only an alias of the builtin `str`; it was deprecated in NumPy 1.20
# and removed in NumPy 1.24, so the builtin is used directly.
chid_array = np.loadtxt(chid_file, dtype=str)
# NOTE: allow_pickle=True unpickles the stored dict - only load trusted files.
chid_dict = np.load(chid_dict_file, allow_pickle=True).item()

# Sanity check: the mapping must send the i-th id of the list to index i.
for i, chid in enumerate(chid_array):
    assert chid_dict[chid] == i

chid_array.shape, len(chid_dict)

((50000,), 50000)

# load 特徵檔, json -> dataframe

In [None]:
'''
# load 特徵檔, json -> dataframe
t0 = time()
with open(cust_f_file) as f:
    cust_f_dict = json.load(f)
    
cust_f_rows = np.array(list(map(lambda x:list(x.values()), cust_f_dict.values())))
cust_f_cols = list(cust_f_dict.get('0').keys())    
df_cust_f = pd.DataFrame(data=cust_f_rows, columns=cust_f_cols)

df_cust_f.data_dt = df_cust_f.data_dt.apply(lambda x: x[:-len('T00:00:00.000Z')])
df_cust_f.sort_values(by=['data_dt', 'chid'], inplace=True, ignore_index=True)
df_cust_f.drop_duplicates(ignore_index=True, inplace=True)

print(time() - t0)

df_cust_f.shape, df_cust_f.chid.nunique()
'''

# Read the customer-feature CSV, order the rows by (data_dt, chid) and drop
# exact duplicate rows; the elapsed time is printed.
t0 = time()
df_cust_f = (
    pd.read_csv(cust_f_file, skipinitialspace=True)
    .sort_values(by=['data_dt', 'chid'], ignore_index=True)
    .drop_duplicates(ignore_index=True)
)
print(time() - t0)

print(df_cust_f.shape, df_cust_f.chid.nunique())
# The feature file and the id list must cover exactly the same customers.
assert set(df_cust_f.chid) == set(chid_array)
df_cust_f.head()

# load 消費檔 (原始 json 已經轉為 dataframe) 

In [None]:
'''
with open(cdtx_file) as f:
    cdtx_dict = json.load(f)
    
cdtx_rows = np.array(list(map(lambda x:list(x.values()), cdtx_dict.values())))
cdtx_cols = list(cdtx_dict.get('0').keys())    
df_cdtx = pd.DataFrame(data=cdtx_rows, columns=cdtx_cols)

df_cdtx.csmdt = df_cdtx.csmdt.apply(lambda x: x[:-len('T00:00:00.000Z')])
df_cdtx.sort_values(by=['csmdt', 'chid'], inplace=True, ignore_index=True)
df_cdtx.objam = df_cdtx.objam.astype(np.int64)

'''

# Read the transaction CSV and order it by (csmdt, chid); the elapsed time is printed.
# Per the original notes, csmdt already carries no information finer than the
# day and objam is already int64, so neither column is converted here.
t0 = time()
df_cdtx = (
    pd.read_csv(cdtx_file, skipinitialspace=True)
    .sort_values(by=['csmdt', 'chid'], ignore_index=True)
)
print(time() - t0)

print(df_cdtx.shape, df_cdtx.chid.nunique())

# Disabled in the original: check that the transactions cover exactly the ids in chid_array.
# assert len(set(df_cdtx.chid)-set(chid_array)) == 0 and len(set(chid_array)-set(df_cdtx.chid)) == 0
first_amount = df_cdtx.objam.loc[0]
first_date = df_cdtx.csmdt.loc[0]
assert type(first_amount) is np.int64
assert len(first_date) == 10

df_cdtx.head()

In [None]:
import gc

# Keep only the first 5000 transactions (rows are ordered by csmdt, chid),
# then ask the garbage collector to reclaim what it can.
df_cdtx = df_cdtx.head(5000)
gc.collect()

In [None]:
'''np.mean(df_cdtx.objam)

import matplotlib.pylab as plt 
plt.hist(np.log10(1+df_cdtx.objam),bins=100)
plt.show()'''

### add month column, chid convert to index

In [None]:
# Guard against running this cell twice: a second run would push the already
# converted chid values through chid_dict again and turn them into NaN.
assert 'month' not in df_cdtx.columns

# Add the `month` column that the monthly aggregation groups by (it was never
# created before), and replace the raw customer id by its 1-based index.
# csmdt is a 'YYYY-MM-DD' string (length 10 is asserted when the file is
# loaded), so its first 7 characters are the year-month; '-01' turns it into
# the first day of that month.
# NOTE(review): assumes df_cust_f.data_dt uses the same 'YYYY-MM-01' strings,
# since both are later joined on `data_dt` - confirm against the CSV.
df_cdtx = df_cdtx.assign(
    month=df_cdtx.csmdt.str[:7] + '-01',
    chid=df_cdtx.chid.map(chid_dict) + 1,
)

# df_cust_f.chid holds the same raw ids (it is asserted against chid_array when
# the file is loaded) and is later merged with, and filtered by, the integer
# ids above - so it has to be converted the same way.
df_cust_f = df_cust_f.assign(chid=df_cust_f.chid.map(chid_dict) + 1)

print(df_cdtx.chid.nunique())
# assert max(df_cdtx.chid) == len(chid_array)
df_cdtx.head()

### 取得整個月的 objam 

In [None]:
import gc

# Aggregate the transaction amount per (customer, month):
#   objam_sum   - total amount
#   objam_mean  - average amount
#   trans_count - number of transactions
# NOTE: the per-month count of distinct shop kinds (`shop_count`, based on
# stonc_6_label) was disabled in the original notebook and is not produced here.
monthly_objam = df_cdtx[['chid', 'month', 'objam']].groupby(['chid', 'month'])['objam']

df_cdtx_objam = (
    monthly_objam
    .agg(objam_sum='sum', objam_mean='mean', trans_count='count')
    .reset_index()
    .rename(columns={'month': 'data_dt'})
)

del monthly_objam
gc.collect()

# One row per customer and month that has at least one transaction.
df_cdtx_objam.shape

In [None]:
def outer_product_table_of_chids_and_months(df_cdtx):
    '''
    Build the full grid of customers and months.

    Every distinct `chid` of `df_cdtx` is paired with every distinct `month`,
    giving (# chids) x (# months) rows. Rows are ordered by chid and, within
    a chid, by month. The month is returned in a column named `data_dt`.
    '''
    customers = sorted(df_cdtx.chid.unique())
    months = sorted(df_cdtx.month.unique())

    # The customer varies slowest, so each customer's months stay contiguous.
    grid = [(customer, month) for customer in customers for month in months]

    return pd.DataFrame(grid, columns=['chid', 'data_dt'])

# Build the customer x month grid from the transactions; the targets and the
# customer features are joined onto it in the following cells.
df_full_y_sum = outer_product_table_of_chids_and_months(df_cdtx)
df_full_y_sum.shape

In [None]:
## join objam - attach each customer's monthly target figures.
# A left join keeps every (chid, data_dt) row of the grid; rows without any
# transaction get 0 in the joined columns.
df_full_y_sum = pd.merge(
    df_full_y_sum,
    df_cdtx_objam,
    how='left',
    on=['chid', 'data_dt'],
).fillna(0)

df_full_y_sum.shape

In [None]:
## join feature - attach each customer's monthly features.
# p.s. per the original note, df_cust_f stores the state at the beginning of each month.
# An inner join keeps only the (chid, data_dt) rows present in both tables;
# remaining missing values are replaced by 0.
df_full_y_sum = pd.merge(
    df_full_y_sum,
    df_cust_f,
    how='inner',
    on=['chid', 'data_dt'],
).fillna(0)
df_full_y_sum.shape

In [None]:
# Three-month mean of the monthly total: (this month + previous month +
# month before that) // 3, computed separately for each customer. Months that
# have no predecessor inside the customer's rows contribute 0.
# Assumes each customer's rows are already in chronological order.
# The shifts are done per customer with a grouped shift instead of masking the
# whole frame once per customer, which cost (# customers) x (# rows).
monthly_sum = df_full_y_sum['objam_sum']
per_customer = monthly_sum.groupby(df_full_y_sum['chid'])

objam_mean_m3 = (monthly_sum +
                 per_customer.shift(1).fillna(0) +
                 per_customer.shift(2).fillna(0)) // 3

# Same position and name as before; inserting the finished values avoids
# first creating an integer column and then writing floats into it.
df_full_y_sum.insert(6, 'objam_mean_M3', objam_mean_m3)

In [None]:
# Convert the date columns to datetime64.
# `astype(np.datetime64)` requests a unit-less dtype, which pandas 2.x rejects
# with a TypeError; pd.to_datetime is the supported conversion.
df_full_y_sum.data_dt = pd.to_datetime(df_full_y_sum.data_dt)
df_cdtx.csmdt = pd.to_datetime(df_cdtx.csmdt)

In [None]:
df_cdtx.sort_values(by=['chid', 'csmdt'], ignore_index=True, inplace=True)
# Days since the same customer's previous transaction. The difference is taken
# inside each customer's rows, so a customer's first transaction gets 0 rather
# than the distance to the previous customer's last transaction.
# Requires csmdt to be datetime64 already.
df_cdtx['timestamp_0'] = df_cdtx.groupby('chid').csmdt.diff().dt.days.fillna(0)
# Days since 2018-01-01.
df_cdtx['timestamp_1'] = (df_cdtx.csmdt - np.datetime64('2018-01-01')).dt.days.fillna(0)

In [2]:
# Drop the erroneous time differences: set timestamp_0 to 0 on the first
# transaction of every customer.
# Precondition: df_cdtx is sorted by chid first and by csmdt second. The time
# since the previous transaction only makes sense within one customer, so the
# value on the row where the chid changes must not be kept.
# TODO: this cell should be merged with the cell that creates timestamp_0.
#
# The previous version ran `import tqdm`, which rebinds the name `tqdm` from
# the progress-bar function (imported at the top of the notebook) to the
# module; `tqdm(...)` then raises "TypeError: 'module' object is not callable"
# here and in every later cell. The import is removed and the row-by-row scan
# is replaced by a vectorized comparison with the previous row.
is_first_of_customer = df_cdtx.chid.ne(df_cdtx.chid.shift())

df_cdtx.loc[is_first_of_customer, 'timestamp_0'] = 0

NameError: name 'df_cdtx' is not defined

In [None]:
# time series columns
# - Select the categorical and the numeric time-series inputs from the transactions.
# TODO: Question) which df_cdtx columns are outputs rather than inputs?

category_cols = [
    'chid', 'bnsfg', 'iterm', 'mcc', 'scity',
    'stonc_tag', 'stonc_label', 'stonm_label',
    'stonc_6_label', 'stonc_10_label',
]

numeric_cols = [
    'bnspt', 'timestamp_0', 'timestamp_1', 'objam',
]

# Work on an independent copy so the encoding below does not touch df_cdtx.
df_input = df_cdtx.loc[:, category_cols + numeric_cols].copy()

In [None]:
# time series: convert the categorical columns to integer codes.
# chid (category_cols[0]) is already an index and is left as it is.
encoded_cols = category_cols[1:]

# Compare and sort the values as strings. `np.str` (an alias of the builtin
# `str`) was removed in NumPy 1.24, so the builtin is used.
df_input[encoded_cols] = df_input[encoded_cols].astype(str)

# Per column: {value -> code}, codes start at 1 in sorted order of the values
# (presumably so that 0 stays free for the zero padding added in data_split).
mapper = {col: {value: index + 1 for index, value in enumerate(sorted(df_input[col].unique()))}
          for col in encoded_cols}

df_input[encoded_cols] = df_input[encoded_cols].apply(lambda x: x.map(mapper[x.name]))

print(df_input.shape)
df_input.head(2)

# Number of distinct values per encoded column.
for feat in mapper:
    print(feat, len(mapper[feat]))

In [None]:
# user feature columns / each month
# Unlike df_input, which comes from the transaction file, this data comes from
# the customer-feature file.
feat_category_cols = ['chid', 'masts', 'educd', 'trdtp', 'poscd']
feat_numeric_cols = ['slam', 'first_mob', 'constant_change', 'sum_l2_ind', 'sum_u2_ind', 'constant_l2_ind', 'constant_u4_ind', 
                     'growth_rate', 'monotone_down', 'monotone_up']

df_feat_input = df_cust_f[feat_category_cols + feat_numeric_cols + ['data_dt']].copy()
# `astype(np.datetime64)` requests a unit-less dtype, which pandas 2.x rejects
# with a TypeError; pd.to_datetime is the supported conversion.
df_feat_input.data_dt = pd.to_datetime(df_feat_input.data_dt)

print(df_feat_input.shape)
df_feat_input.tail()

In [None]:
# user feature: convert the categorical columns to integer codes.
# chid (feat_category_cols[0]) is left as it is.
feat_encoded_cols = feat_category_cols[1:]

# Compare and sort the values as strings. `np.str` (an alias of the builtin
# `str`) was removed in NumPy 1.24, so the builtin is used.
df_feat_input[feat_encoded_cols] = df_feat_input[feat_encoded_cols].astype(str)

# Per column: {value -> code}, codes start at 1 in sorted order of the values.
feat_mapper = {col: {value: index + 1 for index, value in enumerate(sorted(df_feat_input[col].unique()))}
               for col in feat_encoded_cols}

df_feat_input[feat_encoded_cols] = df_feat_input[feat_encoded_cols].apply(lambda x: x.map(feat_mapper[x.name]))

print(df_feat_input.shape)
df_feat_input.head(2)

In [None]:
# Number of distinct values per encoded customer-feature column.
for feat, codes in feat_mapper.items():
    print(feat, len(codes))

In [None]:
# Pick the target columns out of df_full_y_sum.
# 'shop_count' only exists when the shop-kind aggregation is enabled (it is
# disabled in this notebook), so selecting it unconditionally raised a
# KeyError. It is kept in its place when present and skipped otherwise; every
# other column is still required.
y_cols = ['chid', 'data_dt', 'objam_sum', 'objam_mean', 'trans_count', 'shop_count', 'objam_mean_M3']
y_cols = [col for col in y_cols if col != 'shop_count' or col in df_full_y_sum.columns]
df_y = df_full_y_sum[y_cols].copy().reset_index(drop=True)

print(df_y.shape)
df_y.tail()

In [None]:
def data_split(df_x, df_f, df_y, window_size, test_size=2):
    """Build per-customer, per-month samples and split them into train and test.

    For every customer in ``df_y`` and every pair of consecutive distinct
    timestamps ``(ts_f, ts_y)`` of that customer (pair number ``j``), one
    sample is produced:

    * x: the last ``window_size`` rows of the customer's ``df_x`` rows with
      ``timestamp_1 < ts_y``; ``timestamp_1`` is replaced by
      ``ts_y - timestamp_1``, and a shorter window is padded with leading
      all-zero rows.
    * f: the first of the customer's ``df_f`` rows whose timestamp equals
      ``ts_f``, without its last column (the ``timestamp`` added here).
    * y: row ``j + 1`` of the customer's ``df_y`` rows, without its last
      column (the ``timestamp`` added here). This presumably relies on the
      customer's ``df_y`` rows being in chronological order.

    With ``last`` = (number of the customer's ``df_y`` rows) - 1, samples with
    ``j < last - test_size`` go to the training lists, samples with
    ``last - test_size <= j < last`` go to the test lists, and the customer's
    loop stops at the first ``j >= last``.

    Args:
        df_x: transactions; needs the columns ``chid`` and ``timestamp_1``.
        df_f: customer features; needs ``chid`` and a datetime ``data_dt``.
        df_y: targets; needs ``chid`` and a datetime ``data_dt``.
        window_size: number of transaction rows per x sample.
        test_size: number of sample positions per customer reserved for test.

    Returns:
        ``(x_train, x_test, f_train, f_test, y_train, y_test)`` as numpy arrays.
        The inputs are copied and not modified.
    """
    origin = np.datetime64('2018-01-01')
    df_x, df_f, df_y = df_x.copy(), df_f.copy(), df_y.copy()

    # Express the month of every feature/target row as days since 2018-01-01,
    # the same scale as timestamp_1 of the transactions.
    df_f['timestamp'] = (df_f.data_dt - origin).apply(lambda delta: delta.days).fillna(0)
    df_y['timestamp'] = (df_y.data_dt - origin).apply(lambda delta: delta.days).fillna(0)

    x_train, f_train, y_train = [], [], []
    x_test, f_test, y_test = [], [], []

    for chid in tqdm(sorted(df_y.chid.unique())):
        cust_x = df_x[df_x.chid == chid].reset_index(drop=True)
        cust_f = df_f[df_f.chid == chid].reset_index(drop=True)
        cust_y = df_y[df_y.chid == chid].reset_index(drop=True)

        last = cust_y.shape[0] - 1
        stamps = sorted(cust_y.timestamp.unique())

        for j, (ts_f, ts_y) in enumerate(zip(stamps[:-1], stamps[1:])):
            # Transactions strictly before the target month, most recent last.
            window = cust_x[cust_x.timestamp_1 < ts_y][-window_size:].copy()
            # Turn the absolute day offset into "days before the target month".
            window.timestamp_1 = ts_y - window.timestamp_1
            window = window.values

            n_rows = window.shape[0]
            if n_rows < window_size:
                # Pad at the front so the real rows stay at the end of the window.
                padded = np.zeros((window_size, cust_x.shape[1]))
                if n_rows > 0:
                    padded[-n_rows:] = window
                window = padded

            if j < last - test_size:
                x_out, f_out, y_out = x_train, f_train, y_train
            elif j < last:
                x_out, f_out, y_out = x_test, f_test, y_test
            else:
                break

            x_out.append(window)
            f_out.append(cust_f[cust_f.timestamp == ts_f].values[0, :-1])
            y_out.append(cust_y.values[j + 1, :-1])

    return (
        np.array(x_train), np.array(x_test),
        np.array(f_train), np.array(f_test),
        np.array(y_train), np.array(y_test),
    )

In [None]:
# full data
# x: transaction data
# f: customer feature data
# y: prediction targets
# input month: train -> 2018[1, 2, ..., 12]+2019[1, 2, ..., 9], test -> 2019[10, 11]
x_train, x_test, f_train, f_test, y_train, y_test = data_split(
    df_x=df_input,
    df_f=df_feat_input,
    df_y=df_y,
    window_size=120,
    test_size=2,
)

In [None]:
# Shapes of the six arrays, in the order x, f, y with train before test.
tuple(arr.shape for arr in (x_train, x_test, f_train, f_test, y_train, y_test))

In [None]:
# Replace the last target (objam_mean_M3) by its difference to column 2 of the
# target arrays, i.e. column 2 minus objam_mean_M3, and rename it accordingly.
y_columns = list(df_y.columns)
y_columns[-1] = 'objam_mean_M3_diff'

for targets in (y_train, y_test):
    targets[:, -1] = targets[:, 2] - targets[:, -1]

print(y_columns)

In [None]:
# Write the six sample arrays to <sample_path>/RNN/<name>.npy.
# np.save does not create missing folders, so the target folder is created
# first; exist_ok=True makes this a no-op when it already exists.
rnn_dir = os.path.join(sample_path, 'RNN')
os.makedirs(rnn_dir, exist_ok=True)

for name, array in (('x_train', x_train), ('x_test', x_test),
                    ('f_train', f_train), ('f_test', f_test),
                    ('y_train', y_train), ('y_test', y_test)):
    np.save(os.path.join(rnn_dir, name), array)

In [None]:
# Store the value -> code tables of the transaction and customer-feature columns.
for name, table in (('feature_map', mapper), ('cust_feature_map', feat_mapper)):
    np.save(os.path.join(sample_path, 'RNN', name), table)

In [None]:
# Record the column names that belong to the saved x / f / y arrays.
columns = dict(
    x_columns=list(df_input.columns),
    f_columns=list(df_feat_input.columns),
    y_columns=y_columns,
)
np.save(os.path.join(sample_path, 'RNN', 'columns'), columns)
print(columns)