# Data Preprocess

In [1]:
import os
import math
import datetime
import numpy as np
import pandas as pd

from time import time
from tqdm.notebook import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
sample_path = '../data'

chid_file = 'sample_chid.txt'
cdtx_file = 'sample_zip_if_cca_cdtx0001_hist.csv'
cust_f_file = 'sample_zip_if_cca_cust_f.csv'

embedding_path = '../Embedding/sliding_windows_MM'
chid_dict_file = 'cust_mcc_idx_map_50k.npy'

In [3]:
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=np.str)
chid_dict = np.load(os.path.join(embedding_path, chid_dict_file), allow_pickle=True).tolist()
df_cdtx = pd.read_csv(os.path.join(sample_path, cdtx_file)) # 交易記錄檔
df_cust_f = pd.read_csv(os.path.join(sample_path, cust_f_file)) # user feature
df_cust_f.drop_duplicates(ignore_index=True, inplace=True)

print(chid_array.shape, len(chid_dict), df_cdtx.shape, df_cust_f.shape)

(50000,) 50502 (6654938, 10) (1176172, 32)


In [4]:
df_cdtx = df_cdtx[df_cdtx.chid.isin(chid_array)].copy()
df_cust_f = df_cust_f[df_cust_f.chid.isin(chid_array)].copy()

In [5]:
df_cdtx.chid = df_cdtx.chid.map(chid_dict)
df_cust_f.chid = df_cust_f.chid.map(chid_dict)

print(len(df_cdtx.chid.unique()), len(df_cust_f.chid.unique()))
df_cust_f.groupby('chid').count().sort_values(by='data_ym').head()

50000 50000


Unnamed: 0_level_0,data_ym,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
chid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15475,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49572,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
23964,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49830,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
41574,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11


In [6]:
df_cdtx['month'] = df_cdtx.csmdt.apply(lambda x: x[:-3]+'-01')
df_cdtx.head(2)

Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,month
0,N,0,8054,2018-01-01,0,5411,151,TAOYUAN,5,,2018-01-01
1,N,0,8054,2018-01-01,0,5411,146,TAOYUAN,5,,2018-01-01


In [7]:
## 填滿後12個月

list_chid = sorted(df_cust_f.chid.unique())
list_month = sorted(df_cust_f.data_dt.unique())[12:]

df_full_y_sum = pd.DataFrame({
    'chid': list_chid*len(list_month),
}).sort_values(by='chid', ignore_index=True)
df_full_y_sum['data_dt'] = list_month*len(list_chid)

df_full_y_sum.shape

(600000, 2)

In [8]:
## join feature
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

numeric_cols = sorted(set(df_cust_f.columns) - set(category_cols) - set(['chid', 'data_ym', 'data_dt']), 
                      key=list(df_cust_f.columns).index)

df_full_y_sum = df_full_y_sum.merge(df_cust_f[['chid', 'data_ym'] + category_cols + numeric_cols], 
                                    how='left', 
                                    left_on=['chid', 'data_dt'], 
                                    right_on=['chid', 'data_ym'])

#df_full_y_sum.dropna(thresh=len(numeric_cols+category_cols), inplace=True)

## fill na value, numerical: 0, category: '-1'
values = dict()

for col in numeric_cols:
    values[col] = 0
    
for col in category_cols:
    values[col] = '-1'
    
df_full_y_sum.fillna(value=values, inplace=True)
df_full_y_sum.shape

(600000, 32)

In [9]:
## 取得整個月的 objam 
temp_cdtx = df_cdtx.groupby(['chid', 'month']).sum()
df_cdtx_objam = pd.DataFrame(list(map(list, temp_cdtx.index)), columns=['chid', 'data_dt'])
df_cdtx_objam['objam'] = np.ma.log(temp_cdtx.objam.values).filled(0)

In [10]:
## join objam

df_full_y_sum = df_full_y_sum.merge(df_cdtx_objam, 
                                    how='left', 
                                    left_on=['chid', 'data_dt'], 
                                    right_on=['chid', 'data_dt']).fillna(0)

df_full_y_sum.shape

(600000, 33)

In [11]:
mapper = {col: {value: index for index, value in enumerate(sorted(df_full_y_sum[col].unique()))} 
          for col in category_cols}

df_full_y_sum[category_cols] = df_full_y_sum[category_cols].apply(lambda x: x.map(mapper[x.name]))

print(df_full_y_sum.shape)
df_full_y_sum.head(2)

(600000, 33)


Unnamed: 0,chid,data_dt,data_ym,masts,educd,naty,trdtp,poscd,cuorg,monin,...,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down,objam
0,0,2019-01-01,2019-01-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,3.0,0.0
1,0,2019-02-01,2019-02-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,5.0,0.0


In [12]:
df_full_y_sum.drop(columns=['data_ym'], inplace=True)

ignore_cols = ['data_dt']
category_cols = ['chid'] + category_cols
numeric_cols = sorted(set(df_full_y_sum.columns) - set(category_cols) - set(ignore_cols), 
                      key=list(df_full_y_sum.columns).index)

print(len(ignore_cols), ignore_cols, '\n')
print(len(category_cols), category_cols, '\n')
print(len(numeric_cols), numeric_cols)

1 ['data_dt'] 

7 ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg'] 

24 ['monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'objam']


In [59]:
def data_split(df, numeric_cols=[], category_cols=[], test_size=0.2, x_minmax=None, y_minmax=None):
    
    x_train, x_test, y_train, y_test = [], [], [], []
    df = df[category_cols + numeric_cols].copy()
    new_chid = np.arange(df.shape[0])
    
    for i in tqdm(sorted(df.chid.unique())):
        data = df[df.chid == i].copy()
        last = data.shape[0]-1
        test_num = round((data.shape[0]-1)*test_size)            
        new_index = np.arange(i*last,i*last+last)
        train_x = data.iloc[0:last-test_num].copy()
        train_x.loc[:,'chid'] = new_index[:last-test_num]
        
        x_train.append(train_x)
        y_train.append(data.iloc[1:last-test_num+1, [-1]])

        test_x = data.iloc[last-test_num:last].copy()
        test_x.loc[:,'chid'] = new_index[last-test_num:last]
        
        x_test.append(test_x)
        y_test.append(data.iloc[last-test_num+1:last+1, [-1]])
        
    
    x_train = pd.concat(x_train)
    y_train = pd.concat(y_train)
    
    x_test = pd.concat(x_test)
    y_test = pd.concat(y_test)
    
    if x_minmax or y_minmax:
        scaler_dcit = dict()
    
    if x_minmax:
        x_scaler = MinMaxScaler(feature_range=x_minmax)
        x_train[numeric_cols] = x_scaler.fit_transform(x_train[numeric_cols])
        x_test[numeric_cols] = x_scaler.transform(x_test[numeric_cols]) 
        
        scaler_dcit['x'] = x_scaler
    if y_minmax:
        y_scaler = MinMaxScaler(feature_range=y_minmax)  
        y_train = y_scaler.fit_transform(y_train)
        y_test = y_scaler.transform(y_test)    
        
        scaler_dict['y'] = y_scaler
         
    if x_minmax or y_minmax:
        return x_train, x_test, y_train, y_test, scaler_dcit
    else:
        return x_train, x_test, y_train, y_test

In [60]:
x_minmax, y_minmax = (0,1), None

if x_minmax or y_minmax:
    x_train, x_test, y_train, y_test, scaler_dcit = data_split(df_full_y_sum, numeric_cols, category_cols, 
                                                               x_minmax=x_minmax, y_minmax=y_minmax, test_size=0.166)
else:
    x_train, x_test, y_train, y_test = data_split(df_full_y_sum, numeric_cols, category_cols, test_size=0.166)    

num_chid = len(set(df_full_y_sum.chid))
print('train:{}, test:{}'.format(x_train.shape[0]//num_chid, x_test.shape[0]//num_chid))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))


train:9, test:2
(450000, 31) (450000, 1) (100000, 31) (100000, 1)


In [63]:
x_test.head()

Unnamed: 0,chid,masts,educd,naty,trdtp,poscd,cuorg,monin,wrky,first_mob,...,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down,objam
9,9,3,5,2,25,2,8,0.00082,0.0,0.634069,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.699331,0.166667,0.0,0.589435
10,10,3,5,2,25,2,8,0.00082,0.0,0.637224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.699262,0.0,0.333333,0.572487
21,20,1,2,2,25,5,8,0.001176,0.0,0.634069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.699245,0.0,0.0,0.521934
22,21,1,2,2,25,5,8,0.001176,0.0,0.637224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.699245,0.0,0.0,0.0
33,31,1,4,2,7,6,8,0.001287,0.0,0.280757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.699264,0.0,0.333333,0.583097


In [64]:
y_test.head()

Unnamed: 0,objam
10,10.621522
11,12.918493
22,0.0
23,0.0
34,13.009936


In [65]:
import copy
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

In [66]:
def feature_index(x, feature_cols):
    feature_idx = {}
    x_cols = list(x.columns)
    for i in feature_cols:
        feature_idx[i] = x_cols.index(i)
        
    return feature_idx

def Linear_block(in_dim, out_dim):
    block = torch.nn.Sequential(torch.nn.Linear(in_dim, out_dim),
                                torch.nn.ReLU())
    return block

class MLP(torch.nn.Module):
    def __init__(self, category_cols, category_dims, ori_dim, layer_dims, embedding_dim):
        super(MLP, self).__init__()
        self.out_dims = [ori_dim, *layer_dims]
        Linear_blokcs = [Linear_block(in_dim, out_dim)
                         for in_dim, out_dim in zip(self.out_dims, self.out_dims[1:])]
        self.model = torch.nn.Sequential(*Linear_blokcs)
        self.embedding_dict = torch.nn.ModuleDict({category_col:torch.nn.Embedding(category_dim,
                                                                                   embedding_dim)
                                                   for category_col, category_dim in zip(category_cols,category_dims)})



        
    def forward(self, x, category_cols, category_dict, numeric_dict):
    
        category_embeddings = [self.embedding_dict[item[0]](x[:,item[1]].long()) for item in category_dict.items()]
        category_embeddings = torch.cat(category_embeddings, -1)
        
        numeric_idx = torch.Tensor(list(numeric_dict.values())).long()
        
        x = torch.cat([category_embeddings, x[:,numeric_idx]], -1)
        
        x = self.model(x)
        
        return x

In [67]:
print(len(ignore_cols), ignore_cols, '\n')
print(len(category_cols), category_cols, '\n')
print(len(numeric_cols), numeric_cols)

1 ['data_dt'] 

7 ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg'] 

24 ['monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'objam']


In [68]:
category_cols = category_cols[:]
numeric_cols = numeric_cols

In [69]:
category_dims = [df_full_y_sum[feat].nunique() for feat in category_cols]

category_dict = feature_index(x_train, category_cols)
numeric_dict = feature_index(x_train, numeric_cols)

category_dims[category_cols.index('chid')] = x_train.shape[0] + x_test.shape[0]
embedding_size = 64

layer_dims = [256, 128, 1]
input_dim = len(category_dict)*embedding_size + len(numeric_dict)

epochs = 400
batch_size = 2048
learning_rate = 0.001

In [91]:
train_dataset = TensorDataset(torch.from_numpy(x_train.to_numpy()),
                              torch.from_numpy(y_train.to_numpy()))
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TensorDataset(torch.from_numpy(x_test.to_numpy()),
                              torch.from_numpy(y_test.to_numpy()))
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [71]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
model = MLP(category_cols, category_dims, input_dim,layer_dims, embedding_size).to(device)
criterion = torch.nn.MSELoss()
model

MLP(
  (model): Sequential(
    (0): Sequential(
      (0): Linear(in_features=472, out_features=256, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=256, out_features=128, bias=True)
      (1): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=128, out_features=1, bias=True)
      (1): ReLU()
    )
  )
  (embedding_dict): ModuleDict(
    (chid): Embedding(550000, 64)
    (masts): Embedding(4, 64)
    (educd): Embedding(7, 64)
    (naty): Embedding(3, 64)
    (trdtp): Embedding(28, 64)
    (poscd): Embedding(10, 64)
    (cuorg): Embedding(31, 64)
  )
)

In [72]:
USE_NODE2VEC = True
if USE_NODE2VEC:
    embedding = {}
    for i in range(12):
        embedding_file = f'node2vec_50k_2018{i+1:02d}_2019{i+1:02d}.npy'
        embedding[f'{i+1}'] = np.load(os.path.join(embedding_path, embedding_file))

    node2vec_emb = []
    for i in tqdm(df_full_y_sum.chid.unique()):
        for j in range(12):
            node2vec_emb.append(embedding[f'{j+1}'][i])

    node2vec_emb = torch.Tensor(node2vec_emb[50000:])
    
    model.embedding_dict['chid'].weight.data.copy_(node2vec_emb)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))




In [73]:
CHID_FINETUNING = True

if CHID_FINETUNING:
    param_optimizer = list(model.named_parameters())
    optimizer_parameters = [{"params": [p for n, p in param_optimizer]}]  
    #model.embedding_dict['chid'].weight.requires_grad = True
else:
    param_optimizer = list(model.named_parameters())
    optimizer_parameters = [{"params": [p for n, p in param_optimizer if not 'embedding_dict.chid.weight' == n]}]
    #model.embedding_dict['chid'].weight.requires_grad = False
    
optimizer = torch.optim.Adam(optimizer_parameters, lr=learning_rate)

In [74]:
early_stop = 20

best_loss = 1e10
early_cnt = 0
RMSE = []
for epoch in tqdm(range(epochs)):
    
    train_loss = 0
    test_loss = 0
    train_output = np.array([])
    train_y = np.array([])
    test_output = np.array([])
    test_y = np.array([])
    
    for x , y in train_loader:
        x, y = x.float().to(device), y.float().to(device)
        model.train()
        optimizer.zero_grad()
        
        output = model(x, category_cols, category_dict, numeric_dict)

        loss = criterion(output, y)
        train_loss += loss.item()
        train_output = np.concatenate([train_output,output.cpu().detach().numpy().reshape(-1)])
        train_y = np.concatenate([train_y,y.cpu().detach().numpy().reshape(-1)])
        
        loss.backward()
        optimizer.step()
        
    for x , y in test_loader:
        x, y = x.float().to(device), y.float().to(device)
        model.eval()        
        output = model(x, category_cols, category_dict, numeric_dict)
        loss = criterion(output, y)
        test_loss += loss.item()
        test_output = np.concatenate([test_output,output.cpu().detach().numpy().reshape(-1)])
        test_y = np.concatenate([test_y,y.cpu().detach().numpy().reshape(-1)])
    
    #train_loss = np.sqrt(train_loss/len(train_loader))
    #test_loss = np.sqrt(test_loss/len(test_loader))
    
    train_output, train_y = np.e**train_output, np.e**train_y
    train_RMSE = mean_squared_error(train_output, train_y, squared=False)
    train_mean = mean_absolute_error(train_output, train_y)
    train_median = median_absolute_error(train_output, train_y)
    
    test_output, test_y = np.e**test_output, np.e**test_y
    test_RMSE = mean_squared_error(test_output, test_y, squared=False)
    test_mean = mean_absolute_error(test_output, test_y)
    test_median = median_absolute_error(test_output, test_y)
    
    print(f'epoch:{epoch}\ntrain loss:{train_RMSE:.0f},test loss:{test_RMSE:.0f}\ntrain MAE(mean):{train_mean:.0f},test MAE(mean):{test_mean:.0f}\ntrain MAE(median):{train_median:.0f}, test MAE(median):{test_median:.0f}')
    
    if test_RMSE <= best_loss:
        best_model_params = copy.deepcopy(model.state_dict())
        best_loss = test_RMSE
        print('\tBetter!')
        early_cnt = 0
    else:
        early_cnt += 1
    
    if early_cnt >= early_stop:
        break

model.load_state_dict(best_model_params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

epoch:0
train loss:397982,test loss:422865
train MAE(mean):64420,test MAE(mean):70567
train MAE(median):12744, test MAE(median):12864
	Better!
epoch:1
train loss:395364,test loss:419798
train MAE(mean):60717,test MAE(mean):67148
train MAE(median):11646, test MAE(median):12199
	Better!
epoch:2
train loss:407504,test loss:419652
train MAE(mean):63248,test MAE(mean):69637
train MAE(median):10602, test MAE(median):15076
	Better!
epoch:3
train loss:380882,test loss:421006
train MAE(mean):47252,test MAE(mean):70600
train MAE(median):7598, test MAE(median):15200
epoch:4
train loss:366811,test loss:422287
train MAE(mean):37302,test MAE(mean):73092
train MAE(median):5402, test MAE(median):16684
epoch:5
train loss:349779,test loss:424161
train MAE(mean):29402,test MAE(mean):75527
train MAE(median):4273, test MAE(median):17677
epoch:6
train loss:324741,test loss:425627
train MAE(mean):23903,test MAE(mean):76248
train MAE(median):3307, test MAE(median):17597
epoch:7
train loss:295081,test loss:428

<All keys matched successfully>

In [None]:
test_dataset = TensorDataset(torch.from_numpy(pd.concat([a,c]).to_numpy()),
                              torch.from_numpy(pd.concat([b,d]).to_numpy()))
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [92]:
train_output = np.array([])
train_y = np.array([])
test_output = np.array([])
test_y = np.array([])

model.eval()

for x , y in train_loader:
    x, y = x.float().to(device), y.float().to(device)
    optimizer.zero_grad()

    output = model(x, category_cols, category_dict, numeric_dict)
    train_output = np.concatenate([train_output,output.cpu().detach().numpy().reshape(-1)])
    train_y = np.concatenate([train_y,y.cpu().detach().numpy().reshape(-1)])

train_output, train_y = np.e**train_output, np.e**train_y
    
for x , y in test_loader:
    x, y = x.float().to(device), y.float().to(device)
            
    output = model(x, category_cols, category_dict, numeric_dict)
    test_output = np.concatenate([test_output,output.cpu().detach().numpy().reshape(-1)])
    test_y = np.concatenate([test_y,y.cpu().detach().numpy().reshape(-1)])

test_output, test_y = np.e**test_output, np.e**test_y

In [76]:
index = range(0,x_test.shape[0], 2)

a = x_test.iloc[index]
b = y_test.iloc[index]

index = range(1,x_test.shape[0], 2)
c = x_test.iloc[index]
d = y_test.iloc[index]

In [93]:
print('train\tRMSE: {:.0f} MAE(mean): {:.0f} MAE(median): {:.0f}'.format(
    mean_squared_error(train_y, train_output, squared=False), 
    mean_absolute_error(train_y, train_output), 
    median_absolute_error(train_y, train_output)
))
print('test\tRMSE: {:.0f} MAE(mean): {:.0f} MAE(median): {:.0f}'.format(
    mean_squared_error(test_y, test_output, squared=False), 
    mean_absolute_error(test_y, test_output), 
    median_absolute_error(test_y, test_output)
))

train	RMSE: 381741 MAE(mean): 46093 MAE(median): 6981
test	RMSE: 419652 MAE(mean): 69637 MAE(median): 15076


In [98]:
df_out = pd.DataFrame({'chid':np.repeat(range(50000),2)})
df_out['true'] = test_y
df_out['pred'] = test_output

In [102]:
df_out.to_csv('Node2Vec-DynamicGraph.csv', index=False, encoding='utf-8')

In [99]:
df_out

Unnamed: 0,chid,true,pred
0,0,41008.018652,20578.283036
1,0,407784.150652,136273.758810
2,1,1.000000,6377.554582
3,1,1.000000,23.890428
4,2,446830.844943,58561.669369
...,...,...,...
99995,49997,1.000000,8175.996324
99996,49998,187636.075613,9587.329211
99997,49998,97537.043514,54450.505752
99998,49999,85734.018490,3790.432201
