## Data Preprocessing

In [1]:
import os
import math
import datetime
import numpy as np
import pandas as pd

from time import time
from tqdm.notebook import tqdm, trange
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Directory that holds every sample input file.
sample_path = './data'

# Text file of chid values (read with np.loadtxt as strings).
chid_file = 'sample_chid.txt'
# Pickled .npy object that is applied to the chid columns through Series.map.
chid_dict_file = 'sample_idx_map.npy'
# Transaction record CSV.
cdtx_file = 'sample_zip_if_cca_cdtx0001_hist.csv'
# User feature CSV.
cust_f_file = 'sample_zip_if_cca_cust_f.csv'

In [3]:
# `np.str` was only an alias of the builtin `str` and was removed in NumPy 1.24,
# so the builtin is used directly.
chid_array = np.loadtxt(os.path.join(sample_path, chid_file), dtype=str)
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
chid_dict = np.load(os.path.join(sample_path, chid_dict_file), allow_pickle=True).tolist()
df_cdtx = pd.read_csv(os.path.join(sample_path, cdtx_file))  # transaction records
df_cust_f = pd.read_csv(os.path.join(sample_path, cust_f_file))  # user features
df_cust_f.drop_duplicates(ignore_index=True, inplace=True)

print(chid_array.shape, len(chid_dict), df_cdtx.shape, df_cust_f.shape)

(50000,) 50000 (6654938, 10) (1176172, 32)


In [4]:
# Keep only the rows whose chid appears in the sampled chid array.
in_sample_cdtx = df_cdtx['chid'].isin(chid_array)
in_sample_cust_f = df_cust_f['chid'].isin(chid_array)

df_cdtx = df_cdtx.loc[in_sample_cdtx].copy()
df_cust_f = df_cust_f.loc[in_sample_cust_f].copy()

In [5]:
# Replace every raw chid with the value chid_dict maps it to.
df_cdtx['chid'] = df_cdtx['chid'].map(chid_dict)
df_cust_f['chid'] = df_cust_f['chid'].map(chid_dict)

n_chid_cdtx = len(df_cdtx['chid'].unique())
n_chid_cust_f = len(df_cust_f['chid'].unique())
print(n_chid_cdtx, n_chid_cust_f)

# Per-chid non-null counts, smallest data_ym count first.
rows_per_chid = df_cust_f.groupby('chid').count()
rows_per_chid.sort_values(by='data_ym').head()

50000 50000


Unnamed: 0_level_0,data_ym,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,poscd,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
chid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15475,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49572,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
23964,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
49830,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
41574,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11


In [6]:
# Drop the last three characters of each csmdt string and append '-01'.
# The vectorised .str accessor replaces a Python-level lambda that ran once
# per row of the transaction table.
df_cdtx['month'] = df_cdtx['csmdt'].str[:-3] + '-01'
df_cdtx.head(2)

Unnamed: 0,bnsfg,bnspt,chid,csmdt,iterm,mcc,objam,scity,tcode,hcefg,month
0,N,0,8054,2018-01-01,0,5411,151,TAOYUAN,5,,2018-01-01
1,N,0,8054,2018-01-01,0,5411,146,TAOYUAN,5,,2018-01-01


In [7]:
## Build one row for every (chid, month) pair, using the sorted distinct
## data_dt values from the 13th one onward.

list_chid = sorted(df_cust_f.chid.unique())
list_month = sorted(df_cust_f.data_dt.unique())[12:]

# Cartesian product, chid-major: each chid is followed by all of its months.
chid_month_grid = pd.MultiIndex.from_product([list_chid, list_month],
                                             names=['chid', 'data_dt'])
df_full_y_sum = chid_month_grid.to_frame(index=False)

df_full_y_sum.shape

(600000, 2)

In [8]:
# Display the (chid, data_dt) grid built in the previous cell.
df_full_y_sum

Unnamed: 0,chid,data_dt
0,0,2019-01-01
1,0,2019-02-01
2,0,2019-03-01
3,0,2019-04-01
4,0,2019-05-01
...,...,...
599995,49999,2019-08-01
599996,49999,2019-09-01
599997,49999,2019-10-01
599998,49999,2019-11-01


In [9]:
## join feature
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Every remaining column that is neither categorical nor a key/date column,
# kept in the order it appears in df_cust_f.
non_numeric = set(category_cols) | {'chid', 'data_ym', 'data_dt'}
numeric_cols = [col for col in df_cust_f.columns if col not in non_numeric]

feature_frame = df_cust_f[['chid', 'data_ym'] + category_cols + numeric_cols]
df_full_y_sum = df_full_y_sum.merge(feature_frame,
                                    how='left',
                                    left_on=['chid', 'data_dt'],
                                    right_on=['chid', 'data_ym'])

## fill na value, numerical: 0, category: '-1'
values = {col: 0 for col in numeric_cols}
values.update({col: '-1' for col in category_cols})

df_full_y_sum = df_full_y_sum.fillna(value=values)
df_full_y_sum.shape

(600000, 32)

In [10]:
## Total objam per (chid, month)
# Aggregate only the objam column. Summing the whole frame also tries to sum
# the unrelated text/code columns, which is wasted work and, on pandas >= 2.0
# (where non-numeric columns are no longer dropped silently), either
# concatenates strings or raises.
temp_cdtx = df_cdtx.groupby(['chid', 'month'])[['objam']].sum()
df_cdtx_objam = temp_cdtx.index.to_frame(index=False, name=['chid', 'data_dt'])
# Natural log of the monthly total; np.ma.log masks values outside the log
# domain (<= 0) and those masked entries are filled with 0.
df_cdtx_objam['objam'] = np.ma.log(temp_cdtx['objam'].to_numpy()).filled(0)

In [11]:
## join objam

df_full_y_sum = df_full_y_sum.merge(df_cdtx_objam, how='left', on=['chid', 'data_dt'])
# fillna(0) applies to the whole frame, so any value still missing after the
# merge becomes 0.
df_full_y_sum = df_full_y_sum.fillna(0)

df_full_y_sum.shape

(600000, 33)

In [12]:
# For each categorical column, number its sorted distinct values 0..k-1.
mapper = {}
for col in category_cols:
    levels = sorted(df_full_y_sum[col].unique())
    mapper[col] = {value: index for index, value in enumerate(levels)}

# Replace each categorical value with its integer code.
for col in category_cols:
    df_full_y_sum[col] = df_full_y_sum[col].map(mapper[col])

print(df_full_y_sum.shape)
df_full_y_sum.head(2)

(600000, 33)


Unnamed: 0,chid,data_dt,data_ym,masts,educd,naty,trdtp,poscd,cuorg,monin,...,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down,objam
0,0,2019-01-01,2019-01-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,3.0,0.0
1,0,2019-02-01,2019-02-01,3,5,2,23,2,8,173472.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,5.0,0.0


In [13]:
# The two guards below make this cell safe to re-run: without them a second
# execution raises KeyError on the already-dropped column and prepends 'chid'
# to category_cols again.
df_full_y_sum = df_full_y_sum.drop(columns=['data_ym'], errors='ignore')

ignore_cols = ['data_dt']
if 'chid' not in category_cols:
    category_cols = ['chid'] + category_cols
# Everything that is neither categorical nor ignored, in frame column order.
numeric_cols = sorted(set(df_full_y_sum.columns) - set(category_cols) - set(ignore_cols),
                      key=list(df_full_y_sum.columns).index)

print(len(ignore_cols), ignore_cols, '\n')
print(len(category_cols), category_cols, '\n')
print(len(numeric_cols), numeric_cols)

1 ['data_dt'] 

7 ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg'] 

24 ['monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'objam']


In [14]:
def data_split(df, numeric_cols=[], category_cols=[], test_size=0.166, x_minmax=None, y_minmax=None):
    """Split each chid's rows, in frame order, into next-row train/test pairs.

    The frame is reduced to ``category_cols + numeric_cols`` (which must include
    a ``chid`` column). Within each chid, row ``t`` is a feature row and the
    last selected column of row ``t + 1`` is its target. The final
    ``round(n_rows * test_size)`` pairs of every chid go to the test split and
    the earlier pairs go to the train split.

    Args:
        df: source frame containing every requested column.
        numeric_cols: numeric feature columns; the last one is the target.
        category_cols: categorical feature columns (left unscaled).
        test_size: fraction of each chid's rows used for the test split.
        x_minmax: optional ``feature_range`` for a MinMaxScaler that is fitted
            on the train numeric features and applied to train and test.
        y_minmax: optional ``feature_range`` for a MinMaxScaler that is fitted
            on the train targets and applied to train and test targets.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` when no scaling is requested,
        otherwise ``(x_train, x_test, y_train, y_test, scaler_dict)`` where
        ``scaler_dict`` holds the fitted scalers under ``'x'`` and/or ``'y'``.
        When ``y_minmax`` is set, the targets are whatever the scaler returns
        rather than DataFrames.
    """
    # The default lists are never mutated; copying also accepts tuples.
    numeric_cols = list(numeric_cols)
    category_cols = list(category_cols)

    x_train, x_test, y_train, y_test = [], [], [], []
    df = df[category_cols + numeric_cols].copy()

    # groupby visits chids in ascending order and keeps each group's row order,
    # matching a per-chid boolean filter without rescanning the whole frame
    # once per chid.
    for _, data in tqdm(df.groupby('chid', sort=True)):
        last = data.shape[0] - 1
        test_num = round(data.shape[0]*test_size)
        boundary = last - test_num

        x_train.append(data.iloc[0:boundary])
        y_train.append(data.iloc[1:boundary + 1, [-1]])

        x_test.append(data.iloc[boundary:last])
        y_test.append(data.iloc[boundary + 1:last + 1, [-1]])

    x_train = pd.concat(x_train)
    y_train = pd.concat(y_train)

    x_test = pd.concat(x_test)
    y_test = pd.concat(y_test)

    if not (x_minmax or y_minmax):
        return x_train, x_test, y_train, y_test

    scaler_dict = dict()

    if x_minmax:
        # Fit on train only so no test statistics leak into the scaler.
        x_scaler = MinMaxScaler(feature_range=x_minmax)
        x_train[numeric_cols] = x_scaler.fit_transform(x_train[numeric_cols])
        x_test[numeric_cols] = x_scaler.transform(x_test[numeric_cols])
        scaler_dict['x'] = x_scaler

    if y_minmax:
        y_scaler = MinMaxScaler(feature_range=y_minmax)
        y_train = y_scaler.fit_transform(y_train)
        y_test = y_scaler.transform(y_test)
        # Previously stored under a misspelled name, which raised NameError
        # whenever y_minmax was set.
        scaler_dict['y'] = y_scaler

    return x_train, x_test, y_train, y_test, scaler_dict

In [15]:
x_minmax, y_minmax = (0,1), None

# data_split returns a fifth element (the fitted scalers) only when at least
# one feature_range is given, so unpack according to the same condition.
split_result = data_split(df_full_y_sum, numeric_cols, category_cols,
                          x_minmax=x_minmax, y_minmax=y_minmax, test_size=0.166)
if x_minmax or y_minmax:
    x_train, x_test, y_train, y_test, scaler_dcit = split_result
else:
    x_train, x_test, y_train, y_test = split_result

num_chid = len(set(df_full_y_sum.chid))
rows_per_chid_train = x_train.shape[0]//num_chid
rows_per_chid_test = x_test.shape[0]//num_chid
print('train:{}, test:{}'.format(rows_per_chid_train, rows_per_chid_test))
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50000.0), HTML(value='')))


train:9, test:2
(450000, 31) (450000, 1) (100000, 31) (100000, 1)


In [16]:
# Peek at the first test targets.
y_test.head()

Unnamed: 0,objam
10,10.621522
11,12.918493
22,0.0
23,0.0
34,13.009936


## Model

In [17]:
import copy
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

In [19]:
def feature_index(x, feature_cols):
    """Return ``{column name: position in x.columns}`` for every name in ``feature_cols``.

    Raises ValueError when a requested name is not a column of ``x``.
    """
    column_order = list(x.columns)
    return {name: column_order.index(name) for name in feature_cols}

def Linear_block(in_dim, out_dim):
    """Return a Sequential holding Linear(in_dim, out_dim) followed by ReLU."""
    layers = [torch.nn.Linear(in_dim, out_dim), torch.nn.ReLU()]
    return torch.nn.Sequential(*layers)

class MLP(torch.nn.Module):
    """Feed-forward network over embedded categorical and raw numeric features.

    Args:
        category_cols: names of the categorical columns; one embedding table
            is created per name.
        category_dims: number of rows of each table, aligned with
            ``category_cols``.
        ori_dim: width of the concatenated input fed to the first layer.
        layer_dims: output width of each successive Linear+ReLU block.
        embedding_dim: embedding width of every categorical column except
            ``'chid'``.
        chid_embedding_dim: embedding width of the ``'chid'`` column. Defaults
            to 512, the value that used to be hard-coded.
    """

    def __init__(self, category_cols, category_dims, ori_dim, layer_dims, embedding_dim,
                 chid_embedding_dim=512):
        super(MLP, self).__init__()
        self.out_dims = [ori_dim, *layer_dims]
        # NOTE(review): every block, including the last, ends in ReLU, so the
        # network output is never negative -- confirm this is intended.
        linear_blocks = [Linear_block(in_dim, out_dim)
                         for in_dim, out_dim in zip(self.out_dims, self.out_dims[1:])]
        self.model = torch.nn.Sequential(*linear_blocks)
        self.embedding_dict = torch.nn.ModuleDict({
            category_col: torch.nn.Embedding(
                category_dim,
                chid_embedding_dim if category_col == 'chid' else embedding_dim)
            for category_col, category_dim in zip(category_cols, category_dims)})

    def forward(self, x, category_cols, category_dict, numeric_dict):
        """Embed the categorical columns, append the numeric ones, run the MLP.

        Args:
            x: 2-D tensor indexed as ``x[:, column]``.
            category_cols: unused; kept so existing call sites keep working.
            category_dict: ``{categorical column name: column position in x}``;
                its iteration order fixes the order of the embeddings.
            numeric_dict: ``{numeric column name: column position in x}``.

        Returns:
            Output of the final block, one row per row of ``x``.
        """
        category_embeddings = [self.embedding_dict[name](x[:, position].long())
                               for name, position in category_dict.items()]
        category_embeddings = torch.cat(category_embeddings, -1)

        # Build the index directly as an integer tensor on x's device instead
        # of going through a float CPU tensor.
        numeric_idx = torch.tensor(list(numeric_dict.values()),
                                   dtype=torch.long, device=x.device)

        x = torch.cat([category_embeddings, x[:, numeric_idx]], -1)

        x = self.model(x)

        return x

In [22]:
# Same report as before: the first two groups are followed by an extra '\n'
# argument, the last one is not.
for cols, trailer in ((ignore_cols, ('\n',)), (category_cols, ('\n',)), (numeric_cols, ())):
    print(len(cols), cols, *trailer)

1 ['data_dt'] 

7 ['chid', 'masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg'] 

24 ['monin', 'wrky', 'first_mob', 'cycam', 'slam', 'sum_area_c', 'sum_u2_ind', 'sum_u3_ind', 'sum_u4_ind', 'sum_l2_ind', 'sum_l3_ind', 'sum_l4_ind', 'constant_area_c', 'constant_u2_ind', 'constant_u3_ind', 'constant_u4_ind', 'constant_l2_ind', 'constant_l3_ind', 'constant_l4_ind', 'constant_change', 'growth_rate', 'monotone_up', 'monotone_down', 'objam']


In [23]:
# Rebind category_cols to a shallow copy containing every element.
# NOTE(review): presumably a hook for trying a subset of columns -- confirm.
category_cols = category_cols[:]
# Self-assignment; numeric_cols is left unchanged.
numeric_cols = numeric_cols

In [32]:
# Number of distinct values in each categorical column; used as the row count
# of that column's embedding table.
category_dims = [df_full_y_sum[feat].nunique() for feat in category_cols]

# Column name -> column position in x_train, per feature group.
category_dict = feature_index(x_train, category_cols)
numeric_dict = feature_index(x_train, numeric_cols)
embedding_size = 64

# Output width of each successive layer; the final width is 1.
layer_dims = [256, 128, 1]
# All categorical columns but one contribute embedding_size inputs each; the
# remaining one is the 512-wide chid embedding that MLP builds. Numeric
# columns contribute one input each.
input_dim = (len(category_dict)-1)*embedding_size + len(numeric_dict) + 512

epochs = 400
batch_size = 2048
learning_rate = 0.001

In [33]:
# Wrap the train frames as tensors; batches are reshuffled every epoch.
x_train_tensor = torch.from_numpy(x_train.to_numpy())
y_train_tensor = torch.from_numpy(y_train.to_numpy())
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# The test loader keeps the frame's row order.
x_test_tensor = torch.from_numpy(x_test.to_numpy())
y_test_tensor = torch.from_numpy(y_test.to_numpy())
test_dataset = TensorDataset(x_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [34]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = MLP(category_cols, category_dims, input_dim,layer_dims, embedding_size).to(device)
# Mean squared error between the model output and the target.
criterion = torch.nn.MSELoss()
# Last expression: display the module structure.
model

MLP(
  (model): Sequential(
    (0): Sequential(
      (0): Linear(in_features=920, out_features=256, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=256, out_features=128, bias=True)
      (1): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=128, out_features=1, bias=True)
      (1): ReLU()
    )
  )
  (embedding_dict): ModuleDict(
    (chid): Embedding(50000, 512)
    (masts): Embedding(4, 64)
    (educd): Embedding(7, 64)
    (naty): Embedding(3, 64)
    (trdtp): Embedding(28, 64)
    (poscd): Embedding(10, 64)
    (cuorg): Embedding(31, 64)
  )
)

In [35]:
USE_NODE2VEC = True
if USE_NODE2VEC:
    chid_embedding = model.embedding_dict['chid']
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
    node2vec_emb = np.load('../Embedding/GCNEncoder_0126.npy', allow_pickle=True)
    # Keep as many leading rows as the chid table has, instead of a literal
    # row count that must be kept in sync by hand.
    node2vec_emb = torch.from_numpy(node2vec_emb[:chid_embedding.num_embeddings])
    # Fail with a readable message rather than relying on copy_ to reject (or
    # silently broadcast) a mismatched array.
    if tuple(node2vec_emb.shape) != tuple(chid_embedding.weight.shape):
        raise ValueError('pretrained chid embedding has shape {}, expected {}'.format(
            tuple(node2vec_emb.shape), tuple(chid_embedding.weight.shape)))
    chid_embedding.weight.data.copy_(node2vec_emb)

In [36]:
CHID_FINETUNING = True

# When fine-tuning is off, the chid embedding weight is simply left out of the
# optimizer's parameter group; every other parameter is always optimised.
excluded_names = set() if CHID_FINETUNING else {'embedding_dict.chid.weight'}

param_optimizer = list(model.named_parameters())
trainable = [p for n, p in param_optimizer if n not in excluded_names]
optimizer_parameters = [{"params": trainable}]

optimizer = torch.optim.Adam(optimizer_parameters, lr=learning_rate)

In [40]:
early_stop = 20

best_loss = 1e10
early_cnt = 0
RMSE = []
# Start from the initial weights so the final load_state_dict always has a
# snapshot to restore, even if no epoch ever improves (e.g. a NaN test RMSE).
best_model_params = copy.deepcopy(model.state_dict())

for epoch in tqdm(range(epochs)):

    train_loss = 0
    test_loss = 0
    # Collect per-batch arrays and concatenate once per epoch; growing an
    # array with np.concatenate on every batch re-copies it each time.
    train_chunks, train_y_chunks = [], []
    test_chunks, test_y_chunks = [], []

    model.train()
    for x, y in train_loader:
        x, y = x.float().to(device), y.float().to(device)
        optimizer.zero_grad()

        output = model(x, category_cols, category_dict, numeric_dict)

        loss = criterion(output, y)
        train_loss += loss.item()
        train_chunks.append(output.detach().cpu().numpy().reshape(-1))
        train_y_chunks.append(y.detach().cpu().numpy().reshape(-1))

        loss.backward()
        optimizer.step()

    model.eval()
    # No gradients are needed for evaluation; without no_grad every test batch
    # builds an autograd graph for nothing.
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.float().to(device), y.float().to(device)
            output = model(x, category_cols, category_dict, numeric_dict)
            loss = criterion(output, y)
            test_loss += loss.item()
            test_chunks.append(output.cpu().numpy().reshape(-1))
            test_y_chunks.append(y.cpu().numpy().reshape(-1))

    # float64 keeps the exponentiation below in double precision.
    train_output = np.concatenate(train_chunks).astype(np.float64)
    train_y = np.concatenate(train_y_chunks).astype(np.float64)
    test_output = np.concatenate(test_chunks).astype(np.float64)
    test_y = np.concatenate(test_y_chunks).astype(np.float64)

    # Outputs and targets are exponentiated before the metrics are computed.
    # RMSE is sqrt(MSE) because the `squared=False` keyword was removed in
    # scikit-learn 1.6; arguments follow sklearn's (y_true, y_pred) order.
    train_output, train_y = np.exp(train_output), np.exp(train_y)
    train_RMSE = np.sqrt(mean_squared_error(train_y, train_output))
    train_mean = mean_absolute_error(train_y, train_output)
    train_median = median_absolute_error(train_y, train_output)

    test_output, test_y = np.exp(test_output), np.exp(test_y)
    test_RMSE = np.sqrt(mean_squared_error(test_y, test_output))
    test_mean = mean_absolute_error(test_y, test_output)
    test_median = median_absolute_error(test_y, test_output)

    print(f'epoch:{epoch}\ntrain loss:{train_RMSE:.0f},test loss:{test_RMSE:.0f}\ntrain MAE(mean):{train_mean:.0f},test MAE(mean):{test_mean:.0f}\ntrain MAE(median):{train_median:.0f}, test MAE(median):{test_median:.0f}')

    # Keep the weights with the lowest test RMSE; stop after `early_stop`
    # consecutive epochs without improvement.
    if test_RMSE <= best_loss:
        best_model_params = copy.deepcopy(model.state_dict())
        best_loss = test_RMSE
        print('\tBetter!')
        early_cnt = 0
    else:
        early_cnt += 1

    if early_cnt >= early_stop:
        break

model.load_state_dict(best_model_params)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=400.0), HTML(value='')))

epoch:0
train loss:379507,test loss:404373
train MAE(mean):50198,test MAE(mean):59169
train MAE(median):9008, test MAE(median):10842
	Better!
epoch:1
train loss:376817,test loss:404946
train MAE(mean):49868,test MAE(mean):57099
train MAE(median):8936, test MAE(median):9990
epoch:2
train loss:377736,test loss:403823
train MAE(mean):49108,test MAE(mean):58054
train MAE(median):8759, test MAE(median):11065
	Better!
epoch:3
train loss:376305,test loss:404406
train MAE(mean):49073,test MAE(mean):57342
train MAE(median):8756, test MAE(median):10518
epoch:4
train loss:373926,test loss:408343
train MAE(mean):48782,test MAE(mean):57729
train MAE(median):8696, test MAE(median):9487
epoch:5
train loss:375683,test loss:404680
train MAE(mean):48964,test MAE(mean):58152
train MAE(median):8690, test MAE(median):10961
epoch:6
train loss:375376,test loss:405352
train MAE(mean):48900,test MAE(mean):57727
train MAE(median):8654, test MAE(median):10501
epoch:7
train loss:376012,test loss:406782
train MAE(

<All keys matched successfully>

In [38]:
def collect_predictions(loader):
    """Run the model over ``loader`` and return ``(exp(outputs), exp(targets))``.

    Both results are flat float64 arrays in the order the loader yields
    batches. Gradients are disabled because this is inference only.
    """
    outputs, targets = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.float().to(device), y.float().to(device)
            output = model(x, category_cols, category_dict, numeric_dict)
            outputs.append(output.cpu().numpy().reshape(-1))
            targets.append(y.cpu().numpy().reshape(-1))
    if not outputs:
        return np.array([]), np.array([])
    flat_outputs = np.concatenate(outputs).astype(np.float64)
    flat_targets = np.concatenate(targets).astype(np.float64)
    return np.exp(flat_outputs), np.exp(flat_targets)


model.eval()

# One shared helper for both splits; the old per-split loops also called
# optimizer.zero_grad(), which has no purpose during evaluation.
train_output, train_y = collect_predictions(train_loader)
test_output, test_y = collect_predictions(test_loader)

In [39]:
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
for split_name, y_true, y_pred in (('train', train_y, train_output),
                                   ('test', test_y, test_output)):
    print('{}\tRMSE: {:.0f} MAE(mean): {:.0f} MAE(median): {:.0f}'.format(
        split_name,
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        median_absolute_error(y_true, y_pred)
    ))

train	RMSE: 373885 MAE(mean): 47672 MAE(median): 8509
test	RMSE: 403241 MAE(mean): 57119 MAE(median): 10399


In [None]:
# One row per test sample: its chid, the exponentiated target and prediction.
df_out = x_test[['chid']].assign(true=test_y, pred=test_output)

In [None]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('result', exist_ok=True)
df_out.to_csv('result/mlp_output.csv', index=False, encoding='utf-8')