In [9]:
import os
import json
import copy
import numpy as np
import pandas as pd

from time import time
from tqdm import tqdm, trange

import torch
from torch_geometric.nn import GCNConv
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

from Model import GCNEncoder, MLP_with_pretrain, Whole_model_GCN
from utils import column_idx, make_edges_symmetry

In [13]:
# Pick the compute device. The second GPU ('cuda:1') is preferred, but that
# index only exists when at least two GPUs are visible, so fall back to the
# first GPU and finally to the CPU instead of failing at the first `.to(device)`.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Transaction column whose values are turned into shop nodes of the graph.
shop_col = 'stonc_6_label'
#shop_col = 'mcc'
#shop_col = 'stonc_label'
#shop_col = 'stonc_10_label'

embedding_size = 64

epochs = 400
early_stop = 20  # consecutive epochs without improvement before training stops
batch_size = 2048
learning_rate = 0.001

pretrain_weights = './weights/GCNencoder_weights_stonc6'
result_path = 'result/GCN_stonc6_weights.csv'

In [14]:
# Directory holding the sampled graph inputs.
sample_data_path = './data/sample'

# File names of the id map, the transaction table and the customer table.
chid_dict_file_name, cdtx_file_name, cust_file_name = (
    'sample_50k_idx_map.npy',
    'sample_50k_cdtx.csv',
    'sample_50k_cust.csv',
)

# Full paths, built in the same order as the file names above.
sample_chid_dict, sample_cdtx_file, sample_cust_file = (
    os.path.join(sample_data_path, file_name)
    for file_name in (chid_dict_file_name, cdtx_file_name, cust_file_name)
)

In [15]:
# Directory holding the downstream regression splits.
downstream_data_path = './data/downstream'

# File names of the feature (x) and target (y) tables for each split.
x_train_file_name, x_test_file_name = 'x_train.csv', 'x_test.csv'
y_train_file_name, y_test_file_name = 'y_train.csv', 'y_test.csv'

# Full paths, built in the same order as the file names above.
x_train_file, x_test_file, y_train_file, y_test_file = (
    os.path.join(downstream_data_path, file_name)
    for file_name in (x_train_file_name, x_test_file_name,
                      y_train_file_name, y_test_file_name)
)

In [16]:
# Transaction table, sorted by `csmdt`. `sort_values` returns a new frame, so
# its result must be kept; calling it as a bare statement leaves the data unsorted.
df_cdtx = pd.read_csv(sample_cdtx_file).sort_values('csmdt')

# Customer table with exact duplicate rows removed and the index renumbered.
df_cust = pd.read_csv(sample_cust_file).drop_duplicates(ignore_index=True)

# Id -> node-index mapping stored as a pickled object inside the .npy file
# (presumably a dict; it is indexed by key and passed to `Series.map` later).
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only use it on files from a trusted source.
idx_map = np.load(sample_chid_dict, allow_pickle=True).tolist()

In [17]:
# Number of distinct values in each column of the transaction table.
df_cdtx.nunique()

bnsfg                  2
bnspt                338
chid               50000
csmdt                761
iterm                 15
mcc                  507
objam              52132
scity              11073
tcode                  7
hcefg                 10
ecfg                   2
etymd                 15
stonc_tag             49
stonc_label       202387
stonm_label       212342
stonc_6_label      78560
stonc_10_label    128075
dtype: int64

In [18]:
# Give every distinct shop id (in sorted order) the next free node index,
# continuing right after the indices already present in `idx_map`.
l = len(idx_map)
shop_ids = sorted(df_cdtx[shop_col].unique())
for node_index, shop_id in enumerate(shop_ids, start=l):
    idx_map[shop_id] = node_index

In [19]:
# Replace raw customer and shop identifiers with their node indices.
df_cdtx['chid'] = df_cdtx['chid'].map(idx_map)
df_cdtx[shop_col] = df_cdtx[shop_col].map(idx_map)

df_cust['chid'] = df_cust['chid'].map(idx_map)

In [20]:
def _to_month_start(stamp):
    """Keep the first 8 characters of `stamp` and append '01'."""
    return stamp[:8] + '01'


def _first_ten_chars(stamp):
    """Keep only the first 10 characters of `stamp`."""
    return stamp[:10]


# Collapse every transaction date onto the first day of its month and make
# the amounts integers.
df_cdtx['csmdt'] = df_cdtx['csmdt'].apply(_to_month_start)
df_cdtx['objam'] = df_cdtx['objam'].apply(int)

# Trim the customer snapshot timestamp to its leading 10 characters.
df_cust['data_dt'] = df_cust['data_dt'].apply(_first_ten_chars)

In [21]:
# Columns that are identifiers / dates and never used as features.
ignore_cols = ['chid', 'data_dt']
# Columns that are label-encoded and embedded.
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']
# Every remaining customer column, alphabetically, followed by 'objam'
# (added to the customer table in a later cell).
numeric_cols = sorted(set(df_cust.columns).difference(category_cols, ignore_cols))
numeric_cols = numeric_cols + ['objam']

In [22]:
def _level_codes(series):
    """Map each distinct value of `series` to its position in sorted order."""
    return {value: code for code, value in enumerate(sorted(series.unique()))}


# One value -> integer-code dictionary per categorical column.
mapper = {col: _level_codes(df_cust[col]) for col in category_cols}

# Replace the categorical values by their integer codes.
df_cust[category_cols] = df_cust[category_cols].apply(
    lambda series: series.map(mapper[series.name])
)

In [23]:
# Keep transactions dated before 2019 and the customer snapshot of 2018-12-01,
# with customers ordered by node index.
df_cdtx = df_cdtx[df_cdtx.csmdt < '2019-01-01']
df_cust = df_cust[df_cust.data_dt == '2018-12-01'].sort_values(by=['chid'])

# Total amount per customer divided by 12 (presumably a monthly average over a
# 12-month window -- TODO confirm). Only `objam` is aggregated: summing the
# whole frame would also try to "sum" the string date column.
monthly_objam = df_cdtx.groupby('chid')['objam'].sum() / 12

# Align on `chid` instead of relying on row position, so a customer without
# any transaction in the window gets 0 rather than shifting (or breaking) the
# assignment for everyone else.
aligned_objam = df_cust['chid'].map(monthly_objam).fillna(0).to_numpy()

# Masked log: entries <= 0 have no logarithm, are masked, and become 0.
df_cust['objam'] = np.ma.log(aligned_objam).filled(0)

In [24]:
# Rescale every numeric column with a min-max scaler fitted on those same
# columns; the fitted scaler stays available as `x_scaler`.
x_scaler = MinMaxScaler()
x_scaler.fit(df_cust[numeric_cols])
df_cust[numeric_cols] = x_scaler.transform(df_cust[numeric_cols])

In [25]:
# Customer feature table: categorical codes first, numeric columns after.
df_cust_ = df_cust[category_cols + numeric_cols]

# Node feature matrix: customer rows on top, all-zero rows for the remaining
# entries of `idx_map` (the shop nodes) underneath.
# NOTE(review): this assumes `df_cust_` has exactly one row per customer,
# ordered by node index -- confirm against the data.
cust_feature = torch.Tensor(df_cust_.to_numpy())
n_customers, n_features = cust_feature.shape
shop_feature = torch.zeros(len(idx_map) - n_customers, n_features)
x_feature = torch.cat([cust_feature, shop_feature])

In [26]:
# Distinct (customer, shop) pairs as a 2 x E integer array.
edge_pairs = (
    df_cdtx[['chid', shop_col]]
    .drop_duplicates(ignore_index=True)
    .to_numpy()
    .T
)

# Pass the pairs through the project helper `make_edges_symmetry`
# (presumably adds the reverse direction of each edge -- verify in utils)
# and convert the result to a LongTensor.
edge_pairs = torch.LongTensor(make_edges_symmetry(edge_pairs))

In [27]:
# Downstream features (x) and targets (y) for the train and test splits.
x_train, x_test = pd.read_csv(x_train_file), pd.read_csv(x_test_file)
y_train, y_test = pd.read_csv(y_train_file), pd.read_csv(y_test_file)

In [28]:
# Column groups for the downstream table (these rebind the names used for the
# graph features above).
ignore_cols = ['chid']
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Everything that is neither categorical nor ignored, in alphabetical order.
numeric_cols = sorted(set(x_train.columns).difference(category_cols, ignore_cols))

In [29]:
# Number of distinct codes per categorical column.
category_dims = {col_name: len(mapper[col_name]) for col_name in mapper}

# Column lookups produced by the project helper `column_idx`
# (presumably column name -> position in `df_cust_`; verify in utils).
category_dict = column_idx(df_cust_, category_cols)
numeric_dict = column_idx(df_cust_, numeric_cols)

# One embedding per categorical column, one more block of `embedding_size`
# (presumably the pretrained node embedding -- TODO confirm), plus one input
# per numeric column.
n_category, n_numeric = len(category_dict), len(numeric_dict)
input_dim = (n_category + 1) * embedding_size + n_numeric

layer_dims = [input_dim, 256, 128, 1]

In [30]:
def _frames_to_dataset(features, targets):
    """Wrap a feature frame and a target frame in a TensorDataset."""
    return TensorDataset(torch.from_numpy(features.to_numpy()),
                         torch.from_numpy(targets.to_numpy()))


train_dataset = _frames_to_dataset(x_train, y_train)
test_dataset = _frames_to_dataset(x_test, y_test)

# Training batches are reshuffled every epoch; test batches keep file order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [31]:
# Encoder whose input width is the downstream input width minus one embedding block.
pre_train_model = GCNEncoder(input_dim - embedding_size, embedding_size, category_dims)

# Load the pretrained weights onto the CPU first. Without `map_location` the
# tensors are restored to the device they were saved from, which fails on a
# host that lacks that device. The combined model is moved to `device` later.
pre_train_model.load_state_dict(torch.load(pretrain_weights, map_location='cpu'))

# Leave the encoder in training mode.
pre_train_model.train()

# Regression head that is combined with the encoder in the next cell.
down_stream_model = MLP_with_pretrain(category_dims, layer_dims, embedding_size)

In [32]:
# Combine encoder and regression head, then move the model to the target device.
model = Whole_model_GCN(pre_train_model, down_stream_model)
model = model.to(device)

# The node features and the edge list are passed to every forward call, so
# they are placed on the same device once.
x_feature, edge_index = x_feature.to(device), edge_pairs.to(device)

# Mean-squared-error loss, optimised with Adam over all model parameters.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(model)

Whole_model_GCN(
  (pre_train_model): GCNEncoder(
    (conv1): GCNConv(408, 128)
    (conv2): GCNConv(128, 64)
    (embedding_dict): ModuleDict(
      (masts): Embedding(3, 64)
      (educd): Embedding(6, 64)
      (naty): Embedding(2, 64)
      (trdtp): Embedding(27, 64)
      (poscd): Embedding(9, 64)
      (cuorg): Embedding(30, 64)
    )
  )
  (down_stream_model): MLP_with_pretrain(
    (model): Sequential(
      (0): Sequential(
        (0): Linear(in_features=472, out_features=256, bias=True)
        (1): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=256, out_features=128, bias=True)
        (1): ReLU()
      )
      (2): Sequential(
        (0): Linear(in_features=128, out_features=1, bias=True)
        (1): ReLU()
      )
    )
    (embedding_dict): ModuleDict(
      (masts): Embedding(3, 64)
      (educd): Embedding(6, 64)
      (naty): Embedding(2, 64)
      (trdtp): Embedding(27, 64)
      (poscd): Embedding(9, 64)
      (cuorg): Embedding(30, 64)
   

In [33]:
# Train with early stopping on the exponentiated-scale test RMSE.
# NOTE(review): the test split drives both model selection and early stopping,
# so the reported test metrics are optimistic; a separate validation split
# would avoid that.
best_loss = 1e10
early_cnt = 0
RMSE = []  # never filled in this cell; kept in case other cells use the name

# Start from the current weights so `load_state_dict` below always has
# something to restore, even if no epoch ever beats `best_loss`.
best_model_params = copy.deepcopy(model.state_dict())

for epoch in range(epochs):

    # Per-batch outputs are gathered in lists and concatenated once per epoch;
    # concatenating inside the batch loop copies the whole history every step.
    train_pred_chunks, train_true_chunks = [], []
    test_pred_chunks, test_true_chunks = [], []

    model.train()
    for x, y in tqdm(train_loader):
        x, y = x.float().to(device), y.float().to(device)
        optimizer.zero_grad()

        output = model(x, x_feature, edge_index, category_dict, numeric_dict)
        loss = criterion(output, y)

        loss.backward()
        optimizer.step()

        train_pred_chunks.append(output.detach().cpu().numpy().reshape(-1))
        train_true_chunks.append(y.cpu().numpy().reshape(-1))

    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.float().to(device), y.float().to(device)
            output = model(x, x_feature, edge_index, category_dict, numeric_dict)
            test_pred_chunks.append(output.cpu().numpy().reshape(-1))
            test_true_chunks.append(y.cpu().numpy().reshape(-1))

    # Cast to float64 before exponentiating so large values do not overflow
    # float32, then undo the log transform so metrics are in exponentiated units.
    train_output = np.exp(np.concatenate(train_pred_chunks).astype(np.float64))
    train_y = np.exp(np.concatenate(train_true_chunks).astype(np.float64))
    test_output = np.exp(np.concatenate(test_pred_chunks).astype(np.float64))
    test_y = np.exp(np.concatenate(test_true_chunks).astype(np.float64))

    # RMSE is computed as sqrt(MSE) because the `squared=False` argument is not
    # available in recent scikit-learn releases. Arguments are (y_true, y_pred).
    train_RMSE = np.sqrt(mean_squared_error(train_y, train_output))
    train_mean = mean_absolute_error(train_y, train_output)
    train_median = median_absolute_error(train_y, train_output)

    test_RMSE = np.sqrt(mean_squared_error(test_y, test_output))
    test_mean = mean_absolute_error(test_y, test_output)
    test_median = median_absolute_error(test_y, test_output)

    print(f'epoch:{epoch}\ntrain loss:{train_RMSE:.0f},test loss:{test_RMSE:.0f}\ntrain MAE(mean):{train_mean:.0f},test MAE(mean):{test_mean:.0f}\ntrain MAE(median):{train_median:.0f}, test MAE(median):{test_median:.0f}')

    if test_RMSE <= best_loss:
        # Snapshot the weights of the best epoch so far.
        best_model_params = copy.deepcopy(model.state_dict())
        best_loss = test_RMSE
        print('\tBetter!')
        early_cnt = 0
    else:
        early_cnt += 1

    if early_cnt >= early_stop:
        break

# Restore the best weights seen during training.
model.load_state_dict(best_model_params)

100%|██████████| 245/245 [00:15<00:00, 16.26it/s]
  1%|          | 2/245 [00:00<00:17, 13.82it/s]

epoch:0
train loss:1017793,test loss:616035
train MAE(mean):75510,test MAE(mean):80998
train MAE(median):11454, test MAE(median):11200
	Better!


100%|██████████| 245/245 [00:15<00:00, 15.79it/s]
  0%|          | 1/245 [00:00<00:27,  8.91it/s]

epoch:1
train loss:1021583,test loss:1358217
train MAE(mean):71577,test MAE(mean):80649
train MAE(median):10523, test MAE(median):10927


100%|██████████| 245/245 [00:15<00:00, 16.13it/s]
  0%|          | 1/245 [00:00<00:24,  9.97it/s]

epoch:2
train loss:2406774,test loss:4631634
train MAE(mean):75209,test MAE(mean):96482
train MAE(median):10149, test MAE(median):10117


100%|██████████| 245/245 [00:14<00:00, 16.34it/s]
  0%|          | 1/245 [00:00<00:25,  9.60it/s]

epoch:3
train loss:3995127,test loss:4842196
train MAE(mean):79138,test MAE(mean):99956
train MAE(median):10065, test MAE(median):10643


100%|██████████| 245/245 [00:14<00:00, 16.81it/s]
  1%|          | 2/245 [00:00<00:17, 13.74it/s]

epoch:4
train loss:24559718,test loss:24327715
train MAE(mean):124776,test MAE(mean):179902
train MAE(median):9976, test MAE(median):10344


100%|██████████| 245/245 [00:14<00:00, 16.88it/s]
  1%|          | 2/245 [00:00<00:17, 13.92it/s]

epoch:5
train loss:23266563,test loss:8699180
train MAE(mean):126290,test MAE(mean):117443
train MAE(median):9991, test MAE(median):10771


100%|██████████| 245/245 [00:14<00:00, 16.73it/s]
  0%|          | 1/245 [00:00<00:25,  9.50it/s]

epoch:6
train loss:33717401,test loss:20814668
train MAE(mean):148550,test MAE(mean):168194
train MAE(median):9966, test MAE(median):10192


100%|██████████| 245/245 [00:14<00:00, 16.79it/s]
  1%|          | 2/245 [00:00<00:17, 13.82it/s]

epoch:7
train loss:66859320,test loss:52134067
train MAE(mean):235685,test MAE(mean):298812
train MAE(median):9970, test MAE(median):10329


100%|██████████| 245/245 [00:14<00:00, 16.79it/s]
  1%|          | 2/245 [00:00<00:17, 14.04it/s]

epoch:8
train loss:44307670,test loss:77739185
train MAE(mean):169853,test MAE(mean):408301
train MAE(median):9937, test MAE(median):10520


100%|██████████| 245/245 [00:14<00:00, 16.60it/s]
  0%|          | 1/245 [00:00<00:25,  9.71it/s]

epoch:9
train loss:73019757,test loss:58584631
train MAE(mean):237471,test MAE(mean):325395
train MAE(median):9930, test MAE(median):10165


100%|██████████| 245/245 [00:14<00:00, 16.98it/s]
  1%|          | 2/245 [00:00<00:16, 14.37it/s]

epoch:10
train loss:76516581,test loss:87303010
train MAE(mean):248489,test MAE(mean):451510
train MAE(median):9964, test MAE(median):10135


100%|██████████| 245/245 [00:14<00:00, 16.87it/s]
  0%|          | 1/245 [00:00<00:24,  9.79it/s]

epoch:11
train loss:46807847,test loss:56822935
train MAE(mean):188714,test MAE(mean):319272
train MAE(median):9901, test MAE(median):10097


100%|██████████| 245/245 [00:14<00:00, 16.75it/s]
  0%|          | 1/245 [00:00<00:25,  9.41it/s]

epoch:12
train loss:46955675,test loss:43446163
train MAE(mean):185497,test MAE(mean):263887
train MAE(median):9892, test MAE(median):10090


100%|██████████| 245/245 [00:14<00:00, 16.62it/s]
  1%|          | 2/245 [00:00<00:17, 13.90it/s]

epoch:13
train loss:68182707,test loss:35844111
train MAE(mean):235931,test MAE(mean):230083
train MAE(median):9939, test MAE(median):10355


100%|██████████| 245/245 [00:14<00:00, 17.06it/s]
  1%|          | 2/245 [00:00<00:17, 13.92it/s]

epoch:14
train loss:67521476,test loss:27229317
train MAE(mean):221207,test MAE(mean):192777
train MAE(median):9961, test MAE(median):10171


100%|██████████| 245/245 [00:14<00:00, 16.87it/s]
  1%|          | 2/245 [00:00<00:17, 14.01it/s]

epoch:15
train loss:41913663,test loss:56995900
train MAE(mean):175346,test MAE(mean):317241
train MAE(median):9882, test MAE(median):10395


100%|██████████| 245/245 [00:14<00:00, 16.77it/s]
  1%|          | 2/245 [00:00<00:17, 14.28it/s]

epoch:16
train loss:75219734,test loss:40381708
train MAE(mean):205715,test MAE(mean):248452
train MAE(median):9895, test MAE(median):10172


100%|██████████| 245/245 [00:14<00:00, 16.78it/s]
  1%|          | 2/245 [00:00<00:17, 14.00it/s]

epoch:17
train loss:34389088,test loss:33626999
train MAE(mean):155264,test MAE(mean):218943
train MAE(median):9919, test MAE(median):10198


100%|██████████| 245/245 [00:14<00:00, 16.84it/s]
  0%|          | 0/245 [00:00<?, ?it/s]

epoch:18
train loss:42420085,test loss:59706202
train MAE(mean):177036,test MAE(mean):324838
train MAE(median):9877, test MAE(median):10169


100%|██████████| 245/245 [00:14<00:00, 16.80it/s]
  1%|          | 2/245 [00:00<00:17, 14.02it/s]

epoch:19
train loss:39352486,test loss:33829693
train MAE(mean):160491,test MAE(mean):218266
train MAE(median):9882, test MAE(median):10066


100%|██████████| 245/245 [00:14<00:00, 16.71it/s]


epoch:20
train loss:28540343,test loss:42676495
train MAE(mean):138068,test MAE(mean):254210
train MAE(median):9934, test MAE(median):10215


<All keys matched successfully>

In [28]:
def _collect_outputs(loader):
    """Run `model` over `loader` without gradient tracking.

    Returns two flat float64 arrays (model outputs, targets), both still on
    the scale the model was trained on. Empty arrays are returned when the
    loader yields no batches.
    """
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.float().to(device), y.float().to(device)
            output = model(x, x_feature, edge_index, category_dict, numeric_dict)
            pred_chunks.append(output.cpu().numpy().reshape(-1))
            true_chunks.append(y.cpu().numpy().reshape(-1))
    if not pred_chunks:
        return np.array([]), np.array([])
    # float64 keeps the later exponentiation from overflowing float32.
    return (np.concatenate(pred_chunks).astype(np.float64),
            np.concatenate(true_chunks).astype(np.float64))


model.eval()

# Progress bar on the training split only.
train_output, train_y = _collect_outputs(tqdm(train_loader))
test_output, test_y = _collect_outputs(test_loader)

# Undo the log transform so outputs and targets are in exponentiated units.
train_output, train_y = np.exp(train_output), np.exp(train_y)
test_output, test_y = np.exp(test_output), np.exp(test_y)

100%|██████████| 245/245 [00:09<00:00, 26.48it/s]


In [29]:
def _report_metrics(split_name, y_true, y_pred):
    """Print RMSE, mean absolute error and median absolute error for one split."""
    # sqrt(MSE) instead of `squared=False`, which recent scikit-learn no longer accepts.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # The last two values are absolute errors, so they are labelled MAE.
    print('{}\tRMSE: {:.2f} MAE(mean): {:.2f} MAE(median): {:.2f}'.format(
        split_name,
        rmse,
        mean_absolute_error(y_true, y_pred),
        median_absolute_error(y_true, y_pred)
    ))


_report_metrics('train', train_y, train_output)
_report_metrics('test', test_y, test_output)

train	RMSE: 1015456.50 MSE(mean): 73420.35 MSE(median): 10492.29
test	RMSE: 615057.77 MSE(mean): 80873.78 MSE(median): 10972.66


In [30]:
# One row per test sample: customer id, true value and predicted value.
# The test loader is not shuffled, so the arrays line up with `x_test` rows.
df_out = x_test[['chid']].assign(true=test_y, pred=test_output)

In [31]:
# Display the per-sample true and predicted values.
df_out

Unnamed: 0,chid,true,pred
0,0,11755.999087,7.328887
1,0,35519.989944,51.180935
2,1,150494.007197,19748.708809
3,1,3671.000531,214489.741070
4,2,62179.981812,6916.797713
...,...,...,...
99995,49997,235115.953642,15049.737520
99996,49998,1.000000,1233.773528
99997,49998,1.000000,433.081405
99998,49999,2747.000494,3615.231416


In [32]:
# `to_csv` does not create missing folders, so make sure the target directory
# exists before writing the predictions.
result_dir = os.path.dirname(result_path)
if result_dir:
    os.makedirs(result_dir, exist_ok=True)
df_out.to_csv(result_path, index=False, encoding='utf-8')