In [1]:
import os
import json
import copy
import numpy as np
import pandas as pd

from time import time
from tqdm import tqdm, trange

import torch
from torch_geometric.nn import Node2Vec
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error

from Model import MLP_with_pretrain, Whole_model_Node2Vec
from utils import column_idx, make_edges_symmetry



In [2]:
# ---- Run configuration -------------------------------------------------------

# Device selection: keep the original preference for the second GPU ('cuda:1'),
# but only when it actually exists. torch.cuda.is_available() is also True on a
# single-GPU machine, where 'cuda:1' would be an invalid device.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Seed the RNGs so shuffling and weight initialisation are repeatable across
# Restart-and-Run-All executions.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Column of the transaction table used as the "shop" node id.
# Alternatives previously tried: 'mcc', 'stonc_label', 'stonc_10_label'.
# NOTE(review): pretrain_weights / result_path below are named after the
# stonc6 variant; they presumably need to change together with shop_col.
shop_col = 'stonc_6_label'

# Width of the Node2Vec embedding and of every categorical embedding.
embedding_size = 64

# Optimisation settings.
epochs = 400          # maximum number of passes over the training set
early_stop = 20       # stop after this many epochs without improvement
batch_size = 2048
learning_rate = 0.001

# Input checkpoint and output CSV locations.
pretrain_weights = './weights/node2vec_weights_stonc6'
result_path = './result/Node2Vec_stonc6.csv'

In [3]:
# Location of the sampled transaction data and its id -> index map.
sample_data_path = './data/sample'
chid_dict_file_name, cdtx_file_name = 'sample_50k_idx_map.npy', 'sample_50k_cdtx.csv'

# Full paths, built in the same order as the file names above.
sample_chid_dict, sample_cdtx_file = (
    os.path.join(sample_data_path, file_name)
    for file_name in (chid_dict_file_name, cdtx_file_name)
)

In [4]:
# Directory holding the pre-split feature / target CSVs of the downstream task.
downstream_data_path = './data/downstream'

x_train_file_name, x_test_file_name = 'x_train.csv', 'x_test.csv'
y_train_file_name, y_test_file_name = 'y_train.csv', 'y_test.csv'

# Join every file name onto the downstream directory in one pass.
x_train_file, x_test_file, y_train_file, y_test_file = (
    os.path.join(downstream_data_path, csv_name)
    for csv_name in (x_train_file_name, x_test_file_name,
                     y_train_file_name, y_test_file_name)
)

In [5]:
# Load the sampled transaction log ordered by 'csmdt'.
# sort_values() returns a new frame rather than sorting in place, so its result
# must be assigned for the ordering to take effect.
df_cdtx = pd.read_csv(sample_cdtx_file).sort_values('csmdt')

# Load the saved index map (used below as a mapping: len(), item assignment).
# NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load this
# file from a trusted source.
idx_map = np.load(sample_chid_dict, allow_pickle=True).tolist()

# Give every distinct shop_col value its own index, numbered directly after the
# entries already present in the map, in sorted order of the shop values.
# NOTE(review): shop values share one key space with the existing keys; a shop
# value equal to an existing key would overwrite that entry -- confirm the two
# id sets cannot collide.
num_existing = len(idx_map)
for offset, shop_value in enumerate(sorted(df_cdtx[shop_col].unique())):
    idx_map[shop_value] = num_existing + offset

In [6]:
# Replace the raw ids in both node columns by their integer index from idx_map.
# Values missing from idx_map become NaN (pandas Series.map semantics).
for id_col in ('chid', shop_col):
    df_cdtx[id_col] = df_cdtx[id_col].map(idx_map)

In [7]:
def _first_of_month(date_str):
    """Keep the first 8 characters of `date_str` and append '01'.

    For a 'YYYY-MM-DD...' string this yields 'YYYY-MM-01' (format presumed from
    the '2019-01-01' cutoff used below -- confirm against the data).
    """
    return date_str[:8] + '01'


# Collapse every transaction date to the first day of its month.
df_cdtx['csmdt'] = df_cdtx['csmdt'].map(_first_of_month)

# Amounts as Python ints (element-wise int()).
df_cdtx['objam'] = df_cdtx['objam'].map(int)

# Keep only rows strictly before 2019-01-01 (string comparison on the month key).
is_before_cutoff = df_cdtx['csmdt'] < '2019-01-01'
df_cdtx = df_cdtx[is_before_cutoff]

In [8]:
# One row per distinct (chid, shop) pair observed in the filtered transactions.
unique_pairs = df_cdtx[['chid', shop_col]].drop_duplicates(ignore_index=True)

# Transpose to a 2 x num_pairs array, pass it through the project helper
# make_edges_symmetry, and convert the result to a LongTensor edge index.
edge_pairs = torch.LongTensor(make_edges_symmetry(unique_pairs.to_numpy().T))

In [9]:
# Read the four downstream CSVs (features first, then targets).
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (x_train_file, x_test_file, y_train_file, y_test_file)
)

In [10]:
# Columns that are not fed to the model as numeric features.
ignore_cols = ['chid']

# Columns handled through embedding tables.
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Everything else in x_train is treated as numeric; sorted for a stable order.
numeric_cols = sorted(set(x_train.columns).difference(category_cols, ignore_cols))

In [11]:
# Number of distinct values per categorical column, counted over train + test
# together so that both splits are covered by the same embedding table size.
category_dims = {}
for category_col in category_cols:
    combined_values = pd.concat([x_train[category_col], x_test[category_col]])
    category_dims[category_col] = combined_values.nunique()

# Column lookups produced by the project helper column_idx for each group.
category_dict = column_idx(x_train, category_cols)
numeric_dict = column_idx(x_train, numeric_cols)

# One embedding_size-wide slot per categorical column, one more of the same
# width, plus one input per numeric column.
input_dim = (len(category_dict) + 1) * embedding_size + len(numeric_dict)

# Layer widths of the MLP, from input to the single regression output.
layer_dims = [input_dim, 256, 128, 1]

In [12]:
def _frames_to_dataset(features, targets):
    """Wrap a feature frame and a target frame as a TensorDataset of (x, y) rows."""
    return TensorDataset(torch.from_numpy(features.to_numpy()),
                         torch.from_numpy(targets.to_numpy()))


# Training batches are reshuffled every epoch.
train_dataset = _frames_to_dataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Test batches keep the file order.
test_dataset = _frames_to_dataset(x_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [13]:
# Rebuild the Node2Vec module over the same edge index and restore its
# pretrained parameters from disk.
# NOTE(review): the walk / sampling hyper-parameters presumably mirror the ones
# used when the checkpoint was produced -- confirm against the pretraining code.
pre_train_model = Node2Vec(edge_pairs, embedding_dim=embedding_size, walk_length=2,
                           context_size=2, walks_per_node=10,
                           num_negative_samples=1, p=1, q=1, sparse=True)

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# machine without that GPU (or without CUDA at all). The module itself is still
# on the CPU here; it is moved to `device` later together with the whole model.
pre_train_model.load_state_dict(torch.load(pretrain_weights, map_location='cpu'))

# Leave the pretrained part in training mode.
pre_train_model.train()

# Downstream regressor: categorical embeddings + MLP with the given layer widths.
down_stream_model = MLP_with_pretrain(category_dims, layer_dims, embedding_size)

In [14]:
# Combine the pretrained Node2Vec module with the downstream regressor and move
# the combined module to the selected device.
model = Whole_model_Node2Vec(pre_train_model, down_stream_model)
model = model.to(device)

# Adam over all parameters of the combined model; mean-squared-error objective.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()

print(model)

Whole_model_Node2Vec(
  (pre_train_model): Node2Vec(128560, 64)
  (down_stream_model): MLP_with_pretrain(
    (model): Sequential(
      (0): Sequential(
        (0): Linear(in_features=472, out_features=256, bias=True)
        (1): ReLU()
      )
      (1): Sequential(
        (0): Linear(in_features=256, out_features=128, bias=True)
        (1): ReLU()
      )
      (2): Sequential(
        (0): Linear(in_features=128, out_features=1, bias=True)
        (1): ReLU()
      )
    )
    (embedding_dict): ModuleDict(
      (masts): Embedding(3, 64)
      (educd): Embedding(6, 64)
      (naty): Embedding(2, 64)
      (trdtp): Embedding(27, 64)
      (poscd): Embedding(9, 64)
      (cuorg): Embedding(30, 64)
    )
  )
)


In [15]:
# Train with early stopping and keep the parameters of the best epoch.
#
# Each epoch runs one optimisation pass over train_loader and one evaluation
# pass over test_loader. Outputs and targets are raised to the power of e
# (np.e ** v) before RMSE / mean AE / median AE are computed.
#
# NOTE(review): model selection and early stopping use the test split, so the
# reported test metrics are optimistically biased; a separate validation split
# would be needed for an unbiased estimate.
best_loss = 1e10
early_cnt = 0
RMSE = []
# Start from the current parameters so the final load_state_dict() is always
# defined, even if no epoch ever improves on best_loss.
best_model_params = copy.deepcopy(model.state_dict())

for epoch in range(epochs):

    # Summed per-batch criterion values (kept for inspection after the loop).
    train_loss = 0
    test_loss = 0

    # Per-batch flattened arrays; joined once per epoch instead of growing an
    # array with np.concatenate on every batch.
    train_output_chunks, train_y_chunks = [], []
    test_output_chunks, test_y_chunks = [], []

    # ---- optimisation pass ----
    model.train()
    for x, y in train_loader:
        x, y = x.float().to(device), y.float().to(device)
        optimizer.zero_grad()

        output = model(x, category_dict, numeric_dict)
        loss = criterion(output, y)
        train_loss += loss.item()

        # Record the predictions made before this batch's parameter update.
        train_output_chunks.append(output.detach().cpu().numpy().reshape(-1))
        train_y_chunks.append(y.detach().cpu().numpy().reshape(-1))

        loss.backward()
        optimizer.step()

    # ---- evaluation pass ----
    # no_grad(): no autograd graph is built, which saves memory and time.
    model.eval()
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.float().to(device), y.float().to(device)
            output = model(x, category_dict, numeric_dict)
            loss = criterion(output, y)
            test_loss += loss.item()
            test_output_chunks.append(output.cpu().numpy().reshape(-1))
            test_y_chunks.append(y.cpu().numpy().reshape(-1))

    # Leading empty float64 array: keeps the result float64 (as when growing
    # from np.array([])) and keeps the call valid for an empty loader.
    train_output = np.concatenate([np.array([]), *train_output_chunks])
    train_y = np.concatenate([np.array([]), *train_y_chunks])
    test_output = np.concatenate([np.array([]), *test_output_chunks])
    test_y = np.concatenate([np.array([]), *test_y_chunks])

    # Metrics on the exponentiated values. RMSE is sqrt(MSE); the `squared`
    # keyword of mean_squared_error is deprecated in recent scikit-learn.
    train_output, train_y = np.e**train_output, np.e**train_y
    train_RMSE = np.sqrt(mean_squared_error(train_y, train_output))
    train_mean = mean_absolute_error(train_y, train_output)
    train_median = median_absolute_error(train_y, train_output)

    test_output, test_y = np.e**test_output, np.e**test_y
    test_RMSE = np.sqrt(mean_squared_error(test_y, test_output))
    test_mean = mean_absolute_error(test_y, test_output)
    test_median = median_absolute_error(test_y, test_output)

    print(f'epoch:{epoch}\ntrain loss:{train_RMSE:.0f},test loss:{test_RMSE:.0f}\ntrain MAE(mean):{train_mean:.0f},test MAE(mean):{test_mean:.0f}\ntrain MAE(median):{train_median:.0f}, test MAE(median):{test_median:.0f}')

    # Snapshot the parameters whenever the test RMSE does not get worse.
    if test_RMSE <= best_loss:
        best_model_params = copy.deepcopy(model.state_dict())
        best_loss = test_RMSE
        print('\tBetter!')
        early_cnt = 0
    else:
        early_cnt += 1

    # Stop once early_stop consecutive epochs failed to improve.
    if early_cnt >= early_stop:
        break

# Restore the best snapshot.
model.load_state_dict(best_model_params)

epoch:0
train loss:1018532,test loss:614334
train MAE(mean):75661,test MAE(mean):80180
train MAE(median):11531, test MAE(median):11034
	Better!
epoch:1
train loss:1013964,test loss:598450
train MAE(mean):69841,test MAE(mean):71724
train MAE(median):9462, test MAE(median):9306
	Better!
epoch:2
train loss:1006978,test loss:578812
train MAE(mean):63462,test MAE(mean):70125
train MAE(median):8547, test MAE(median):10699
	Better!
epoch:3
train loss:999126,test loss:564742
train MAE(mean):61187,test MAE(mean):70767
train MAE(median):8263, test MAE(median):11389
	Better!
epoch:4
train loss:995906,test loss:551733
train MAE(mean):60544,test MAE(mean):66999
train MAE(median):8222, test MAE(median):9493
	Better!
epoch:5
train loss:990045,test loss:536815
train MAE(mean):59957,test MAE(mean):66673
train MAE(median):8096, test MAE(median):9212
	Better!
epoch:6
train loss:986846,test loss:534209
train MAE(mean):59554,test MAE(mean):66728
train MAE(median):8057, test MAE(median):8953
	Better!
epoch:

<All keys matched successfully>

In [16]:
# Run the current model over both loaders and collect flattened predictions and
# targets, then raise both to the power of e (np.e ** v).
train_output_chunks, train_y_chunks = [], []
test_output_chunks, test_y_chunks = [], []

model.eval()

# Inference only: no_grad() avoids building an autograd graph for every batch.
with torch.no_grad():
    for x, y in tqdm(train_loader):
        x, y = x.float().to(device), y.float().to(device)

        output = model(x, category_dict, numeric_dict)
        train_output_chunks.append(output.cpu().numpy().reshape(-1))
        train_y_chunks.append(y.cpu().numpy().reshape(-1))

    for x, y in test_loader:
        x, y = x.float().to(device), y.float().to(device)

        output = model(x, category_dict, numeric_dict)
        test_output_chunks.append(output.cpu().numpy().reshape(-1))
        test_y_chunks.append(y.cpu().numpy().reshape(-1))

# Join once per split. The leading empty float64 array keeps the result float64
# and keeps the call valid when a loader yields no batches.
train_output = np.concatenate([np.array([]), *train_output_chunks])
train_y = np.concatenate([np.array([]), *train_y_chunks])
test_output = np.concatenate([np.array([]), *test_output_chunks])
test_y = np.concatenate([np.array([]), *test_y_chunks])

train_output, train_y = np.e**train_output, np.e**train_y
test_output, test_y = np.e**test_output, np.e**test_y

100%|██████████| 245/245 [00:05<00:00, 43.39it/s]


In [17]:
# Print RMSE, mean absolute error and median absolute error for each split.
# The last two figures come from mean_absolute_error / median_absolute_error,
# so they are labelled MAE. RMSE is computed as sqrt(MSE) because the `squared`
# keyword of mean_squared_error is deprecated in recent scikit-learn.
for split_name, y_true, y_pred in (('train', train_y, train_output),
                                   ('test', test_y, test_output)):
    print('{}\tRMSE: {:.2f} MAE(mean): {:.2f} MAE(median): {:.2f}'.format(
        split_name,
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        median_absolute_error(y_true, y_pred)
    ))

train	RMSE: 983030.83 MSE(mean): 58176.82 MSE(median): 8317.89
test	RMSE: 504650.86 MSE(mean): 67555.17 MSE(median): 10092.15


In [18]:
# Result table: the 'chid' column of the test features next to the true and
# predicted values, attached positionally in test_loader order (not shuffled).
df_out = x_test[['chid']].assign(true=test_y, pred=test_output)
df_out

Unnamed: 0,chid,true,pred
0,0,11755.999087,3.522862
1,0,35519.989944,3.508471
2,1,150494.007197,2031.030697
3,1,3671.000531,2724.436818
4,2,62179.981812,44748.776804
...,...,...,...
99995,49997,235115.953642,1514.547033
99996,49998,1.000000,139.079169
99997,49998,1.000000,134.432409
99998,49999,2747.000494,1732.665987


In [19]:
# Make sure the target directory exists before writing, so a missing folder
# does not discard the results of a finished training run.
result_dir = os.path.dirname(result_path)
if result_dir:
    os.makedirs(result_dir, exist_ok=True)

# Write the predictions without the DataFrame index.
df_out.to_csv(result_path, index=False, encoding='utf-8')