In [1]:
import os
import json
import numpy as np
import pandas as pd

from time import time
from tqdm import tqdm, trange

import torch
from matplotlib import pyplot as plt
from torch_geometric.nn import Node2Vec
from sklearn.manifold import TSNE

from utils import make_edges_symmetry



# Load files

In [3]:
# Compute device: prefer the second GPU (index 1), as the notebook originally
# did, but only when it actually exists. On a single-GPU host 'cuda:1' is not a
# valid device, so fall back to 'cuda:0'; without CUDA, run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Column of the transaction table used as the "shop" side of the graph.
shop_col = 'stonc_6_label'
# Alternative shop columns that have been tried:
#shop_col = 'mcc'
##shop_col = 'stonc_label'
#shop_col = 'stonc_10_label'

# Training hyper-parameters.
epoch = 400             # number of passes over the random-walk loader
batch_size = 512        # batch size handed to model.loader
embedding_size = 64     # dimensionality of each node embedding
learning_rate = 0.01    # step size for the SparseAdam optimizer
# Destination file for the trained model's state_dict.
pretrain_weights = './weights/node2vec_weights_stonc6'

In [4]:
# Names of the two input files, and the directory that holds them.
chid_dict_file_name = 'sample_50k_idx_map.npy'
cdtx_file_name = 'sample_50k_cdtx.csv'
sample_data_path = './data/sample'

# Full paths, built in one pass: index-map file first, transaction CSV second.
sample_chid_dict, sample_cdtx_file = (
    os.path.join(sample_data_path, file_name)
    for file_name in (chid_dict_file_name, cdtx_file_name)
)

In [5]:
# Read the sampled transaction table and order it by the `csmdt` column.
# DataFrame.sort_values returns a new frame rather than sorting in place, so
# the result has to be assigned back; otherwise the sort has no effect.
df_cdtx = pd.read_csv(sample_cdtx_file)
df_cdtx = df_cdtx.sort_values('csmdt')

# Load dict
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, which
# can execute code. Only load index-map files from a trusted source.
# NOTE(review): presumably the .npy file stores a single dict, which .tolist()
# unwraps into a plain Python object. Confirm against the code that wrote it.
idx_map = np.load(sample_chid_dict, allow_pickle=True).tolist()

In [6]:
# Give every distinct shop label its own index, numbered consecutively (in
# sorted label order) starting right after the entries already in idx_map.
l = len(idx_map)
shop_labels = sorted(df_cdtx[shop_col].unique())
for shop_index, shop_label in enumerate(shop_labels, start=l):
    idx_map[shop_label] = shop_index

In [7]:
# Replace the raw values in the customer column and the shop column with their
# indices from idx_map (values missing from idx_map become NaN under .map).
for id_column in ('chid', shop_col):
    df_cdtx[id_column] = df_cdtx[id_column].map(idx_map)

In [8]:
# Keep the first 8 characters of each date string and append '01'.
# NOTE(review): this assumes `csmdt` holds 'YYYY-MM-DD...' strings, so the
# result is the first day of the month. Confirm the column format.
df_cdtx['csmdt'] = df_cdtx['csmdt'].map(lambda date_str: date_str[:8] + '01')

# Keep only the rows whose (string-compared) date is earlier than 2019-01-01.
is_before_2019 = df_cdtx['csmdt'] < '2019-01-01'
df_cdtx = df_cdtx[is_before_2019]

In [9]:
# One row per distinct (customer, shop) pair that appears in the transactions.
unique_pairs = df_cdtx[['chid', shop_col]].drop_duplicates(ignore_index=True)

# Transpose so the array is laid out as 2 x num_pairs, then pass it through
# the project helper `make_edges_symmetry` from utils.
# NOTE(review): presumably the helper adds the reverse of every edge so the
# graph is undirected. Confirm in utils.
symmetric_edges = make_edges_symmetry(unique_pairs.to_numpy().T)

# Final edge index as an int64 tensor, consumed by Node2Vec.
edge_pairs = torch.LongTensor(symmetric_edges)

In [10]:
# Node2Vec embedding model over the customer-shop edge index.
model = Node2Vec(
    edge_pairs,
    embedding_dim=embedding_size,
    walk_length=2,
    context_size=2,
    walks_per_node=10,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,  # sparse embedding gradients; paired with SparseAdam below
)
# Move the model onto the selected compute device.
model = model.to(device)

In [11]:
# SparseAdam is the optimizer used with the model's sparse=True embedding.
optimizer = torch.optim.SparseAdam(
    list(model.parameters()),
    lr=learning_rate,
)

# Shuffled loader; the training loop unpacks each batch as (pos_rw, neg_rw).
loader = model.loader(shuffle=True, batch_size=batch_size)

In [12]:
def train():
    """Run one pass over `loader` and return the mean batch loss.

    Uses the notebook-level `model`, `loader`, `optimizer` and `device`.
    Each batch is a (positive walks, negative walks) pair; both are moved to
    `device`, the model loss is back-propagated and one optimizer step is taken.

    Returns:
        float: sum of the per-batch loss values divided by `len(loader)`.
    """
    model.train()
    batch_losses = []
    for positive_walks, negative_walks in loader:
        optimizer.zero_grad()
        batch_loss = model.loss(positive_walks.to(device), negative_walks.to(device))
        batch_loss.backward()
        optimizer.step()
        # .item() pulls the scalar out so no graph is kept alive across batches.
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

In [13]:
# Train for `epoch` passes and report the mean loss after each one.
s_t = time()

# The loop variable is deliberately NOT named `epoch`: reusing that name would
# overwrite the hyper-parameter, so re-running this cell would train for a
# different number of epochs.
for epoch_idx in range(epoch):
    loss = train()
    print(f'Epoch: {epoch_idx+1:02d}, Loss: {loss:.4f}')
# Total wall-clock training time in seconds.
print(time()-s_t)

Epoch: 01, Loss: 5.8908
Epoch: 02, Loss: 4.3920
Epoch: 03, Loss: 3.6607
Epoch: 04, Loss: 3.3022
Epoch: 05, Loss: 3.0457
Epoch: 06, Loss: 2.8363
Epoch: 07, Loss: 2.6629
Epoch: 08, Loss: 2.5185
Epoch: 09, Loss: 2.3903
Epoch: 10, Loss: 2.2657
Epoch: 11, Loss: 2.1680
Epoch: 12, Loss: 2.0686
Epoch: 13, Loss: 1.9859
Epoch: 14, Loss: 1.9048
Epoch: 15, Loss: 1.8303
Epoch: 16, Loss: 1.7587
Epoch: 17, Loss: 1.6936
Epoch: 18, Loss: 1.6375
Epoch: 19, Loss: 1.5802
Epoch: 20, Loss: 1.5277
Epoch: 21, Loss: 1.4790
Epoch: 22, Loss: 1.4344
Epoch: 23, Loss: 1.3920
Epoch: 24, Loss: 1.3535
Epoch: 25, Loss: 1.3178
Epoch: 26, Loss: 1.2847
Epoch: 27, Loss: 1.2557
Epoch: 28, Loss: 1.2260
Epoch: 29, Loss: 1.2001
Epoch: 30, Loss: 1.1740
Epoch: 31, Loss: 1.1534
Epoch: 32, Loss: 1.1323
Epoch: 33, Loss: 1.1134
Epoch: 34, Loss: 1.0952
Epoch: 35, Loss: 1.0751
Epoch: 36, Loss: 1.0582
Epoch: 37, Loss: 1.0445
Epoch: 38, Loss: 1.0315
Epoch: 39, Loss: 1.0171
Epoch: 40, Loss: 1.0040
Epoch: 41, Loss: 0.9929
Epoch: 42, Loss:

Epoch: 333, Loss: 0.7707
Epoch: 334, Loss: 0.7701
Epoch: 335, Loss: 0.7705
Epoch: 336, Loss: 0.7696
Epoch: 337, Loss: 0.7705
Epoch: 338, Loss: 0.7697
Epoch: 339, Loss: 0.7698
Epoch: 340, Loss: 0.7701
Epoch: 341, Loss: 0.7702
Epoch: 342, Loss: 0.7707
Epoch: 343, Loss: 0.7698
Epoch: 344, Loss: 0.7697
Epoch: 345, Loss: 0.7701
Epoch: 346, Loss: 0.7704
Epoch: 347, Loss: 0.7698
Epoch: 348, Loss: 0.7708
Epoch: 349, Loss: 0.7698
Epoch: 350, Loss: 0.7703
Epoch: 351, Loss: 0.7703
Epoch: 352, Loss: 0.7702
Epoch: 353, Loss: 0.7703
Epoch: 354, Loss: 0.7709
Epoch: 355, Loss: 0.7705
Epoch: 356, Loss: 0.7702
Epoch: 357, Loss: 0.7707
Epoch: 358, Loss: 0.7707
Epoch: 359, Loss: 0.7701
Epoch: 360, Loss: 0.7699
Epoch: 361, Loss: 0.7707
Epoch: 362, Loss: 0.7706
Epoch: 363, Loss: 0.7704
Epoch: 364, Loss: 0.7700
Epoch: 365, Loss: 0.7698
Epoch: 366, Loss: 0.7703
Epoch: 367, Loss: 0.7701
Epoch: 368, Loss: 0.7695
Epoch: 369, Loss: 0.7698
Epoch: 370, Loss: 0.7704
Epoch: 371, Loss: 0.7704
Epoch: 372, Loss: 0.7700


In [12]:
# Switch to evaluation mode and look up the embedding of every node id from 0
# up to the largest id present in the edge index.
model.eval()
node_ids = torch.arange(edge_pairs.max() + 1, device=device)
z = model(node_ids)
# Optional 2-D projection for plotting (disabled):
#z = TSNE(n_components=2).fit_transform(z.detach().cpu().numpy())

In [None]:
# Persist the embedding matrix as a NumPy array.
# Create the output directory first so the save does not fail on a fresh
# checkout where './embedding' does not exist yet.
os.makedirs('./embedding', exist_ok=True)
# NOTE: np.save writes the .npy format and appends '.npy' when the name lacks
# it, so the file on disk is './embedding/node2vec_0221.npz.npy' (it is not an
# .npz archive). The path is left unchanged so existing consumers keep working.
np.save('./embedding/node2vec_0221.npz',z.detach().cpu().numpy())

In [None]:
# Persist the trained parameters. Create the destination directory first so
# the save does not fail when it does not exist yet.
weights_dir = os.path.dirname(pretrain_weights)
if weights_dir:
    os.makedirs(weights_dir, exist_ok=True)
torch.save(model.state_dict(), pretrain_weights)