In [1]:
import json
import os
import pickle
from time import time

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GAE

from LightGCN import LightGCN
from utils import make_edges_symmetry, column_idx, train_test_split_edges



# 讀取檔案

In [14]:
# --- Run configuration -------------------------------------------------------
# The setup targets the second GPU ('cuda:1'). Only select it when it actually
# exists; otherwise fall back to the first GPU and finally to the CPU, so that
# a single-GPU machine does not fail later with an invalid device ordinal.
if torch.cuda.device_count() > 1:
    device = torch.device('cuda:1')
elif torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Column of the transaction table whose values become the shop-side nodes.
shop_col = 'stonc_6_label'
# Alternative shop columns (uncomment exactly one to switch):
#shop_col = 'mcc'
#shop_col = 'stonc_label'
#shop_col = 'stonc_10_label'

# When True the train/test edge split is read from `edges_path`; otherwise it
# is recomputed and written to that path.
load_edges = True
edges_path = './edges_stonc6.pkl'
# Destination file for the trained encoder's state_dict.
pretrain_weights = './weights/LightGCNencoder_stonc6'

# Optimisation settings: `batch_size` is passed to the DataLoaders and
# `learning_rate` to the Adam optimizer.
epochs = 400
batch_size = 2048
learning_rate = 0.001

# Encoder settings, both passed to the LightGCN constructor.
embedding_size = 64
n_layers = 3

In [15]:
# Directory holding the sample files and the three file names read from it.
sample_data_path = './data/sample'

chid_dict_file_name, cdtx_file_name, cust_file_name = (
    'sample_50k_idx_map.npy',
    'sample_50k_cdtx.csv',
    'sample_50k_cust.csv',
)

# Full paths, in order: id map, transaction table (cdtx), customer table (cust).
sample_chid_dict, sample_cdtx_file, sample_cust_file = (
    os.path.join(sample_data_path, file_name)
    for file_name in (chid_dict_file_name, cdtx_file_name, cust_file_name)
)

In [16]:
# Transaction table ordered by the `csmdt` column. `sort_values` returns a new
# frame, so its result must be assigned (a bare call is a no-op). A stable sort
# keeps rows with equal `csmdt` in file order.
df_cdtx = pd.read_csv(sample_cdtx_file).sort_values(
    'csmdt', kind='mergesort', ignore_index=True)

# Customer table with exact duplicate rows removed.
df_cust = pd.read_csv(sample_cust_file)
df_cust.drop_duplicates(ignore_index=True, inplace=True)

# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load this
# file from a trusted source.
# `.tolist()` unwraps the stored object; the cells below use it as a dict
# (item assignment by key and `Series.map`).
idx_map = np.load(sample_chid_dict, allow_pickle=True).tolist()

In [17]:
# Give every distinct shop label its own index, numbered in sorted label order
# and starting right after the entries already present in idx_map.
l = len(idx_map)
shop_labels = sorted(df_cdtx[shop_col].unique())
idx_map.update((label, offset + l) for offset, label in enumerate(shop_labels))

In [18]:
# Replace the raw customer ids and shop labels of the transaction table with
# their indices from idx_map.
for id_col in ('chid', shop_col):
    df_cdtx[id_col] = df_cdtx[id_col].map(idx_map)

# Same translation for the customer ids of the customer table.
df_cust['chid'] = df_cust['chid'].map(idx_map)

In [19]:
# Vectorised string/number normalisation (replaces the per-row Python lambdas).

# Snap every transaction date to the first day of its month: keep the first
# 8 characters and append '01'.
# Assumes `csmdt` holds 'YYYY-MM-DD...' strings - it is compared against
# '2019-01-01' further down.
df_cdtx['csmdt'] = df_cdtx['csmdt'].str[:8] + '01'
# Amounts as integers (fractional parts are truncated, as with int()).
df_cdtx['objam'] = df_cdtx['objam'].astype(int)

# Keep only the first 10 characters of the customer snapshot date.
df_cust['data_dt'] = df_cust['data_dt'].str[:10]

In [20]:
# Columns that are identifiers/dates (not features) and columns treated as
# categorical features.
ignore_cols = ['chid', 'data_dt']
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Every remaining customer column is numeric; 'objam' is appended here and
# added to df_cust in a later cell.
non_numeric = set(category_cols) | set(ignore_cols)
numeric_cols = sorted(set(df_cust.columns) - non_numeric) + ['objam']

In [21]:
def _level_codes(values):
    """Map each distinct value of a Series to its rank in sorted order."""
    return {value: code for code, value in enumerate(sorted(values.unique()))}


# One value -> integer code table per categorical column.
mapper = {col: _level_codes(df_cust[col]) for col in category_cols}

# Encode every categorical column with its own table (looked up by column name).
encoded_categories = df_cust[category_cols].apply(
    lambda column: column.map(mapper[column.name]))
df_cust[category_cols] = encoded_categories

In [22]:
# Keep transactions dated before 2019 and the 2018-12-01 customer snapshot,
# ordered by customer index. `.copy()` makes df_cust an independent frame so
# the column assignments below do not write into a slice of the original.
df_cdtx = df_cdtx[df_cdtx.csmdt < '2019-01-01']
df_cust = df_cust[df_cust.data_dt == '2018-12-01'].sort_values(by=['chid']).copy()

# Per-customer total amount divided by 12 (presumably a monthly average over
# one year - confirm). Only `objam` is aggregated instead of every column.
monthly_objam = df_cdtx.groupby('chid')['objam'].sum() / 12

# Align by customer index rather than by position: customers without any
# transaction get 0 instead of shifting or breaking the assignment.
aligned_objam = df_cust['chid'].map(monthly_objam).fillna(0).to_numpy()

# Masked log: entries where the log is undefined (<= 0) are masked and then
# filled with 0.
df_cust['objam'] = np.ma.log(aligned_objam).filled(0)

In [23]:
# Distinct (customer, shop) pairs, transposed so that row 0 holds customer
# indices and row 1 holds shop indices.
unique_pairs = df_cdtx[['chid', shop_col]].drop_duplicates(ignore_index=True)
pair_array = unique_pairs.to_numpy().T

# make_edges_symmetry comes from utils; presumably it adds the reversed
# direction of every edge - confirm against its implementation.
edge_pairs = torch.LongTensor(make_edges_symmetry(pair_array))

In [24]:
# Fit a min-max scaler on the numeric customer columns and overwrite them with
# the scaled values; the fitted scaler stays available as `x_scaler`.
x_scaler = MinMaxScaler()
scaled_numeric = x_scaler.fit_transform(df_cust[numeric_cols])
df_cust[numeric_cols] = scaled_numeric

In [25]:
# Feature frame: categorical columns first, numeric columns after.
feature_cols = category_cols + numeric_cols
df_cust_ = df_cust[feature_cols]

# Customer rows carry real features; the remaining idx_map entries (the shop
# nodes) get all-zero rows of the same width and are stacked underneath.
cust_feature = torch.Tensor(df_cust_.to_numpy())
n_cust_rows, n_feature_cols = cust_feature.shape
shop_feature = torch.zeros(len(idx_map) - n_cust_rows, n_feature_cols)
x_feature = torch.cat((cust_feature, shop_feature))

In [None]:
# The train/test edge split is cached on disk. Reuse the cache only when it is
# requested AND the file exists; otherwise build the split and write it, so a
# missing cache file triggers a recompute instead of a FileNotFoundError.
if load_edges and os.path.exists(edges_path):
    # NOTE(review): pickle.load can execute arbitrary code - only load cache
    # files that were produced by this notebook.
    with open(edges_path, 'rb') as edges_file:
        data = pickle.load(edges_file)
else:
    data = Data(x=x_feature, edge_index=edge_pairs)
    # Second argument is the number of customer rows; how the helper uses it
    # is defined in utils.train_test_split_edges.
    data = train_test_split_edges(data, cust_feature.shape[0])
    with open(edges_path, 'wb') as edges_file:
        pickle.dump(data, edges_file, pickle.HIGHEST_PROTOCOL)

In [15]:
# Number of distinct codes per categorical column.
category_dims = {col_name: len(mapper[col_name]) for col_name in mapper}

# Column lookups for the two feature groups, as produced by utils.column_idx.
category_dict, numeric_dict = (
    column_idx(df_cust_, cols) for cols in (category_cols, numeric_cols)
)

In [None]:
# Mini-batch views over the edge split. The training loop below works on the
# full graph and does not consume these loaders.
# NOTE(review): `train_pos_edge_weight` must be provided by
# utils.train_test_split_edges - confirm that the attribute exists.
train_dataset = TensorDataset(data.train_pos_edge_index[0],
                              data.train_pos_edge_index[1],
                              data.train_pos_edge_weight.view(-1,1))
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

# An argument-less TensorDataset holds no tensors and cannot be iterated, so
# the test set is built from the held-out positive edges (source row and
# target row), mirroring the training set.
# NOTE(review): add a weight tensor here as well if the split provides one.
test_dataset = TensorDataset(data.test_pos_edge_index[0],
                             data.test_pos_edge_index[1])
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)

In [16]:
# Input width: one embedding of size `embedding_size` per categorical column
# plus one scalar per numeric column.
input_dim = embedding_size * len(category_dict) + len(numeric_dict)

layer_dims = [input_dim] + [256, 128, 1]

# Node counts are the row counts of the two feature blocks.
n_users, n_shops = cust_feature.shape[0], shop_feature.shape[0]

In [17]:
# The encoder is built from the first half of the training edge columns only
# (presumably one direction of the symmetrised edge list - confirm).
n_train_edges = data.train_pos_edge_index.shape[1]
encoder_edges = data.train_pos_edge_index[:, :n_train_edges // 2]

# Graph auto-encoder wrapping the LightGCN encoder, moved to the target device.
model = GAE(
    LightGCN(embedding_size, n_users, n_shops, n_layers, encoder_edges)
).to(device)

# Full set of training edges on the device, plus the optimizer.
train_pos_edge_index = data.train_pos_edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  d_inv = np.power(rowsum, -0.5).flatten()


costing 223.4960036277771s, saved norm_mat...
don't split the matrix


In [18]:
def train():
    """Run one full-graph optimisation step.

    Uses the module-level `model`, `optimizer` and `train_pos_edge_index`:
    encodes the training edges, computes the model's reconstruction loss on
    those same edges, back-propagates and applies one optimizer step.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    embeddings = model.encode(train_pos_edge_index)
    recon_loss = model.recon_loss(embeddings, train_pos_edge_index)

    recon_loss.backward()
    optimizer.step()

    return float(recon_loss)

In [19]:
def test(pos_edge_index, neg_edge_index):
    """Score held-out edges with the current model.

    Puts the module-level `model` in eval mode, encodes the training edges
    without gradient tracking and hands the embeddings to `model.test`.

    Args:
        pos_edge_index: positive (existing) edges to score.
        neg_edge_index: negative (non-existing) edges to score.

    Returns:
        Whatever `model.test` returns; the caller unpacks it as (auc, ap).
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(train_pos_edge_index)

    scores = model.test(embeddings, pos_edge_index, neg_edge_index)
    return scores

In [20]:
# Train for the configured number of epochs (`epochs` from the configuration
# cell, rather than a repeated literal) and report the held-out AUC / AP after
# every step.
for epoch in range(epochs):
    loss = train()

    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    print('Epoch: {:03d}, Train Loss:{:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch+1, loss, auc, ap))

Epoch: 001, Train Loss:1.3168, AUC: 0.5039, AP: 0.5145
Epoch: 002, Train Loss:1.3161, AUC: 0.5046, AP: 0.5154
Epoch: 003, Train Loss:1.3151, AUC: 0.5053, AP: 0.5164
Epoch: 004, Train Loss:1.3136, AUC: 0.5061, AP: 0.5175
Epoch: 005, Train Loss:1.3129, AUC: 0.5070, AP: 0.5186
Epoch: 006, Train Loss:1.3118, AUC: 0.5080, AP: 0.5199
Epoch: 007, Train Loss:1.3108, AUC: 0.5090, AP: 0.5213
Epoch: 008, Train Loss:1.3098, AUC: 0.5101, AP: 0.5229
Epoch: 009, Train Loss:1.3084, AUC: 0.5113, AP: 0.5246
Epoch: 010, Train Loss:1.3070, AUC: 0.5127, AP: 0.5264
Epoch: 011, Train Loss:1.3058, AUC: 0.5141, AP: 0.5284
Epoch: 012, Train Loss:1.3041, AUC: 0.5156, AP: 0.5305
Epoch: 013, Train Loss:1.3023, AUC: 0.5172, AP: 0.5328
Epoch: 014, Train Loss:1.3008, AUC: 0.5189, AP: 0.5352
Epoch: 015, Train Loss:1.2985, AUC: 0.5207, AP: 0.5378
Epoch: 016, Train Loss:1.2970, AUC: 0.5226, AP: 0.5406
Epoch: 017, Train Loss:1.2947, AUC: 0.5247, AP: 0.5435
Epoch: 018, Train Loss:1.2930, AUC: 0.5268, AP: 0.5467
Epoch: 019

Epoch: 150, Train Loss:0.9608, AUC: 0.8431, AP: 0.8788
Epoch: 151, Train Loss:0.9604, AUC: 0.8435, AP: 0.8791
Epoch: 152, Train Loss:0.9603, AUC: 0.8439, AP: 0.8795
Epoch: 153, Train Loss:0.9601, AUC: 0.8443, AP: 0.8798
Epoch: 154, Train Loss:0.9595, AUC: 0.8447, AP: 0.8802
Epoch: 155, Train Loss:0.9593, AUC: 0.8451, AP: 0.8805
Epoch: 156, Train Loss:0.9582, AUC: 0.8455, AP: 0.8808
Epoch: 157, Train Loss:0.9580, AUC: 0.8459, AP: 0.8811
Epoch: 158, Train Loss:0.9571, AUC: 0.8462, AP: 0.8814
Epoch: 159, Train Loss:0.9574, AUC: 0.8466, AP: 0.8817
Epoch: 160, Train Loss:0.9564, AUC: 0.8469, AP: 0.8820
Epoch: 161, Train Loss:0.9556, AUC: 0.8473, AP: 0.8823
Epoch: 162, Train Loss:0.9552, AUC: 0.8476, AP: 0.8826
Epoch: 163, Train Loss:0.9546, AUC: 0.8479, AP: 0.8828
Epoch: 164, Train Loss:0.9543, AUC: 0.8483, AP: 0.8831
Epoch: 165, Train Loss:0.9540, AUC: 0.8486, AP: 0.8834
Epoch: 166, Train Loss:0.9534, AUC: 0.8489, AP: 0.8836
Epoch: 167, Train Loss:0.9526, AUC: 0.8492, AP: 0.8839
Epoch: 168

Epoch: 299, Train Loss:0.9075, AUC: 0.8655, AP: 0.8985
Epoch: 300, Train Loss:0.9068, AUC: 0.8656, AP: 0.8985
Epoch: 301, Train Loss:0.9074, AUC: 0.8656, AP: 0.8986
Epoch: 302, Train Loss:0.9067, AUC: 0.8657, AP: 0.8986
Epoch: 303, Train Loss:0.9062, AUC: 0.8657, AP: 0.8987
Epoch: 304, Train Loss:0.9061, AUC: 0.8658, AP: 0.8987
Epoch: 305, Train Loss:0.9066, AUC: 0.8658, AP: 0.8988
Epoch: 306, Train Loss:0.9057, AUC: 0.8659, AP: 0.8988
Epoch: 307, Train Loss:0.9056, AUC: 0.8659, AP: 0.8989
Epoch: 308, Train Loss:0.9053, AUC: 0.8660, AP: 0.8989
Epoch: 309, Train Loss:0.9048, AUC: 0.8660, AP: 0.8990
Epoch: 310, Train Loss:0.9052, AUC: 0.8661, AP: 0.8990
Epoch: 311, Train Loss:0.9040, AUC: 0.8661, AP: 0.8991
Epoch: 312, Train Loss:0.9048, AUC: 0.8662, AP: 0.8991
Epoch: 313, Train Loss:0.9041, AUC: 0.8662, AP: 0.8992
Epoch: 314, Train Loss:0.9039, AUC: 0.8663, AP: 0.8992
Epoch: 315, Train Loss:0.9036, AUC: 0.8664, AP: 0.8993
Epoch: 316, Train Loss:0.9035, AUC: 0.8664, AP: 0.8993
Epoch: 317

In [22]:
# Persist only the encoder's parameters. The target directory is created first
# so that a fresh checkout without ./weights does not fail after training.
weights_dir = os.path.dirname(pretrain_weights)
if weights_dir:
    os.makedirs(weights_dir, exist_ok=True)
torch.save(model.encoder.state_dict(), pretrain_weights)