In [21]:
import os
import json
import pandas as pd
import numpy as np

import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GAE

from time import time

from Model import GCNEncoder
from utils import make_edges_symmetry, column_idx, train_test_split_edges

from sklearn.preprocessing import MinMaxScaler

# Load the data files

In [2]:
# --- Run configuration ------------------------------------------------------

# Seed the RNGs up front so that Restart & Run All gives a repeatable run
# (the original notebook never seeded anything).
# NOTE(review): assumes the project helpers draw their randomness from
# torch / numpy — confirm against utils.train_test_split_edges.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Prefer the second GPU when the machine has one. On a single-GPU machine
# 'cuda:1' does not exist and every later `.to(device)` would fail, so fall
# back to 'cuda:0'; without CUDA, run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')

# Transaction column used as the shop-side node of each edge.
# Alternatives tried previously: 'mcc', 'stonc_label', 'stonc_10_label'.
shop_col = 'stonc_6_label'

# When True, the pickled edge split at `edges_path` is loaded instead of
# being rebuilt (and re-written) from the transaction table.
load_edges = False
edges_path = './edges_stonc6.pkl'
# Destination of the trained encoder's state dict.
pretrain_weights = './weights/GCNencoder_stonc6'

# Model / optimisation hyper-parameters.
embedding_size = 64
epochs = 400
batch_size = 2048
learning_rate = 0.001

In [3]:
# Directory holding the sample_50k_* files and the three file names read from it.
sample_data_path = './data/sample'

chid_dict_file_name = 'sample_50k_idx_map.npy'
cdtx_file_name = 'sample_50k_cdtx.csv'
cust_file_name = 'sample_50k_cust.csv'

# Full path of each file: id map, transaction table, customer table.
sample_chid_dict, sample_cdtx_file, sample_cust_file = (
    os.path.join(sample_data_path, file_name)
    for file_name in (chid_dict_file_name, cdtx_file_name, cust_file_name)
)

In [4]:
# Transaction table, ordered by the `csmdt` column. `sort_values` returns a
# new frame, so its result must be kept — the original call threw it away and
# left the table unsorted. A stable sort keeps the file order among ties.
df_cdtx = pd.read_csv(sample_cdtx_file).sort_values('csmdt', kind='stable')

# Customer table with exact duplicate rows removed and the index renumbered.
df_cust = pd.read_csv(sample_cust_file).drop_duplicates(ignore_index=True)

# Id map stored as a pickled object inside the .npy file; `.tolist()` unwraps
# the 0-d object array into the underlying Python object.
# NOTE(review): allow_pickle=True can execute arbitrary code while loading —
# only use it on files from a trusted source.
idx_map = np.load(sample_chid_dict, allow_pickle=True).tolist()

In [5]:
# Add the distinct shop labels to idx_map, numbering them consecutively in
# sorted order starting at the current size of the map.
shop_offset = len(idx_map)
shop_labels = sorted(df_cdtx[shop_col].unique())
idx_map.update({label: shop_offset + rank for rank, label in enumerate(shop_labels)})

In [6]:
# Replace the raw customer / shop identifiers by their idx_map values.
for id_col in ('chid', shop_col):
    df_cdtx[id_col] = df_cdtx[id_col].map(idx_map)

df_cust['chid'] = df_cust['chid'].map(idx_map)

In [7]:
# Keep the first 8 characters of each `csmdt` string and append '01'
# (presumably 'YYYY-MM-' + '01', i.e. the first day of the month — the later
# comparison against '2019-01-01' suggests that format; confirm against the data).
df_cdtx['csmdt'] = df_cdtx['csmdt'].map(lambda date_str: date_str[:8] + '01')
# Amounts as Python ints.
df_cdtx['objam'] = df_cdtx['objam'].map(int)

# Keep only the first 10 characters of the customer snapshot date.
df_cust['data_dt'] = df_cust['data_dt'].map(lambda stamp: stamp[:10])

In [8]:
# df_cust columns that are not used as features.
ignore_cols = ['chid', 'data_dt']
# df_cust columns that are integer-encoded as categories further down.
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']
# Every other df_cust column (sorted by name) is treated as numeric, with
# 'objam' appended last. 'objam' is removed from the set difference first:
# once a later cell has added an 'objam' column to df_cust, re-running this
# cell would otherwise list it twice.
numeric_cols = sorted(
    set(df_cust.columns) - set(category_cols) - set(ignore_cols) - {'objam'}
) + ['objam']

In [9]:
# For each categorical column, number its distinct values in sorted order:
# mapper[column][value] -> integer code.
mapper = {}
for col in category_cols:
    levels = sorted(df_cust[col].unique())
    mapper[col] = dict(zip(levels, range(len(levels))))

# Replace the raw categorical values by their integer codes.
for col in category_cols:
    df_cust[col] = df_cust[col].map(mapper[col])

In [10]:
# Keep the transactions dated before 2019 and the customer rows of the
# 2018-12-01 snapshot, ordered by chid. `.copy()` makes the filtered customer
# frame independent so the column assignment below writes to a real frame.
df_cdtx = df_cdtx[df_cdtx.csmdt < '2019-01-01']
df_cust = df_cust[df_cust.data_dt == '2018-12-01'].sort_values(by=['chid']).copy()

# Per-customer total amount divided by 12.
# NOTE(review): the /12 presumably assumes the remaining transactions span
# exactly twelve months — confirm against the data.
monthly_objam = df_cdtx.groupby('chid')['objam'].sum() / 12

# Align the totals to df_cust by chid instead of by position: the original
# assigned the grouped values positionally, which raises (or silently
# misaligns) as soon as a customer has no transaction in the window.
# Customers without transactions get 0 here.
aligned_objam = df_cust['chid'].map(monthly_objam).fillna(0).to_numpy()

# Masked log: entries outside the log domain (<= 0) are masked and then
# filled with 0 instead of producing -inf / nan.
df_cust['objam'] = np.ma.log(aligned_objam).filled(0)

In [11]:
# Distinct (chid, shop) pairs, transposed into a 2 x n_pairs array.
unique_pairs = df_cdtx[['chid', shop_col]].drop_duplicates(ignore_index=True)

# make_edges_symmetry is a project helper — presumably it adds the reversed
# direction of every pair; confirm in utils. The result becomes an int64 tensor.
edge_pairs = torch.LongTensor(make_edges_symmetry(unique_pairs.to_numpy().T))

In [12]:
# Fit a MinMaxScaler (default settings) on the numeric customer columns and
# overwrite those columns with the scaled values. The fitted scaler stays
# available as x_scaler.
numeric_values = df_cust[numeric_cols]
x_scaler = MinMaxScaler().fit(numeric_values)
df_cust[numeric_cols] = x_scaler.transform(numeric_values)

In [13]:
# Feature columns in model order: categorical columns first, then numeric.
df_cust_ = df_cust[category_cols + numeric_cols]

# Node feature matrix: the customer rows, followed by one all-zero row for
# every remaining idx_map entry.
cust_feature = torch.Tensor(df_cust_.to_numpy())
n_cust_rows, n_features = cust_feature.shape
shop_feature = torch.zeros(len(idx_map) - n_cust_rows, n_features)
x_feature = torch.cat((cust_feature, shop_feature), dim=0)

In [14]:
import pickle

if not load_edges:
    # Build the graph object, split its edges with the project helper and
    # cache the result at edges_path for later runs.
    data = train_test_split_edges(
        Data(x=x_feature, edge_index=edge_pairs),
        cust_feature.shape[0],
    )
    with open(edges_path, 'wb') as edges_file:
        pickle.dump(data, edges_file, pickle.HIGHEST_PROTOCOL)
else:
    # Reuse the previously cached split.
    # NOTE(review): pickle.load can execute arbitrary code — only load
    # files this notebook wrote itself.
    with open(edges_path, 'rb') as edges_file:
        data = pickle.load(edges_file)

In [15]:
# Number of distinct codes per categorical column, taken from the
# value -> code dictionaries in mapper.
category_dims = {}
for col_name, value_codes in mapper.items():
    category_dims[col_name] = len(value_codes)

# column_idx is a project helper — presumably it maps each listed column to
# its position in df_cust_; confirm in utils.
category_dict = column_idx(df_cust_, category_cols)
numeric_dict = column_idx(df_cust_, numeric_cols)

In [16]:
# Encoder input width: embedding_size values per categorical column plus one
# value per numeric column (presumably each categorical column is embedded
# by GCNEncoder — confirm in Model).
input_dim = embedding_size * len(category_dict) + len(numeric_dict)
# Layer widths starting at input_dim; not referenced again in the visible cells.
layer_dims = [input_dim] + [256, 128, 1]

In [25]:
# Graph auto-encoder wrapped around the project's GCN encoder, moved to the
# configured device.
encoder = GCNEncoder(input_dim, embedding_size, category_dims)
model = GAE(encoder).to(device)

# Node features and training edges on the same device as the model.
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [29]:
def train():
    """Run one optimisation step on the training edges.

    Uses the notebook-level ``model``, ``optimizer``, ``x``,
    ``train_pos_edge_index``, ``category_dict`` and ``numeric_dict``.

    Returns:
        The reconstruction loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    # Encode all nodes, then score the reconstruction of the training edges.
    embeddings = model.encode(x, train_pos_edge_index, category_dict, numeric_dict)
    step_loss = model.recon_loss(embeddings, train_pos_edge_index)

    step_loss.backward()
    optimizer.step()

    return float(step_loss)

In [30]:
def test(pos_edge_index, neg_edge_index):
    """Evaluate the model on the given positive and negative edges.

    The node embeddings are computed without gradient tracking from the
    notebook-level ``x`` and ``train_pos_edge_index``.

    Args:
        pos_edge_index: edges that should be predicted as present.
        neg_edge_index: edges that should be predicted as absent.

    Returns:
        Whatever ``model.test`` returns; the training loop unpacks it as
        ``(auc, ap)``.
    """
    model.eval()

    with torch.no_grad():
        embeddings = model.encode(x, train_pos_edge_index, category_dict, numeric_dict)

    metrics = model.test(embeddings, pos_edge_index, neg_edge_index)
    return metrics

In [31]:
# Train for `epochs` steps (set in the configuration cell; the original loop
# hard-coded 400 and ignored that setting). After every step, report the
# training loss together with AUC / AP on the held-out test edges.
for epoch in range(epochs):
    loss = train()

    auc, ap = test(data.test_pos_edge_index, data.test_neg_edge_index)
    print('Epoch: {:03d}, Train Loss:{:.4f}, AUC: {:.4f}, AP: {:.4f}'.format(epoch+1, loss, auc, ap))

Epoch: 001, Train Loss:2.8316, AUC: 0.8668, AP: 0.8837
Epoch: 002, Train Loss:1.5028, AUC: 0.8880, AP: 0.9081
Epoch: 003, Train Loss:1.2293, AUC: 0.8905, AP: 0.9102
Epoch: 004, Train Loss:1.2071, AUC: 0.8901, AP: 0.9098
Epoch: 005, Train Loss:1.2464, AUC: 0.8920, AP: 0.9114
Epoch: 006, Train Loss:1.2059, AUC: 0.8952, AP: 0.9141
Epoch: 007, Train Loss:1.1564, AUC: 0.8965, AP: 0.9153
Epoch: 008, Train Loss:1.1523, AUC: 0.8977, AP: 0.9165
Epoch: 009, Train Loss:1.1691, AUC: 0.8961, AP: 0.9154
Epoch: 010, Train Loss:1.1665, AUC: 0.8938, AP: 0.9133
Epoch: 011, Train Loss:1.1483, AUC: 0.8915, AP: 0.9112
Epoch: 012, Train Loss:1.1414, AUC: 0.8903, AP: 0.9100
Epoch: 013, Train Loss:1.1476, AUC: 0.8897, AP: 0.9095
Epoch: 014, Train Loss:1.1466, AUC: 0.8896, AP: 0.9095
Epoch: 015, Train Loss:1.1380, AUC: 0.8894, AP: 0.9093
Epoch: 016, Train Loss:1.1344, AUC: 0.8895, AP: 0.9095
Epoch: 017, Train Loss:1.1373, AUC: 0.8900, AP: 0.9098
Epoch: 018, Train Loss:1.1395, AUC: 0.8904, AP: 0.9101
Epoch: 019

Epoch: 150, Train Loss:1.0700, AUC: 0.9224, AP: 0.9321
Epoch: 151, Train Loss:1.0684, AUC: 0.9224, AP: 0.9315
Epoch: 152, Train Loss:1.0680, AUC: 0.9229, AP: 0.9320
Epoch: 153, Train Loss:1.0720, AUC: 0.9241, AP: 0.9331
Epoch: 154, Train Loss:1.0660, AUC: 0.9205, AP: 0.9305
Epoch: 155, Train Loss:1.0664, AUC: 0.9218, AP: 0.9321
Epoch: 156, Train Loss:1.0676, AUC: 0.9228, AP: 0.9329
Epoch: 157, Train Loss:1.0656, AUC: 0.9207, AP: 0.9305
Epoch: 158, Train Loss:1.0653, AUC: 0.9223, AP: 0.9318
Epoch: 159, Train Loss:1.0655, AUC: 0.9250, AP: 0.9342
Epoch: 160, Train Loss:1.0645, AUC: 0.9219, AP: 0.9316
Epoch: 161, Train Loss:1.0627, AUC: 0.9219, AP: 0.9315
Epoch: 162, Train Loss:1.0629, AUC: 0.9248, AP: 0.9340
Epoch: 163, Train Loss:1.0623, AUC: 0.9239, AP: 0.9330
Epoch: 164, Train Loss:1.0622, AUC: 0.9221, AP: 0.9315
Epoch: 165, Train Loss:1.0616, AUC: 0.9235, AP: 0.9333
Epoch: 166, Train Loss:1.0609, AUC: 0.9230, AP: 0.9331
Epoch: 167, Train Loss:1.0605, AUC: 0.9221, AP: 0.9320
Epoch: 168

Epoch: 299, Train Loss:1.0133, AUC: 0.9290, AP: 0.9416
Epoch: 300, Train Loss:1.0097, AUC: 0.9245, AP: 0.9387
Epoch: 301, Train Loss:1.0069, AUC: 0.9283, AP: 0.9409
Epoch: 302, Train Loss:1.0099, AUC: 0.9280, AP: 0.9408
Epoch: 303, Train Loss:1.0131, AUC: 0.9254, AP: 0.9392
Epoch: 304, Train Loss:1.0033, AUC: 0.9247, AP: 0.9387
Epoch: 305, Train Loss:1.0088, AUC: 0.9290, AP: 0.9416
Epoch: 306, Train Loss:1.0098, AUC: 0.9274, AP: 0.9407
Epoch: 307, Train Loss:1.0092, AUC: 0.9289, AP: 0.9415
Epoch: 308, Train Loss:1.0063, AUC: 0.9205, AP: 0.9354
Epoch: 309, Train Loss:1.0060, AUC: 0.9299, AP: 0.9421
Epoch: 310, Train Loss:1.0082, AUC: 0.9288, AP: 0.9414
Epoch: 311, Train Loss:1.0069, AUC: 0.9225, AP: 0.9378
Epoch: 312, Train Loss:1.0086, AUC: 0.9213, AP: 0.9361
Epoch: 313, Train Loss:1.0030, AUC: 0.9314, AP: 0.9434
Epoch: 314, Train Loss:1.0164, AUC: 0.9270, AP: 0.9405
Epoch: 315, Train Loss:1.0081, AUC: 0.9215, AP: 0.9373
Epoch: 316, Train Loss:1.0147, AUC: 0.9310, AP: 0.9427
Epoch: 317

In [32]:
# torch.save does not create missing directories; make sure the target
# directory exists so a finished training run is not lost to a missing
# './weights' folder. A bare file name has no directory part to create.
weights_dir = os.path.dirname(pretrain_weights)
if weights_dir:
    os.makedirs(weights_dir, exist_ok=True)

# Persist only the encoder's parameters.
torch.save(model.encoder.state_dict(), pretrain_weights)