# Pretrain

In [28]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, average_precision_score
import torch
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, GAE, VGAE
from torch_geometric.utils import train_test_split_edges
from torch_geometric.data import Data

In [29]:
# Directory that holds every input file used by this notebook.
sample_path = './data/'
# File loaded below (with allow_pickle=True) as `idx_map`.
chid_dict_file = 'sample_idx_map.npy'
# Transaction history CSV.
cdtx_file = 'sample_zip_if_cca_cdtx0001_hist.csv'
# Per-customer feature CSV.
cust_f_file = 'sample_zip_if_cca_cust_f.csv'

In [30]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this
# file from a trusted source.
idx_map = np.load(
    os.path.join(sample_path, chid_dict_file),
    allow_pickle=True,
).tolist()

# Transaction records.
df_cdtx = pd.read_csv(os.path.join(sample_path, cdtx_file))

# User features, with exact duplicate rows removed and the index renumbered.
df_cust_f = pd.read_csv(os.path.join(sample_path, cust_f_file)).drop_duplicates(ignore_index=True)

print(len(idx_map), df_cdtx.shape, df_cust_f.shape)

50000 (6654938, 10) (1176172, 32)


In [31]:
# Extend idx_map with one node id per distinct mcc, numbered after the entries
# already present, then replace the raw ids by node ids in both frames.
n_existing = len(idx_map)

# Series.unique() keeps first-appearance order, so the mcc node ids are the same
# on every run (iterating a set of strings is not, because of hash randomisation).
mcc_values = df_cdtx['mcc'].unique()

# chid and mcc share one dict: an mcc equal to an existing key would silently
# re-point that key at a merchant node, so fail loudly instead.
colliding = [value for value in mcc_values if value in idx_map]
if colliding:
    raise ValueError(f'mcc values collide with existing idx_map keys: {colliding[:5]}')

for offset, mcc_value in enumerate(mcc_values):
    idx_map[mcc_value] = n_existing + offset

df_cdtx['chid'] = df_cdtx['chid'].map(idx_map)
df_cdtx['mcc'] = df_cdtx['mcc'].map(idx_map)

df_cust_f['chid'] = df_cust_f['chid'].map(idx_map)

502it [00:00, 419932.31it/s]


In [32]:
# Keep only the rows whose data_ym equals '2019-01-01', ordered by chid.
df_cust_f_pre = df_cust_f.loc[df_cust_f['data_ym'] == '2019-01-01'].sort_values('chid')

In [33]:
# Columns handled as categorical features.
category_cols = ['masts', 'educd', 'naty', 'trdtp', 'poscd', 'cuorg']

# Every other column except the id/date columns, in the frame's own column order.
numeric_cols = [col for col in df_cust_f.columns
                if col not in category_cols and col not in ('chid', 'data_ym', 'data_dt')]

In [34]:
# Per categorical column: value -> consecutive integer code, in sorted value order.
mapper = {col: {value: index for index, value in enumerate(sorted(df_cust_f_pre[col].unique()))}
          for col in category_cols}

# Assign whole columns instead of writing through `.loc[:, cols]`: on pandas >= 2.0
# the `.loc` form writes in place and keeps the old dtype, so object columns would
# stay object and the later conversion to a tensor would fail.
df_cust_f_pre[category_cols] = df_cust_f_pre[category_cols].apply(lambda x: x.map(mapper[x.name]))

print(df_cust_f_pre.shape)
df_cust_f_pre.head(2)

(50000, 32)


Unnamed: 0,chid,data_ym,monin,wrky,first_mob,data_dt,masts,educd,naty,trdtp,...,constant_u2_ind,constant_u3_ind,constant_u4_ind,constant_l2_ind,constant_l3_ind,constant_l4_ind,constant_change,growth_rate,monotone_up,monotone_down
827843,0,2019-01-01,173472,0,192.0,2019-02-01,2,4,1,20,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.8,0.0,3.0
107093,1,2019-01-01,248914,0,192.0,2019-02-01,0,1,1,22,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Keep only the model inputs, categorical columns first. 'data_ym' and 'data_dt'
# are in neither list, so the selection already leaves them out; dropping them
# in place beforehand was redundant and made a re-run of this cell raise KeyError.
# .copy() gives the following cells an independent frame to assign into.
df_cust_f_pre = df_cust_f_pre[category_cols + numeric_cols].copy()

In [36]:
# Fit a min-max scaler on the numeric columns and overwrite them with the scaled
# values; the fitted scaler stays available as x_scaler.
x_scaler = MinMaxScaler()
x_scaler.fit(df_cust_f_pre[numeric_cols])
df_cust_f_pre[numeric_cols] = x_scaler.transform(df_cust_f_pre[numeric_cols])

In [37]:
# Node feature matrix: one row per df_cust_f_pre row, followed by all-zero rows
# so that the total row count equals len(idx_map).
x_feature = torch.Tensor(df_cust_f_pre.to_numpy())
x_feature = torch.cat(
    [
        x_feature,
        torch.zeros(len(idx_map) - x_feature.shape[0], x_feature.shape[1]),
    ],
    dim=0,
)

In [51]:
# Keep only transactions whose csmdt compares below '2019-01-01'.
df_cdtx = df_cdtx.loc[df_cdtx['csmdt'] < '2019-01-01']
# Distinct row pairs taken from the columns at positions 2 and 5.
# NOTE(review): presumably these are chid and mcc - confirm against the CSV layout.
edge_index = df_cdtx.iloc[:, [2, 5]].drop_duplicates().to_numpy()

def make_edges_symmetry(edge_index):
    """Return both directions of every edge as a ``[2, 2 * E]`` LongTensor.

    Args:
        edge_index: ``[E, 2]`` array-like or tensor with one ``(source, target)``
            pair per row. An empty input is accepted.

    Returns:
        torch.LongTensor whose first ``E`` columns are the input edges and whose
        last ``E`` columns are the same edges with source and target swapped.
    """
    if isinstance(edge_index, torch.Tensor):
        edge_index = edge_index.detach().cpu().numpy()
    # reshape keeps an empty input at shape (0, 2) so the concatenation lines up.
    edge_index = np.asarray(edge_index).reshape(-1, 2)
    # Swap the two columns in one vectorised step instead of looping over rows.
    new_edge = edge_index[:, ::-1]
    print(new_edge.shape, edge_index.shape)
    both_directions = np.concatenate([edge_index, new_edge], 0).T
    return torch.as_tensor(np.ascontiguousarray(both_directions), dtype=torch.long)

# Rebind edge_index from the [E, 2] pair array to the symmetric [2, 2E] LongTensor.
# NOTE: not idempotent - re-running this line alone feeds the tensor back in.
edge_index = make_edges_symmetry(edge_index)

(518928, 2) (518928, 2)


In [54]:
# Inspection only: shape of the distinct pair array built from columns 2 and 5.
df_cdtx.iloc[:,[2,5]].drop_duplicates().to_numpy().shape

(518928, 2)

In [48]:
# Bundle the node feature matrix and the edge list into a single Data object.
data = Data(x=x_feature, edge_index=edge_index)

In [55]:
def sample_neg_edges(pos_edges, num_nodes, n_user, generator=None):
    """Sample node pairs from the user x item block that are not positive edges.

    Nodes ``0 .. n_user - 1`` form one side and ``n_user .. num_nodes - 1`` the
    other; only pairs with the first endpoint on the first side and the second
    endpoint on the second side are candidates.

    Args:
        pos_edges: ``[2, E]`` tensor of positive edges (both directions may be present).
        num_nodes: total number of nodes.
        n_user: number of nodes on the first side.
        generator: optional ``torch.Generator`` for reproducible sampling.

    Returns:
        The result of ``make_edges_symmetry`` on the sampled pairs. The number
        of sampled pairs equals the number of positive edges with
        ``row < col``, capped at the number of available candidates.
    """
    row, col = pos_edges.cpu()
    upper = row < col
    row, col = row[upper], col[upper]
    num_samples = row.size(0)

    # Only the (first side) x (second side) block is ever sampled from, so build
    # just that block instead of a dense num_nodes x num_nodes mask.
    neg_block = torch.ones(n_user, num_nodes - n_user, dtype=torch.bool)
    in_block = (row < n_user) & (col >= n_user)
    neg_block[row[in_block], col[in_block] - n_user] = False

    neg_row, neg_col = neg_block.nonzero(as_tuple=False).t()
    neg_col = neg_col + n_user

    # Permute ALL candidates and then truncate. Permuting only range(num_samples)
    # would always return the first num_samples candidates (lowest-index users),
    # merely shuffled.
    perm = torch.randperm(neg_row.size(0), generator=generator)[:num_samples]
    neg_row, neg_col = neg_row[perm], neg_col[perm]
    neg_edge_index = torch.stack([neg_row, neg_col], 0)
    # Hand over a NumPy [K, 2] array, the same layout the positive edges use.
    return make_edges_symmetry(neg_edge_index.T.numpy())

# Take the sizes from the data instead of hard-coding 50502 / 50000: the first
# len(df_cust_f_pre) rows of the feature matrix come from df_cust_f_pre and the
# remaining nodes are the zero-padded ones.
neg_edge_index = sample_neg_edges(data.edge_index, data.num_nodes, len(df_cust_f_pre))

(518928, 2) (518928, 2)


In [57]:
# Inspection only: shape of the sampled negative edge tensor.
neg_edge_index.shape

torch.Size([2, 1037856])

In [58]:
def feature_index(x, feature_cols):
    """Map each name in ``feature_cols`` to its column position in ``x``.

    Args:
        x: object exposing ``.columns`` (a DataFrame in this notebook).
        feature_cols: iterable of column names to look up.

    Returns:
        dict ``{name: position}`` in the order of ``feature_cols``.

    Raises:
        ValueError: from ``list.index`` when a name is not a column of ``x``.
    """
    column_names = list(x.columns)
    return {name: column_names.index(name) for name in feature_cols}

In [59]:
# Column positions of the categorical / numeric features inside df_cust_f_pre.
category_dict = feature_index(df_cust_f_pre, category_cols)
numeric_dict = feature_index(df_cust_f_pre, numeric_cols)

In [60]:
# Inspection only: show the categorical column positions.
category_dict

{'masts': 0, 'educd': 1, 'naty': 2, 'trdtp': 3, 'poscd': 4, 'cuorg': 5}

In [61]:
class GCNEncoder(torch.nn.Module):
    """Two-layer GCN encoder over embedded categorical and raw numeric node features.

    Args:
        in_channels: width of the concatenated input, i.e. ``embedding_dim`` per
            categorical column plus one per numeric column.
        out_channels: size of the output node embedding; the hidden layer is
            twice as wide.
        category_cols: names of the categorical features (keys of ``embedding_dict``).
        category_dims: number of codes per categorical feature, aligned with
            ``category_cols``.
        category_index: optional ``{name: column position in x}`` for the
            categorical features. When omitted, the module-level
            ``category_dict`` is read at forward time, as before.
        numeric_index: optional ``{name: column position in x}`` for the numeric
            features. When omitted, the module-level ``numeric_dict`` is used.
        embedding_dim: width of each categorical embedding (default 64).
    """

    def __init__(self, in_channels, out_channels, category_cols, category_dims,
                 category_index=None, numeric_index=None, embedding_dim=64):
        super().__init__()
        # Layer creation order is kept (conv1, conv2, embeddings) so seeded
        # initialisation matches the previous version.
        self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
        self.conv2 = GCNConv(2 * out_channels, out_channels, cached=True)
        self.embedding_dict = torch.nn.ModuleDict({
            category_col: torch.nn.Embedding(category_dim, embedding_dim)
            for category_col, category_dim in zip(category_cols, category_dims)
        })
        self.category_index = category_index
        self.numeric_index = numeric_index

    def forward(self, x, edge_index):
        """Embed categorical columns, append numeric columns, apply both GCN layers."""
        # Fall back to the notebook globals only when no mapping was supplied.
        category_index = category_dict if self.category_index is None else self.category_index
        numeric_index = numeric_dict if self.numeric_index is None else self.numeric_index

        category_embeddings = [self.embedding_dict[name](x[:, position].long())
                               for name, position in category_index.items()]
        category_embeddings = torch.cat(category_embeddings, -1)
        # Build the index on x's device so no host/device hop is needed per call.
        numeric_idx = torch.tensor(list(numeric_index.values()), dtype=torch.long, device=x.device)
        x = torch.cat([category_embeddings, x[:, numeric_idx]], -1)
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

class LinearEncoder(torch.nn.Module):
    """Single-layer GCN encoder over embedded categorical and raw numeric node features.

    Args:
        in_channels: width of the concatenated input fed to the convolution.
        out_channels: size of the output node embedding.
        category_cols: names of the categorical features (keys of ``embedding_dict``).
        category_dims: number of codes per categorical feature, aligned with
            ``category_cols``.
        category_index: optional ``{name: column position in x}`` for the
            categorical features. When omitted, the module-level
            ``category_dict`` is read at forward time, as before.
        numeric_index: optional ``{name: column position in x}`` for the numeric
            features. When omitted, the module-level ``numeric_dict`` is used.
        embedding_dim: width of each categorical embedding; defaults to
            ``out_channels``, as before.
    """

    def __init__(self, in_channels, out_channels, category_cols, category_dims,
                 category_index=None, numeric_index=None, embedding_dim=None):
        super().__init__()
        if embedding_dim is None:
            embedding_dim = out_channels
        self.conv = GCNConv(in_channels, out_channels, cached=True)
        self.embedding_dict = torch.nn.ModuleDict({
            category_col: torch.nn.Embedding(category_dim, embedding_dim)
            for category_col, category_dim in zip(category_cols, category_dims)
        })
        self.category_index = category_index
        self.numeric_index = numeric_index

    def forward(self, x, edge_index):
        """Embed categorical columns, append numeric columns, apply the GCN layer."""
        # Fall back to the notebook globals only when no mapping was supplied.
        category_index = category_dict if self.category_index is None else self.category_index
        numeric_index = numeric_dict if self.numeric_index is None else self.numeric_index

        category_embeddings = [self.embedding_dict[name](x[:, position].long())
                               for name, position in category_index.items()]
        category_embeddings = torch.cat(category_embeddings, -1)
        # Build the index on x's device so no host/device hop is needed per call.
        numeric_idx = torch.tensor(list(numeric_index.values()), dtype=torch.long, device=x.device)
        x = torch.cat([category_embeddings, x[:, numeric_idx]], -1)

        return self.conv(x, edge_index)


In [62]:
# Size each embedding table from `mapper`, the same dict that produced the
# integer codes, so every code is a valid row. nunique() on df_cust_f counted a
# different frame and ignores NaN, so it could disagree with the codes in use.
category_dims = [len(mapper[feat]) for feat in category_cols]
out_channels = 64
# 64 here is the width of each categorical embedding created by GCNEncoder.
num_features = len(category_dict)*64 + len(numeric_dict)

In [63]:
# Wrap a GCNEncoder (built with the sizes computed above) in a GAE.
model = GAE(GCNEncoder(num_features, out_channels, category_cols, category_dims))

In [65]:
# Use the first CUDA device when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
# Move the node features and both edge sets to the chosen device.
x = data.x.to(device)
train_pos_edge_index = data.edge_index.to(device)
# NOTE(review): train_neg_edge_index is not referenced by train() below - confirm whether that is intended.
train_neg_edge_index = neg_edge_index.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [66]:
def train():
    """Run one full-graph optimisation step and return the loss as a float.

    Reads the module-level ``model``, ``optimizer``, ``x``,
    ``train_pos_edge_index`` and ``data``.
    """
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    # NOTE(review): no negative edges are passed here, so the precomputed
    # train_neg_edge_index is unused - confirm this is intended.
    loss = model.recon_loss(z, train_pos_edge_index)
    # The KL term only exists for variational models; key it off the model type
    # instead of a hard-coded `if False` switch that could never run.
    if isinstance(model, VGAE):
        loss = loss + (1 / data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)

In [67]:
def test(pos_edge_index, neg_edge_index):
    """Evaluate the given positive/negative edges on embeddings of the training graph.

    The embeddings are computed in eval mode without gradient tracking from the
    module-level ``x`` and ``train_pos_edge_index``; the return value is
    whatever ``model.test`` returns.
    """
    model.eval()
    with torch.no_grad():
        embeddings = model.encode(x, train_pos_edge_index)
    return model.test(embeddings, pos_edge_index, neg_edge_index)

In [69]:
# Train for up to 400 epochs, printing the epoch number and loss after each step.
for epoch in range(1, 400 + 1):
    loss = train()
    print(epoch, loss)

1 23.783863067626953
2 28.18710708618164
3 26.689922332763672
4 17.434524536132812
5 5.4241108894348145
6 2.142585039138794
7 2.0948808193206787
8 1.979941964149475
9 1.5609272718429565
10 1.2508572340011597
11 1.090618371963501
12 0.9846475720405579
13 0.9202628135681152
14 0.8682698011398315
15 0.8347603678703308
16 0.8161513805389404
17 0.8120492100715637
18 0.805556058883667
19 0.8025895357131958
20 0.795393705368042
21 0.7936538457870483
22 0.7914948463439941
23 0.7949106693267822
24 0.7892694473266602
25 0.7866370677947998
26 0.789164662361145
27 0.7839850187301636
28 0.7835958003997803
29 0.7812873125076294
30 0.782378613948822
31 0.7804045081138611
32 0.7778943181037903
33 0.7776350378990173
34 0.7747756838798523
35 0.775317907333374
36 0.7744396924972534
37 0.7744811773300171
38 0.7700278759002686
39 0.772037923336029
40 0.7688952684402466
41 0.7697527408599854
42 0.7682616114616394
43 0.7680343389511108
44 0.7672072052955627
45 0.7653307318687439
46 0.765592098236084
47 0.764

KeyboardInterrupt: 

In [70]:
# Final embeddings are only exported, never back-propagated through, so switch
# to eval mode and skip autograd bookkeeping while encoding.
model.eval()
with torch.no_grad():
    z = model.encode(x, train_pos_edge_index)

In [71]:
# Detach from autograd, move to host memory and convert to NumPy
# (z is rebound from a tensor to an ndarray here).
z = z.detach().cpu().numpy()

In [72]:
# Write the embedding matrix to disk with np.save.
np.save('GCNEncoder_0126', z)