In [1]:
import os
import time
import torch
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

#from torch_geometric.nn import Node2Vec
from model import Node2Vec

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def train():
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The function takes no arguments: it reads the notebook-level names
    ``model``, ``loader``, ``optimizer`` and ``device``, which must be
    defined before it is called.
    """
    def _step(batch):
        # One gradient update on a single (pos_rw, neg_rw, pos_val) batch.
        pos_rw, neg_rw, pos_val = batch
        optimizer.zero_grad()
        batch_loss = model.loss(pos_rw.to(device), neg_rw.to(device), pos_val.to(device))
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    # Average of the per-batch losses (not weighted by batch size).
    return sum(_step(batch) for batch in loader) / len(loader)

In [4]:
path = './data/sample_50k'

# Transaction table, ordered by the csmdt column with a fresh 0..n-1 index.
data = (
    pd.read_csv(os.path.join(path, 'sample_zip_if_cca_cdtx0001_hist.csv'))
    .sort_values(by=['csmdt'], ignore_index=True)
)

# The .npy file wraps a single Python object (used below as a chid -> index
# mapping); .item() unwraps it.
# NOTE(review): allow_pickle=True executes pickled code on load -- only use
# this on files you trust.
chid_idx_map = np.load(os.path.join(path, 'sample_idx_map.npy'), allow_pickle=True).item()

# Distinct csmdt values with their last three characters removed, in
# ascending order (printed as e.g. '2018-01' by the training cell).
months = sorted({day[:-3] for day in data.csmdt})

In [5]:
# Keep only transactions whose customer index is below sample_size.
# chids missing from chid_idx_map map to NaN, compare False, and are dropped.
sample_size = len(chid_idx_map)
keep = data.chid.map(chid_idx_map) < sample_size
data = data.loc[keep].copy()

In [6]:
# Unified node-id table for the graph: customer entries first, then one id per
# distinct mcc value, numbered from len(idx_map) upwards.

# Copy at most the first `sample_size` entries of chid_idx_map (insertion order).
idx_map = {
    key: value
    for _, (key, value) in zip(range(sample_size), chid_idx_map.items())
}

l = len(idx_map)
# Iterate the mcc values in sorted order rather than raw set order: set
# iteration order for strings depends on per-process hash randomisation, which
# would give every kernel restart a different mcc -> id assignment and make
# saved embeddings incomparable between runs. key=str keeps the sort valid
# even if the column holds mixed types.
for i, j in enumerate(sorted(set(data.mcc), key=str)):
    idx_map[str(j)] = i + l

In [7]:
# Train one Node2Vec model per sliding window and collect the embeddings.
#
# ws: window size
# sw: sliding window
ws = 12
batch_size = 2048
learning_rate = 1e-2
epochs = 400
# Number of consecutive windows; window i is bounded by months[i] and months[i+ws].
n_windows = 12

# Fail before any training starts instead of hitting an IndexError on
# months[i+ws] part-way through a multi-hour run.
if len(months) < n_windows + ws:
    raise ValueError(
        'Need at least {} distinct months for {} windows of size {}, got {}'.format(
            n_windows + ws, n_windows, ws, len(months)))

# Loop-invariant, so resolved once. train() reads `device`, `model`, `loader`
# and `optimizer` as notebook globals, hence these names must stay as they are.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embeds_list = []

t0 = time.time()
for i in range(n_windows):
    print('Range:', months[i], months[i+ws])

    # csmdt is compared as text against the month prefixes. Assuming csmdt is
    # 'YYYY-MM-DD' text, any date inside months[i+ws] sorts after the bare
    # 'YYYY-MM' bound, so the window effectively ends with the previous month.
    mask = data.index[data.csmdt.between(months[i], months[i+ws])]
    data_sw = data.loc[mask, ['chid', 'mcc', 'objam']]

    print('\tInteraction:', data_sw.shape)

    # One edge per (chid, mcc) pair, weighted by the mean objam in the window.
    df_group = data_sw.groupby(by=['chid', 'mcc']).mean()
    df_edge = df_group.reset_index().rename(columns={'objam': 'value'})
    df_edge['log_value'] = np.log1p(df_edge.value)

    # Rescale the log weights to [0, 1]; the scaler is refit for every window.
    scaler = MinMaxScaler(feature_range=(0, 1))
    df_edge['mm_log_value'] = scaler.fit_transform(df_edge.loc[:, ['log_value']])

    # Translate raw ids into node indices.
    df_edge['chid'] = df_edge.chid.map(idx_map)
    df_edge['mcc'] = df_edge.mcc.map(idx_map)
    # Series.map yields NaN for keys missing from idx_map; stop here rather
    # than feeding NaN node ids into the model.
    if df_edge[['chid', 'mcc']].isna().any().any():
        raise ValueError('Some chid/mcc values have no entry in idx_map')

    print('\tEdge:', df_edge.shape)

    # Add every edge in both directions (chid -> mcc and mcc -> chid).
    forward = df_edge.loc[:, ['chid', 'mcc', 'mm_log_value']].values
    backward = df_edge.loc[:, ['mcc', 'chid', 'mm_log_value']].values
    edges = np.concatenate([forward, backward], axis=0)

    ## build model
    model = Node2Vec(torch.Tensor(edges.T), embedding_dim=64, num_nodes=len(idx_map),
                     walk_length=2, context_size=2, walks_per_node=10,
                     num_negative_samples=4, p=1, q=1, sparse=False).to(device)

    # Warm start: initialise from the embeddings learned on the previous window.
    if i > 0:
        model.embedding.weight.data.copy_(torch.from_numpy(embeds_list[-1]))

    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=8)
    optimizer = torch.optim.AdamW(list(model.parameters()), lr=learning_rate)

    print('\n\tTraining start')
    t1 = time.time()

    for ep in range(epochs):
        loss = train()

        # Report every 25th epoch only, to keep the log short.
        if (ep+1) % 25 == 0:
            print('\tEpoch: {:02d}, Loss {:.4f}'.format(ep+1, loss))
    print('\tTraining cost: {:.2f}\n'.format(time.time() - t1))

    # Calling the model with no arguments is used here to fetch the embeddings.
    embeds = model().detach().cpu().numpy()
    embeds_list.append(embeds)

print('Total cost: {:.2f}\n'.format(time.time() - t0))

Range: 2018-01 2019-01
	Interaction: (2785843, 3)
	Edge: (518927, 5)

	Training start
	Epoch: 25, Loss 1.3446
	Epoch: 50, Loss 0.8367
	Epoch: 75, Loss 0.7609
	Epoch: 100, Loss 0.7431
	Epoch: 125, Loss 0.7379
	Epoch: 150, Loss 0.7366
	Epoch: 175, Loss 0.7354
	Epoch: 200, Loss 0.7311
	Epoch: 225, Loss 0.7325
	Epoch: 250, Loss 0.7293
	Epoch: 275, Loss 0.7313
	Epoch: 300, Loss 0.7290
	Epoch: 325, Loss 0.7287
	Epoch: 350, Loss 0.7276
	Epoch: 375, Loss 0.7268
	Epoch: 400, Loss 0.7268
	Training cost: 591.27

Range: 2018-02 2019-02
	Interaction: (2837070, 3)
	Edge: (526835, 5)

	Training start
	Epoch: 25, Loss 1.3469
	Epoch: 50, Loss 0.8396
	Epoch: 75, Loss 0.7637
	Epoch: 100, Loss 0.7472
	Epoch: 125, Loss 0.7398
	Epoch: 150, Loss 0.7386
	Epoch: 175, Loss 0.7363
	Epoch: 200, Loss 0.7306
	Epoch: 225, Loss 0.7304
	Epoch: 250, Loss 0.7306
	Epoch: 275, Loss 0.7310
	Epoch: 300, Loss 0.7280
	Epoch: 325, Loss 0.7284
	Epoch: 350, Loss 0.7276
	Epoch: 375, Loss 0.7266
	Epoch: 400, Loss 0.7265
	Training 

In [None]:
# 2-D t-SNE view of each window's embeddings. Rows below sample_size are drawn
# as 'user', the remaining rows as 'item'.
colors = ['#065535', '#bada55']
groups = (
    ('user', colors[0], slice(None, sample_size)),
    ('item', colors[1], slice(sample_size, None)),
)
for embeds in embeds_list:
    # Fixed random_state so the projection of a given array is repeatable.
    projector = TSNE(n_components=2, n_jobs=8, random_state=4036)
    tsne_embeds = projector.fit_transform(embeds)

    fig, ax = plt.subplots(figsize=(6, 6))
    for label, color, rows in groups:
        ax.scatter(tsne_embeds[rows, 0], tsne_embeds[rows, 1], s=20, color=color, label=label)
    ax.legend()
    plt.show()

In [8]:
# Persist the per-window embeddings together with the node-id table.
# np.save does not create missing directories, so make sure the target exists.
out_dir = os.path.join(path, 'embedding_ws12_noCP')
os.makedirs(out_dir, exist_ok=True)

# One file per trained window, named after the window's start and end month
# with the dashes removed. zip stops at the shortest input, so only windows
# that were actually trained are written.
for embeds, ms, me in zip(embeds_list, months[:12], months[ws:ws+12]):
    path_ = os.path.join(out_dir, 'node2vec_50k_{}_{}'.format(ms.replace('-', ''), me.replace('-', '')))
    np.save(path_, embeds)

# idx_map is a dict, so it is stored as a pickled object array; reading it
# back needs np.load(..., allow_pickle=True).item().
np.save(os.path.join(out_dir, 'cust_mcc_idx_map_50k'), idx_map)

In [None]:
# For each sliding window, plot the distribution of the mean objam per
# (chid, mcc) pair: raw on the left panel, log1p-transformed on the right.
for i in range(12):
    print('Range:', months[i], months[i+ws])

    # Rows whose csmdt lies between the two month bounds (text comparison).
    in_window = data.csmdt.between(months[i], months[i+ws])
    mask = data.index[in_window]
    data_sw = data.loc[mask, ['chid', 'mcc', 'objam']]

    # Mean objam per (chid, mcc) pair, flattened into ordinary columns.
    df_group = data_sw.groupby(by=['chid', 'mcc']).mean()
    df_edge = pd.DataFrame([list(pair) for pair in df_group.index], columns=['chid', 'mcc'])
    df_edge = df_edge.assign(value=df_group.objam.values)
    df_edge = df_edge.assign(log_value=np.log1p(df_edge.value))

    with sns.axes_style("darkgrid"):
        fig, axes = plt.subplots(1, 2, figsize=(16, 4), sharey=True)
        fig.suptitle('Data in range({}, {})'.format(months[i], months[i+ws]))

        # Same histogram settings for both panels; only the column differs.
        for ax, column in zip(axes, ('value', 'log_value')):
            sns.histplot(ax=ax, x=column, data=df_edge, kde=True, element='step', stat='probability')
        plt.show()