In [None]:
import time
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler

#from torch_geometric.nn import Node2Vec
from model import Node2Vec

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Transaction history, sorted by csmdt with a fresh 0..n-1 index.
data = (
    pd.read_csv('./data/sample_50k/sample_zip_if_cca_cdtx0001_hist.csv')
    .sort_values(by=['csmdt'], ignore_index=True)
)

# .item() unwraps the single Python object stored in the .npy file; it is used
# later via len() and .items(), so it is presumably a dict of chid -> node id.
# NOTE(review): allow_pickle=True unpickles the file on load — only use with trusted files.
idx_map = np.load('./data/sample_50k/sample_idx_map.npy', allow_pickle=True).item()

In [None]:
# Number of entries in idx_map; also used later to split user rows from item rows.
sample_size = len(idx_map)

# Keep rows whose chid maps to an id below sample_size. With a plain dict,
# chids missing from idx_map map to NaN and therefore fail the comparison.
in_sample = data['chid'].map(idx_map) < sample_size
data = data[in_sample].copy()

# Keep rows with csmdt before 2019-01-01 and only the columns used downstream.
before_2019 = data['csmdt'] < '2019-01-01'
data = data.loc[before_2019, ['chid', 'mcc', 'objam']]

In [None]:
# Unified node-id table for the graph: the first `sample_size` entries of
# idx_map keep the ids they already have, and every distinct MCC is given the
# next free id after them.
sample_idx_map = dict(list(idx_map.items())[:sample_size])

# NOTE(review): chid and mcc keys share this one dict, so an MCC value equal to
# a chid key would silently overwrite that entry — confirm the two key spaces
# are disjoint.
mcc_offset = len(sample_idx_map)
# Iterate the MCCs in sorted order: raw set iteration order is arbitrary (and
# for string keys can differ between kernel sessions), which would make the
# MCC ids, and anything saved from them, non-reproducible.
for position, mcc in enumerate(sorted(set(data.mcc))):
    sample_idx_map[mcc] = mcc_offset + position

In [None]:
# Average each column per (chid, mcc) pair; the mean objam becomes the edge value.
df_group = data.groupby(by=['chid', 'mcc']).mean()

# Flatten the (chid, mcc) index back into columns next to the averaged objam.
df_edge = df_group['objam'].reset_index(name='value')

# Replace the raw chid / mcc keys with their node ids.
for col in ('chid', 'mcc'):
    df_edge[col] = df_edge[col].map(sample_idx_map)

# Rescale the edge values into [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
df_edge[['value']] = scaler.fit_transform(df_edge[['value']])

df_edge.head(2)

In [None]:
# Stack every (chid, mcc, value) row together with its reversed
# (mcc, chid, value) counterpart, so each edge appears in both directions.
forward_edges = df_edge.values
backward_edges = df_edge[['mcc', 'chid', 'value']].values
edges = np.concatenate((forward_edges, backward_edges), axis=0)

In [None]:
# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Node2Vec here is the project-local implementation from model.py; it receives
# the transposed edge array (rows: source, target, value) as a float tensor.
model = Node2Vec(
    torch.Tensor(edges.T),
    embedding_dim=64,
    walk_length=2,
    context_size=2,
    walks_per_node=10,
    num_negative_samples=4,
    p=1,
    q=1,
    sparse=False,
).to(device)

In [None]:
# Batches produced by the model's own loader; train() unpacks each batch into
# three tensors.
# Earlier variant kept for reference:
#   model.loader(sample_range=range(sample_size), batch_size=2048, shuffle=True, num_workers=8)
loader = model.loader(shuffle=True, batch_size=2048, num_workers=16)

# Dense Adam, matching the sparse=False model above.
# Earlier variant kept for reference:
#   torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
optimizer = torch.optim.Adam(params=list(model.parameters()), lr=0.01)

In [None]:
# Project the untrained embeddings to 2-D as a baseline for the post-training plot.
model.eval()
embeds = model()
# Fixed random_state so the projection is reproducible across kernel restarts.
tsne_embeds = TSNE(n_components=2, random_state=0).fit_transform(embeds.detach().cpu().numpy())

In [None]:
colors = ['#065535', '#bada55']

# Rows below sample_size are plotted as users, the remaining rows as items.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(tsne_embeds[:sample_size, 0], tsne_embeds[:sample_size, 1], s=20, color=colors[0], label='user')
ax.scatter(tsne_embeds[sample_size:, 0], tsne_embeds[sample_size:, 1], s=20, color=colors[1], label='item')
# Title and axis labels so this figure can be told apart from the post-training one.
ax.set(title='Node2Vec embeddings before training (t-SNE)', xlabel='t-SNE 1', ylabel='t-SNE 2')
ax.legend();

In [None]:
def train():
    """Run one pass over ``loader`` and return the mean loss per batch.

    Uses the notebook-level ``model``, ``loader``, ``optimizer`` and ``device``.
    Each batch is unpacked into three tensors, moved to ``device`` and passed
    to ``model.loss``; the optimizer takes one step per batch.
    """
    model.train()
    batch_losses = []
    for pos_rw, neg_rw, pos_val in loader:
        optimizer.zero_grad()
        batch_loss = model.loss(
            pos_rw.to(device),
            neg_rw.to(device),
            pos_val.to(device),
        )
        batch_loss.backward()
        optimizer.step()
        # .item() turns the scalar loss tensor into a plain Python number.
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

In [None]:
# Number of full passes over the loader.
epochs = 200

t0 = time.time()

for ep in tqdm(range(epochs)):
    loss = train()

    # Fixed the 'LossL' typo in the log line.
    print('Epoch: {:02d}, Loss: {:.4f}'.format(ep+1, loss))
# Total wall-clock training time in seconds.
print(time.time() - t0)

In [None]:
# Project the trained embeddings to 2-D for comparison with the pre-training plot.
model.eval()
embeds = model()
# Fixed random_state so the projection is reproducible across kernel restarts.
tsne_embeds = TSNE(n_components=2, random_state=0).fit_transform(embeds.detach().cpu().numpy())

In [None]:
colors = ['#065535', '#bada55']

# Rows below sample_size are plotted as users, the remaining rows as items.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(tsne_embeds[:sample_size, 0], tsne_embeds[:sample_size, 1], s=20, color=colors[0], label='user')
ax.scatter(tsne_embeds[sample_size:, 0], tsne_embeds[sample_size:, 1], s=20, color=colors[1], label='item')
# Title and axis labels so this figure can be told apart from the pre-training one.
ax.set(title='Node2Vec embeddings after training (t-SNE)', xlabel='t-SNE 1', ylabel='t-SNE 2')
ax.legend();

In [None]:
#np.save('data/sample_50k/embedding/node2vec_50k_0112', embeds.detach().cpu().numpy())
#np.save('data/sample_50k/embedding/sample_cust_mcc_idx_map_50k_0112', sample_idx_map)