# Model to predict graph embedding from paths

## TQDM dark mode

In [1]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>


# Load data

In [2]:
import pickle
from pathlib import Path
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-trace-training.pkl', 'rb') as f:
    data_unflattened = pickle.load(f)

In [3]:
from itertools import chain
dataset = list(chain.from_iterable(data_unflattened))

len(dataset)

1489123

## Model Parameters

In [5]:
import torch

embedding_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Generate tokens

In [6]:
tokens = set()
distinct_paths = set()

def tokenize(path):
    assert len(path) % 2 == 0
    return [f'{path[idx]}|{path[idx+1]}' for idx in range(0, len(path), 2)]

for path, _ in dataset:
    path = tokenize(path.split(' '))
    tokens.update(path)
    distinct_paths.add(' '.join(path))
tokens = list(tokens)
tokens[:0] = ['.']
itos = tokens
stoi = {
    token: i for i, token in enumerate(itos)
}

print(f'Found {len(tokens)} tokens in {len(dataset)} entries')
print(f'Distinct paths: {len(distinct_paths)}')

Found 16 tokens in 1489123 entries
Distinct paths: 37


## Embed Graphs

In [7]:
from tqdm.auto import tqdm
from source.algorithm.utility import to_nx

nx_graphs = [
    to_nx(graph)
    for _, graph in tqdm(dataset, desc='Converting to NetworkX graphs')
]
len(nx_graphs)

  from .autonotebook import tqdm as notebook_tqdm


Converting to NetworkX graphs: 100%|██████████| 1489123/1489123 [00:23<00:00, 63455.25it/s] 


1489123

In [8]:
import numpy as np
lens = [len(g) for p, g in dataset]
print(f'Min: {min(lens)}, Max: {max(lens)}')
print(f'Avg: {np.average(lens)}, Std: {np.std(lens)}')


Min: 1, Max: 8
Avg: 1.0003753887355176, Std: 0.03720074179980315


## Train graph2vec

In [8]:
from karateclub import Graph2Vec

graph2vec = Graph2Vec(
    wl_iterations=80,
    attributed=True,
    dimensions=embedding_size,
    workers=4,
    epochs=5
)

graph2vec.fit(nx_graphs)
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-trace-graph2vec.pkl', 'wb') as file:
    pickle.dump(graph2vec, file)


In [9]:
from karateclub import Graph2Vec
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-trace-graph2vec.pkl', 'rb') as file:
    graph2vec = pickle.load(file)

In [10]:
graph_embeddings = graph2vec.get_embedding()
normalized_embeddings = [ 
    v/np.linalg.norm(v) 
    for v in graph_embeddings
    ]
len(graph_embeddings)

1489123

In [42]:
## Format training data
X, Y = [], []

for i, row in enumerate(dataset):
    path_str, _ = row
    path_list = tokenize(path_str.split(' '))
    path = [stoi[s] for s in path_list]
    context = [0] * block_size
    for i in range(min(len(path), block_size)):
        context[-i-1] = path[i]
    X.append(context)    
    Y.append(normalized_embeddings[i])

X = torch.tensor(X, device=device)
Y = torch.tensor(Y, device=device)
print(X.shape, Y.shape)

torch.Size([1489123, 8]) torch.Size([1489123, 64])


In [12]:
import numpy as np
np.linalg.norm(normalized_embeddings[0])

1.0

In [33]:
X[1]

tensor([0, 0, 0, 0, 0, 0, 8, 8])

In [59]:
import torch.nn as nn
import torch.optim as optim

vocab_size = len(tokens)
n_embedding = 10 # Embedding dimension
n_hidden = 100
learning_rate = 0.01
batch_size = 32768
block_size = 8

# Predict embedding, then lookup closest embedding from graph2vec... gradients don't flow through to graph2vec though :(
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embedding),
    nn.Flatten(),
    nn.Linear(n_embedding * block_size, n_embedding * block_size), nn.LeakyReLU(),
    nn.Linear(n_embedding * block_size, n_hidden), nn.LeakyReLU(),
    nn.Linear(n_hidden, embedding_size)
)
model.to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
lossi = []

In [45]:
layers = [
    nn.Embedding(vocab_size, n_embedding),
    nn.Flatten(),
    nn.Linear(n_embedding * block_size, n_embedding * block_size), nn.ReLU(),
    nn.Linear(n_embedding * block_size, n_hidden), nn.ReLU(),
    nn.Linear(n_hidden, embedding_size)
    ]
x = X[0]
x = layers[0](x)
print(x.shape)
x = layers[1](x)
print(x.shape)

x = layers[2](x)
print(x.shape)


torch.Size([8, 10])
torch.Size([8, 10])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (8x10 and 80x80)

In [60]:
import torch.nn.functional as F
from tqdm import tqdm
from torch.nn.utils import clip_grad_norm

max_steps = 1000
for i in range(max_steps):
    optimizer.zero_grad()

    ix = torch.randint(0, Y.shape[0], (batch_size,))
    X_batch, Y_batch = X[ix], Y[ix]

    # Forward pass
    output = model(X_batch) # output layer
    # print(embedding)
    # print(Y_batch)
    loss = F.mse_loss(output, Y_batch)
    # print(loss)

    # Backward pass
    for p in model.parameters():
        p.requires_grad = True
    loss.backward()
    
    # update
    # clip_grad_norm(model.parameters(), max_norm=1.0)
    optimizer.step()

    # Track stats
    if i % 100 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.7f}')
    lossi.append(loss.log10().item())
    # break
    

      0/   1000: 0.0343036
    100/   1000: 0.0042134
    200/   1000: 0.0022222
    300/   1000: 0.0018093
    400/   1000: 0.0016219
    500/   1000: 0.0014289
    600/   1000: 0.0013052
    700/   1000: 0.0011813
    800/   1000: 0.0010690
    900/   1000: 0.0009849


In [61]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 512028 KiB | 659401 KiB |   1105 GiB |   1105 GiB |
|       from large pool | 509070 KiB | 656526 KiB |    783 GiB |    782 GiB |
|       from small pool |   2957 KiB |   7058 KiB |    322 GiB |    322 GiB |
|---------------------------------------------------------------------------|
| Active memory         | 512028 KiB | 659401 KiB |   1105 GiB |   1105 GiB |
|       from large pool | 509070 KiB | 656526 KiB |    783 GiB |    782 GiB |
|       from small pool |   2957 KiB |   7058 KiB |    322 GiB |    322 GiB |
|---------------------------------------------------------------