# Model to predict graph embedding from paths

## TQDM dark mode

In [28]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>


# Load data

In [3]:
import pickle
from pathlib import Path
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-trace-training.pkl', 'rb') as f:
    data_unflattened = pickle.load(f)

In [4]:
from itertools import chain
dataset = list(chain.from_iterable(data_unflattened))

len(dataset)

1489123

# Train model

## Model Parameters

In [5]:
import torch

batch_size = 64
block_size = 8
embedding_size = 64
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 0.1

## Generate tokens

In [6]:
tokens = set()
distinct_paths = set()

def tokenize(path):
    assert len(path) % 2 == 0
    return [f'{path[idx]}|{path[idx+1]}' for idx in range(0, len(path), 2)]

for path, _ in dataset:
    path = tokenize(path.split(' '))
    tokens.update(path)
    distinct_paths.add(' '.join(path))
tokens = list(tokens)
tokens[:0] = ['.']
itos = tokens
stoi = {
    token: i for i, token in enumerate(itos)
}

print(f'Found {len(tokens)} tokens in {len(dataset)} entries')
print(f'Distinct paths: {len(distinct_paths)}')

Found 16 tokens in 1489123 entries
Distinct paths: 37


## Embed Graphs

In [7]:
from tqdm.auto import tqdm
from source.algorithm.utility import to_nx

nx_graphs = [
    to_nx(graph)
    for _, graph in tqdm(dataset, desc='Converting to NetworkX graphs')
]
len(nx_graphs)

  from .autonotebook import tqdm as notebook_tqdm
Converting to NetworkX graphs:   0%|          | 0/1489123 [00:00<?, ?it/s]

Converting to NetworkX graphs: 100%|██████████| 1489123/1489123 [00:23<00:00, 63457.86it/s] 


1489123

In [8]:
import numpy as np
lens = [len(g) for p, g in dataset]
print(f'Min: {min(lens)}, Max: {max(lens)}')
print(f'Avg: {np.average(lens)}, Std: {np.std(lens)}')


Min: 1, Max: 8
Avg: 1.0003753887355176, Std: 0.03720074179980315


In [8]:
from karateclub import Graph2Vec

graph2vec = Graph2Vec(
    wl_iterations=80,
    attributed=True,
    dimensions=embedding_size,
    workers=4,
    epochs=5
)

graph2vec.fit(nx_graphs)
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-theia-graph2vec.pkl', 'wb') as file:
    pickle.dump(graph2vec, file)


In [9]:
from karateclub import Graph2Vec
with open(Path('F:\\') / 'data' / 'prov_dp' / 'tc3-theia-graph2vec.pkl', 'rb') as file:
    graph2vec = pickle.load(file)

In [10]:
graph_embeddings = graph2vec.get_embedding()
len(graph_embeddings)

1489123

In [1]:
## Format training data
X, Y = [], []

for i, row in enumerate(dataset):
    path_str, _ = row
    path_list = tokenize(path_str.split(' '))
    path = [stoi[s] for s in path_list]
    context = [0] * block_size
    for i in range(min(len(path), block_size)):
        context[-i-1] = path[i]
    print(path, context)
    break
    X.append(context)    
    Y.append(graph_embeddings[i])

X = torch.tensor(X)
Y = torch.tensor(Y)
print(X.shape, Y.shape)

NameError: name 'dataset' is not defined

In [19]:
X[1]

tensor([])

In [13]:
import torch.nn as nn

vocab_size = len(tokens)
n_embedding = 10 # Embedding dimension
n_hidden = 100

# Predict embedding, then lookup closest embedding from graph2vec... gradients don't flow through embeddings though :(
model = nn.Sequential(
    nn.Embedding(vocab_size, n_embedding),
    nn.Flatten(),
    nn.Linear(n_embedding * block_size, n_hidden), nn.ReLU(),
    nn.Linear(n_hidden, embedding_size)
)
lossi = []

In [17]:
from tqdm import tqdm
max_steps = 200000
for i in range(max_steps):
    ix = torch.randint(0, Y.shape[0], (batch_size,))
    X_batch, Y_batch = X[ix], Y[ix]

    # Forward pass
    embedding = model(X_batch) # output layer
    loss = F.cross_entropy(logits, Y_batch)

    # Backward pass
    for p in model.parameters():
        p.requires_grad = True
    loss.backward()
    
    # update
    for p in model.parameters():
        p.data += -learning_rate * p.grad

    # Track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())
    

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)