In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project source take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.display import display, HTML
# Inject a CSS rule that forces the notebook's `.container` element to the
# full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import os, re, sys
import random
from pathlib import Path

import numpy as np
import pandas as pd
# Locate the project root: the outermost directory named exactly 'PyDHNet' on
# the path from the filesystem root to the current working directory. Walking
# pathlib parents (instead of a regex on a '/'-separated string) works on any
# OS and does not mistake sibling names such as 'PyDHNet_old' for the project.
# If no such directory is found, fall back to '<cwd>/PyDHNet', i.e. assume the
# notebook was started from the directory that contains the checkout.
_cwd = Path.cwd()
_project_roots = [p for p in (_cwd, *_cwd.parents) if p.name == 'PyDHNet']
PROJ_PATH = _project_roots[-1] if _project_roots else _cwd / 'PyDHNet'

# Make the package directory and its src/ directory importable. Both are
# inserted at index 1, so src/ ends up ahead of the package directory.
sys.path.insert(1, str(PROJ_PATH / 'PyDHNet'))
sys.path.insert(1, str(PROJ_PATH / 'PyDHNet' / 'src'))

## Network Representation Learning

In [2]:
# Create the PyDHNet driver object from the `dblp.json` configuration file.
# NOTE(review): config_path is relative to the kernel's working directory,
# unlike the PROJ_PATH-based paths used elsewhere in this notebook — confirm
# the notebook is always launched from the directory containing
# ./PyDHNet/config/, or anchor this path on PROJ_PATH.
from PyDHNet import PyDHNet
pydhnet = PyDHNet(config_path='./PyDHNet/config/dblp.json')

In [3]:
# Display the configuration held by the PyDHNet object (rendered below as an
# OrderedDict of setting name -> value).
pydhnet.config

OrderedDict([('name', 'dblp'),
             ('num_time_steps', 8),
             ('max_size', 5),
             ('seed', 0),
             ('multilabel', True),
             ('num_workers', 0),
             ('sample_walk_len', 15),
             ('random_walk_len', 15),
             ('structure_patch_type', 'triangular_random_walk'),
             ('max_sim_epochs', 5),
             ('n_anchor_patches_structure', 45),
             ('n_triangular_walks', 10),
             ('n_processes', 4),
             ('batch_size', 64),
             ('meta_paths',
              '0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0 1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1-0-1 1-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1 2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2-2-1-2 0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0 1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1 2-1-0-1-2-1-0-1-2-1-0-1-2-1-0-1-2-1-0

In [None]:
# Preprocess data
# NOTE(review): presumably prepares the dataset named in pydhnet.config for
# the training cells below — confirm which artifacts this writes and where.
pydhnet.preprocess_data()

In [None]:
# Initialize data, model, trainer
# initialize() returns three objects; the train / test / infer cells below
# pass them back into the corresponding pydhnet methods.
data_module, model_module, trainer = pydhnet.initialize()

In [None]:
# Train
# Fresh data module, model module and trainer are created here (rather than
# reusing the objects from the previous cell) and handed to pydhnet.train().
data_module, model_module, trainer = pydhnet.initialize()
pydhnet.train(data_module, model_module, trainer)

In [None]:
# Test with all checkpoints
# Re-create the data module, model module and trainer, then call
# pydhnet.test() once per path returned by get_checkpoint_paths().
# If no checkpoints exist the loop body simply never runs.
data_module, model_module, trainer = pydhnet.initialize()
checkpoint_paths = pydhnet.get_checkpoint_paths()
for checkpoint_path in checkpoint_paths:
    pydhnet.test(data_module, model_module, trainer, checkpoint_path)

In [None]:
# Infer with the last checkpoint
# Generates embeddings using the final entry returned by
# get_checkpoint_paths(), reusing data_module / model_module from the cells
# above, and writes them under <PROJ_PATH>/output.
# NOTE(review): "last" means last in the returned order — confirm that
# get_checkpoint_paths() sorts checkpoints the way you expect.
checkpoint_paths = pydhnet.get_checkpoint_paths()
restore_model_dir = str(pydhnet.config['checkpoint_dir'])
if len(checkpoint_paths) == 0:
    # Fail with an actionable message instead of an opaque IndexError from
    # checkpoint_paths[-1] when nothing has been trained yet.
    raise FileNotFoundError(
        f"No checkpoints found in {restore_model_dir}; run the training cell first."
    )
restore_model_name = str(checkpoint_paths[-1].name)
output_dir = str(PROJ_PATH / 'output')
pydhnet.generate_embedding(data_module, model_module, restore_model_dir, restore_model_name, output_dir)

## Evaluation

In [3]:
# Link-prediction helpers from the project's `evaluation` module
# (presumably resolved through the sys.path entries added in the setup cell).
# NOTE(review): the cells below call eval_node_classification and
# eval_link_prediction, which are not imported in any visible cell of this
# notebook — confirm where they are defined and import them here so the
# notebook survives Restart & Run All.
from evaluation import predict_link_without_classifier, predict_link_with_classifier

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Build a synthetic demo table for the evaluation helpers:
#   f1..f4 : 100 rows of random integers in [0, 100)
#   tvt    : fixed split marker, 60 'train' / 20 'val' / 20 'test'
#   label  : balanced binary labels (50 ones, 50 zeros) in shuffled order
# Dedicated seeded generators make the demo reproducible across kernel
# restarts without touching the global `random` / `np.random` state.
DEMO_SEED = 0
feature_rng = np.random.default_rng(DEMO_SEED)
df = pd.DataFrame(feature_rng.integers(0, 100, size=(100, 4)), columns=['f1', 'f2', 'f3', 'f4'])
df['tvt'] = ['train'] * 60 + ['val'] * 20 + ['test'] * 20
labels = 50 * [1] + 50 * [0]
random.Random(DEMO_SEED).shuffle(labels)
df['label'] = labels

In [8]:
# Node-classification demo on the synthetic table: pass the four feature
# columns, the labels and the per-row split markers as plain Python lists,
# then show the returned metrics as a table.
feature_columns = ['f1', 'f2', 'f3', 'f4']
features = df.loc[:, feature_columns].to_numpy().tolist()
labels = df['label'].to_numpy().tolist()
train_val_test_index = df['tvt'].to_numpy().tolist()
results, models = eval_node_classification(
    features,
    labels,
    train_val_test_index,
)
results_table = pd.DataFrame(results)
display(results_table)

{'accuracy': {'train': 0.6166666666666667, 'val': 0.4, 'test': 0.45}, 'auc': {'train': 0.6318131256952169, 'val': 0.4166666666666667, 'test': 0.4444444444444444}, 'f1': {'train': 0.634920634920635, 'val': 0.45454545454545453, 'test': 0.5217391304347826}}


Unnamed: 0,accuracy,auc,f1
train,0.616667,0.631813,0.634921
val,0.4,0.416667,0.454545
test,0.45,0.444444,0.521739


In [9]:
# Link-prediction demo on the synthetic table: columns f1/f2 stand in for the
# source-node features and f3/f4 for the target-node features; labels and
# split markers are passed alongside as plain Python lists, and the returned
# metrics are shown as a table.
source_columns = ['f1', 'f2']
target_columns = ['f3', 'f4']
source_features = df.loc[:, source_columns].to_numpy().tolist()
target_features = df.loc[:, target_columns].to_numpy().tolist()
labels = df['label'].to_numpy().tolist()
train_val_test_index = df['tvt'].to_numpy().tolist()
results, models = eval_link_prediction(
    source_features,
    target_features,
    labels,
    train_val_test_index,
)
results_table = pd.DataFrame(results)
display(results_table)

{'sigmoid_auc': {'train': 0.5, 'val': 0.5, 'test': 0.5}, 'sigmoid_f1': {'train': 0.6813186813186815, 'val': 0.5714285714285715, 'test': 0.7096774193548387}, 'HAD_auc': {'train': 0.5706340378197998, 'val': 0.34375, 'test': 0.42424242424242425}, 'HAD_f1': {'train': 0.6753246753246753, 'val': 0.56, 'test': 0.4999999999999999}, 'AVG_auc': {'train': 0.5750834260289212, 'val': 0.3125, 'test': 0.46464646464646464}, 'AVG_f1': {'train': 0.5538461538461539, 'val': 0.43478260869565216, 'test': 0.5217391304347826}, 'L1_auc': {'train': 0.6095661846496107, 'val': 0.65625, 'test': 0.3434343434343434}, 'L1_f1': {'train': 0.4918032786885246, 'val': 0.5555555555555556, 'test': 0.380952380952381}, 'L2_auc': {'train': 0.5717463848720801, 'val': 0.6666666666666666, 'test': 0.36363636363636365}, 'L2_f1': {'train': 0.5671641791044777, 'val': 0.5714285714285714, 'test': 0.45454545454545453}}


Unnamed: 0,sigmoid_auc,sigmoid_f1,HAD_auc,HAD_f1,AVG_auc,AVG_f1,L1_auc,L1_f1,L2_auc,L2_f1
train,0.5,0.681319,0.570634,0.675325,0.575083,0.553846,0.609566,0.491803,0.571746,0.567164
val,0.5,0.571429,0.34375,0.56,0.3125,0.434783,0.65625,0.555556,0.666667,0.571429
test,0.5,0.709677,0.424242,0.5,0.464646,0.521739,0.343434,0.380952,0.363636,0.454545


## Subgraph Sampling

In [5]:
from subgraph_sampler import TemporalSubgraphSampler

# Resolve the DBLP input files from the project root rather than a
# machine-specific absolute path, so the cell runs on any checkout.
dblp_dir = PROJ_PATH / 'dataset' / 'dblp'

# Configure a sampler for node ids 0, 1 and 2.
# NOTE(review): max_size / number_of_nodes are assumed to bound the sampled
# subgraphs — confirm their exact meaning in TemporalSubgraphSampler.
sampler = TemporalSubgraphSampler(
    node_path=str(dblp_dir / 'node_types.csv'),
    edge_path=str(dblp_dir / 'temporal_edge_list.txt'),
    sampled_node_ids=[0, 1, 2],
    max_size=5,
    number_of_nodes=20,
    seed=0,
    output_dir='./',
)

# Sample the subgraphs (progress is reported per time id), then write them
# out via the sampler, which was configured with output_dir='./'.
sampler.sampling_temporal_subgraph()
sampler.write_temporal_subgraphs()

Sampling subgraph at time id: 0


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 39.79it/s]


Sampling subgraph at time id: 1


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 12.52it/s]


Sampling subgraph at time id: 2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 32.61it/s]


Sampling subgraph at time id: 3


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 32.01it/s]


Sampling subgraph at time id: 4


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 30.42it/s]


Sampling subgraph at time id: 5


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.41it/s]


Sampling subgraph at time id: 6


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 29.39it/s]


Sampling subgraph at time id: 7


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 38.00it/s]
