In [1]:
from numpy import genfromtxt
import pandas as pd
import GIDS
import torch
import time
import sys
import argparse, datetime
import dgl
import sklearn.metrics
import torch, torch.nn as nn, torch.optim as optim
import time, tqdm, numpy as np
from models import *
from dataloader import IGB260MDGLDataset
import csv
import warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if __name__ == '__main__':
    # Run configuration, spelled as individual command-line tokens.
    #
    # The tokens are passed to parse_args() explicitly. argparse only reads
    # sys.argv[1:], so storing the whole command line as the single element
    # sys.argv[0] meant that none of these options were ever parsed and every
    # value silently came from the parser defaults below.
    cli_args = [
        '--dataset_size=full',
        '--epochs', '1',
        '--batch_size', '4096',
        '--model_type', 'sage',
        '--num_layers', '3',
        '--fan_out', '10,5,5',
    ]
    # Keep sys.argv well-formed (program name first, then the options) for any
    # code that inspects it directly; this also drops the kernel's own
    # arguments, which argparse would otherwise reject.
    sys.argv = sys.argv[:1] + cli_args

    parser = argparse.ArgumentParser()

    # Dataset selection
    parser.add_argument('--path', type=str, default='/mnt/nvme14/IGB260M',
        help='path containing the datasets')
    parser.add_argument('--dataset_size', type=str, default='full',
        choices=['experimental', 'small', 'medium', 'large', 'full'],
        help='size of the datasets')
    parser.add_argument('--num_classes', type=int, default=19,
        choices=[19, 2983, 171,172, 173], help='number of classes')
    parser.add_argument('--in_memory', type=int, default=0,
        choices=[0, 1], help='0:read only mmap_mode=r, 1:load into memory')
    parser.add_argument('--synthetic', type=int, default=0,
        choices=[0, 1], help='0:nlp-node embeddings, 1:random')

    # Model selection and checkpointing
    parser.add_argument('--model_type', type=str, default='sage',
                          choices=['gat', 'sage', 'gcn'])
    parser.add_argument('--modelpath', type=str, default='deletethis.pt')
    parser.add_argument('--model_save', type=int, default=0)

    # Model parameters
    parser.add_argument('--fan_out', type=str, default='10,5,5')
    parser.add_argument('--batch_size', type=int, default=4096)
    parser.add_argument('--num_workers', type=int, default=0)

    parser.add_argument('--hidden_channels', type=int, default=128)
    parser.add_argument('--learning_rate', type=float, default=0.01)
    parser.add_argument('--decay', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--num_heads', type=int, default=4)

    parser.add_argument('--log_every', type=int, default=2)
    parser.add_argument('--device', type=int, default=0)

    parser.add_argument('--data', type=str, default='IGB')
    # Experiment (to be deleted)
    parser.add_argument('--emb_size', type=int, default=1024)

    parser.add_argument('--cpu_cache', type=int, default=0)
    parser.add_argument('--uva', type=int, default=0)
    parser.add_argument('--uva_graph', type=int, default=0)

    # Buffer layout used by the overlap analysis in the later cells:
    # batches are spread over num_gpus buffers of depth buffer_depth.
    parser.add_argument('--num_gpus', type=int, default=4)
    parser.add_argument('--buffer_depth', type=int, default=128)

    args = parser.parse_args(cli_args)

    # The dataset object is indexable; element 0 is the graph used below.
    dataset = IGB260MDGLDataset(args)
    g = dataset[0]

    # Use the requested CUDA device when available, otherwise fall back to CPU.
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    #gnn_sim(g, args, device)

node edge: tensor([[136011405,  28830019],
        [ 28830019, 136011405],
        [136011405,  81328391],
        ...,
        [ 44868518, 203113760],
        [203113760, 243329035],
        [243329035, 203113760]])
self graph:  {'created': ['csc'], 'not created': []}
self graph2:  {'created': ['csc'], 'not created': []}
self graph3:  {'created': ['csc'], 'not created': []}
self graph4:  {'created': ['csc'], 'not created': []}
self graph5:  {'created': ['csc'], 'not created': []}


In [4]:
# Sampler, dataloader, model and optimizer setup.
# Reads `args`, `g` and `device` from the argument-parsing cell.

# UVA is requested with --uva 1.
uva_flag = args.uva == 1

# Make the node features / labels reachable under the 'features' / 'labels' keys too.
g.ndata['features'] = g.ndata['feat']
g.ndata['labels'] = g.ndata['label']

# Node ids selected by each split mask.
train_nid, val_nid, test_nid = (
    torch.nonzero(g.ndata[mask_key], as_tuple=True)[0]
    for mask_key in ('train_mask', 'val_mask', 'test_mask')
)
in_feats = g.ndata['features'].shape[1]
dim = 1024  # NOTE(review): same value as the --emb_size default; confirm whether it should follow args.emb_size

num_gpus, buffer_depth = args.num_gpus, args.buffer_depth

# One fan-out per layer, parsed from the comma-separated --fan_out string.
sampler = dgl.dataloading.MultiLayerNeighborSampler(
    list(map(int, args.fan_out.split(','))))

# NOTE(review): feature_dim / use_alternate_streams are presumably extensions of
# the GIDS-patched DGL DataLoader -- confirm against the installed build.
loader_options = dict(
    batch_size=args.batch_size,
    shuffle=True,
    drop_last=False,
    num_workers=args.num_workers,
    use_uva=uva_flag,
    feature_dim=dim,
    use_alternate_streams=False,
)
train_dataloader = dgl.dataloading.DataLoader(
    g, train_nid, sampler, **loader_options)

# Instantiate the architecture selected by --model_type on the target device.
if args.model_type == 'gcn':
    model = GCN(
        in_feats, args.hidden_channels, args.num_classes, args.num_layers
    ).to(device)
elif args.model_type == 'sage':
    model = SAGE(
        in_feats, args.hidden_channels, args.num_classes, args.num_layers
    ).to(device)
elif args.model_type == 'gat':
    model = GAT(
        in_feats, args.hidden_channels, args.num_classes, args.num_layers,
        args.num_heads
    ).to(device)

# Model footprint: bytes held by parameters and by buffers, reported in MiB.
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
size_all_mb = (param_size + buffer_size) / 1024**2

loss_fcn = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(
    model.parameters(), lr=args.learning_rate, weight_decay=args.decay)

# Training loop
best_accuracy = 0
print("training start")


training start


In [40]:
# Record the input-node ids of sampled mini-batches, dealt round-robin over
# num_gpus buffers (step k goes to buffer k % num_gpus).
#
# The stop test runs after the append, so the batch at step
# buffer_depth * num_gpus is still stored: buffer 0 ends up with
# buffer_depth + 1 entries and the others with buffer_depth (provided the
# loader yields that many batches). The buffers are not reset per epoch, so
# with --epochs > 1 later epochs keep appending.
batch_buffer = [[] for _ in range(num_gpus)]

for epoch in tqdm.tqdm(range(args.epochs)):
    # Per-epoch counters and timers; nothing in this cell reads them.
    epoch_loss = gpu_mem_alloc = train_acc = idx = 0
    epoch_start = time.time()
    model.train()

    batch_input_time = train_time = transfer_time = e2e_time = 0
    e2e_time_start = time.time()

    # Each item unpacks to four values; only input_nodes is used here.
    # NOTE(review): the fourth value (`ret`) presumably comes from the
    # GIDS-patched loader -- confirm.
    for step, (input_nodes, seeds, blocks, ret) in enumerate(train_dataloader):
        print("step: ", step)
        print("input len: ", len(input_nodes))

        gpu_idx = step % num_gpus
        batch_buffer[gpu_idx].append(input_nodes.tolist())

        if step == buffer_depth * num_gpus:
            break

  0%|                                                                                                                                  | 0/1 [00:00<?, ?it/s]

step:  0
input len:  543033
step:  1
input len:  538403
step:  2
input len:  543925
step:  3
input len:  556197
step:  4
input len:  554665
step:  5
input len:  558439
step:  6
input len:  559808
step:  7
input len:  544978
step:  8
input len:  539904
step:  9
input len:  541355
step:  10
input len:  556565
step:  11
input len:  547316
step:  12
input len:  552920
step:  13
input len:  533303
step:  14
input len:  559833
step:  15
input len:  554987
step:  16
input len:  542080
step:  17
input len:  546701
step:  18
input len:  554048
step:  19
input len:  547749
step:  20
input len:  561165
step:  21
input len:  539877
step:  22
input len:  538576
step:  23
input len:  538232
step:  24
input len:  545197
step:  25
input len:  552519
step:  26
input len:  530941
step:  27
input len:  550257
step:  28
input len:  553716
step:  29
input len:  543036
step:  30
input len:  541280
step:  31
input len:  541658
step:  32
input len:  550983
step:  33
input len:  541218
step:  34
input len:  55

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:43<00:00, 103.80s/it]

step:  512
input len:  555395





In [42]:
# Number of batches stored for each GPU, one line per GPU.
per_gpu_counts = [len(batch_buffer[gpu]) for gpu in range(num_gpus)]
for count in per_gpu_counts:
    print(count)

129
128
128
128


In [164]:
# Reference batch: the first batch stored for GPU 0.
cur_batch = batch_buffer[0][0]

# Union of the node ids in the next four batches of the same GPU
# (entries 1..4 of buffer 0).
window_set = set().union(*(batch_buffer[0][1 + offset] for offset in range(4)))
                     

In [179]:
# Union of the node ids in the first four batches of each of the other
# three buffers (GPUs 1, 2 and 3).
nvlink_set = set()
for depth in range(4):
    for peer_gpu in (1, 2, 3):
        nvlink_set.update(batch_buffer[peer_gpu][depth])

In [180]:
# Set view of the reference batch for the intersections below, plus its
# length as a list (the denominator of the hit rates).
cur_set = set(cur_batch)
cur_batch_size = len(cur_batch)

In [181]:
# Nodes of the reference batch that also occur in the GPU-0 window set.
window_overlap = cur_set & window_set

In [182]:
# Nodes of the reference batch that also occur in the peer-GPU set.
nvlink_overlap = cur_set & nvlink_set

In [183]:
# Nodes found in either overlap set (counted once each).
combine_overlap = window_overlap | nvlink_overlap

In [184]:
# Sizes of the window, peer-GPU and combined overlap sets.
window_buffer_count, nvlink_overlap_count, combine_overlap_count = map(
    len, (window_overlap, nvlink_overlap, combine_overlap)
)

print("window buffer count: ", window_buffer_count)
print("nvlink buffer count: ", nvlink_overlap_count)
print("combin count: ", combine_overlap_count)


window buffer count:  46535
nvlink buffer count:  106563
combin count:  129047


In [185]:
# Hit rate = overlap count divided by the number of nodes in the reference batch.
window_hit, nvlink_hit, combine_hit = (
    overlap_count / cur_batch_size
    for overlap_count in (
        window_buffer_count, nvlink_overlap_count, combine_overlap_count
    )
)

print("window hit: ",window_hit)
print("nvlink hit: ",nvlink_hit)
print("combine hit: ",combine_hit)


window hit:  0.08569460787834257
nvlink hit:  0.19623669279767528
combine hit:  0.23764117466157675


In [49]:
# Echo the number of input nodes in the reference batch (the hit-rate denominator).
cur_batch_size


543033

In [63]:
# Baseline: share of the reference batch's nodes that reappear anywhere in the
# rest of GPU 0's buffer (entries 1 .. buffer_depth).
#
# The depth follows buffer_depth instead of a hard-coded 128, so the cell keeps
# working when --buffer_depth is changed; slicing also tolerates a buffer that
# holds fewer batches than requested (e.g. a short dataloader).
base_set = set()
for later_batch in batch_buffer[0][1:1 + buffer_depth]:
    base_set.update(later_batch)

base_overlap = cur_set.intersection(base_set)
base_overlap_count = len(base_overlap)
base_hit = base_overlap_count/cur_batch_size
print("base hit: ", base_hit)

base hit:  0.7045557084007786
