In [1]:
import torch_geometric as pyg

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch_geometric.nn import MetaPath2Vec

## ACM

In [3]:
acm_data = pyg.datasets.HGBDataset(root="./data/HGB", name="ACM")[0]
#acm_data = acm_data.node_type_subgraph(['author', 'paper', 'subject'])

acm_nx = pyg.utils.to_networkx(acm_data)
acm_homo = acm_data.to_homogeneous()
acm_homo, acm_data

(Data(edge_index=[2, 547872], x=[10942, 1902], y=[10942], train_mask=[10942], test_mask=[10942], node_type=[10942], edge_type=[547872]),
 HeteroData(
   paper={
     x=[3025, 1902],
     y=[3025],
     train_mask=[3025],
     test_mask=[3025],
   },
   author={ x=[5959, 1902] },
   subject={ x=[56, 1902] },
   term={ num_nodes=1902 },
   (paper, cite, paper)={ edge_index=[2, 5343] },
   (paper, ref, paper)={ edge_index=[2, 5343] },
   (paper, to, author)={ edge_index=[2, 9949] },
   (author, to, paper)={ edge_index=[2, 9949] },
   (paper, to, subject)={ edge_index=[2, 3025] },
   (subject, to, paper)={ edge_index=[2, 3025] },
   (paper, to, term)={ edge_index=[2, 255619] },
   (term, to, paper)={ edge_index=[2, 255619] }
 ))

In [4]:
acm_data = acm_data.edge_type_subgraph([
    ('author', 'to', 'paper'),
    ('paper', 'to', 'author'),
    ('paper', 'to', 'subject'),
    ('subject', 'to', 'paper')
])
acm_data

HeteroData(
  paper={
    x=[3025, 1902],
    y=[3025],
    train_mask=[3025],
    test_mask=[3025],
  },
  author={ x=[5959, 1902] },
  subject={ x=[56, 1902] },
  (paper, to, author)={ edge_index=[2, 9949] },
  (author, to, paper)={ edge_index=[2, 9949] },
  (paper, to, subject)={ edge_index=[2, 3025] },
  (subject, to, paper)={ edge_index=[2, 3025] }
)

In [5]:
model2 = MetaPath2Vec(
    acm_data.edge_index_dict, 
    embedding_dim=50, 
    walk_length=20, 
    context_size=7, 
    metapath=[
        ('author', 'to', 'paper'),
        ('paper', 'to', 'author')
    ],
    sparse=True
)
model2

MetaPath2Vec(8984, 50)

In [6]:
model = MetaPath2Vec(
    acm_data.edge_index_dict, 
    embedding_dim=50, 
    walk_length=20, 
    context_size=11, 
    metapath=[
        ('author', 'to', 'paper'),
        ('paper', 'to', 'subject'),
        ('subject', 'to', 'paper'),
        ('paper', 'to', 'author')
    ],
    sparse=True
)
model

MetaPath2Vec(9040, 50)

In [8]:
import torch
from tqdm import tqdm
def train_mp2vec(
        model: MetaPath2Vec,
        loader: pyg.data.DataLoader, 
        optimizer: torch.optim.Optimizer,
        log_steps=100, epochs=20) -> MetaPath2Vec:


    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0
        for i, (pos_rw, neg_rw) in (pbar:= tqdm(enumerate(loader), total=len(loader))):
            pbar.set_description(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                    f'Loss: {total_loss / log_steps:.4f}')
            optimizer.zero_grad()
            loss = model.loss(pos_rw, neg_rw)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % log_steps == 0:
                pbar.set_description(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                    f'Loss: {total_loss / log_steps:.4f}')
                total_loss = 0

    return model


loader = model.loader(batch_size=256, shuffle=True, num_workers=16)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
train_mp2vec(model, loader, optimizer)

Epoch: 1, Step: 00024/24, Loss: 1.2311: 100%|██████████| 24/24 [00:01<00:00, 21.85it/s]
Epoch: 2, Step: 00024/24, Loss: 1.0187: 100%|██████████| 24/24 [00:00<00:00, 24.02it/s]
Epoch: 3, Step: 00024/24, Loss: 0.8640: 100%|██████████| 24/24 [00:01<00:00, 20.87it/s]
Epoch: 4, Step: 00024/24, Loss: 0.7442: 100%|██████████| 24/24 [00:01<00:00, 21.93it/s]
Epoch: 5, Step: 00024/24, Loss: 0.6536: 100%|██████████| 24/24 [00:01<00:00, 21.85it/s]
Epoch: 6, Step: 00024/24, Loss: 0.5818: 100%|██████████| 24/24 [00:01<00:00, 20.78it/s]
Epoch: 7, Step: 00024/24, Loss: 0.5248: 100%|██████████| 24/24 [00:00<00:00, 24.04it/s]
Epoch: 8, Step: 00024/24, Loss: 0.4768: 100%|██████████| 24/24 [00:01<00:00, 22.27it/s]
Epoch: 9, Step: 00024/24, Loss: 0.4399: 100%|██████████| 24/24 [00:00<00:00, 24.30it/s]
Epoch: 10, Step: 00024/24, Loss: 0.4081: 100%|██████████| 24/24 [00:01<00:00, 22.83it/s]
Epoch: 11, Step: 00024/24, Loss: 0.3818: 100%|██████████| 24/24 [00:01<00:00, 23.42it/s]
Epoch: 12, Step: 00024/24, Los

MetaPath2Vec(9040, 50)

In [5]:
fb_data = pyg.datasets.HGBDataset(root="./data/HGB", name="Freebase")[0]
fb_data

HeteroData(
  book={
    num_nodes=40402,
    y=[40402],
    train_mask=[40402],
    test_mask=[40402],
  },
  film={ num_nodes=19427 },
  music={ num_nodes=82351 },
  sports={ num_nodes=1025 },
  people={ num_nodes=17641 },
  location={ num_nodes=9368 },
  organization={ num_nodes=2731 },
  business={ num_nodes=7153 },
  (book, and, book)={ edge_index=[2, 202674] },
  (book, to, film)={ edge_index=[2, 38299] },
  (book, on, sports)={ edge_index=[2, 6615] },
  (book, on, location)={ edge_index=[2, 26921] },
  (book, about, organization)={ edge_index=[2, 21900] },
  (film, and, film)={ edge_index=[2, 87838] },
  (music, in, book)={ edge_index=[2, 31486] },
  (music, in, film)={ edge_index=[2, 11291] },
  (music, and, music)={ edge_index=[2, 283670] },
  (music, for, sports)={ edge_index=[2, 8975] },
  (music, on, location)={ edge_index=[2, 42915] },
  (sports, in, film)={ edge_index=[2, 6763] },
  (sports, and, sports)={ edge_index=[2, 1290] },
  (sports, on, location)={ edge_index=[2, 

In [6]:
fb_data.node_type_subgraph(
    ['people', 'business', 'location']
)

HeteroData(
  people={ num_nodes=17641 },
  location={ num_nodes=9368 },
  business={ num_nodes=7153 },
  (people, and, people)={ edge_index=[2, 22813] },
  (people, on, location)={ edge_index=[2, 15134] },
  (people, in, business)={ edge_index=[2, 5378] },
  (location, and, location)={ edge_index=[2, 47817] },
  (business, on, location)={ edge_index=[2, 6647] },
  (business, and, business)={ edge_index=[2, 4448] }
)

In [7]:
dblp_data = pyg.datasets.DBLP(root='./data/DBLP')[0]
dblp_data

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={ num_nodes=20 },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)

In [8]:
yelp_data = pyg.datasets.Yelp(root='./data/Yelp')[0]
yelp_data

Data(x=[716847, 300], edge_index=[2, 13954819], y=[716847, 100], train_mask=[716847], val_mask=[716847], test_mask=[716847])

In [9]:
yelp_data.num_edge_types

1

In [10]:
pubmed_data = pyg.datasets.Planetoid(root="./data", name="PubMed")[0]
pubmed_data

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])

In [9]:
imdb_data = pyg.datasets.IMDB(root='./data/IMDB')[0]
imdb_data

HeteroData(
  movie={
    x=[4278, 3066],
    y=[4278],
    train_mask=[4278],
    val_mask=[4278],
    test_mask=[4278],
  },
  director={ x=[2081, 3066] },
  actor={ x=[5257, 3066] },
  (movie, to, director)={ edge_index=[2, 4278] },
  (movie, to, actor)={ edge_index=[2, 12828] },
  (director, to, movie)={ edge_index=[2, 4278] },
  (actor, to, movie)={ edge_index=[2, 12828] }
)

In [10]:
model = MetaPath2Vec(
    imdb_data.edge_index_dict, 
    embedding_dim=50, 
    walk_length=20, 
    context_size=11, 
    metapath=[
        ('actor', 'to', 'movie'),
        ('movie', 'to', 'director'),
        ('director', 'to', 'movie'),
        ('movie', 'to', 'actor')
    ],
    sparse=True
)
model

MetaPath2Vec(11616, 50)

In [11]:
import torch
from tqdm import tqdm
def train_mp2vec(
        model: MetaPath2Vec,
        loader: pyg.data.DataLoader, 
        optimizer: torch.optim.Optimizer,
        log_steps=100, epochs=20) -> MetaPath2Vec:


    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0
        for i, (pos_rw, neg_rw) in (pbar:= tqdm(enumerate(loader), total=len(loader))):
            pbar.set_description(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                    f'Loss: {total_loss / log_steps:.4f}')
            optimizer.zero_grad()
            loss = model.loss(pos_rw, neg_rw)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            if (i + 1) % log_steps == 0:
                pbar.set_description(f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                    f'Loss: {total_loss / log_steps:.4f}')
                total_loss = 0

    return model


loader = model.loader(batch_size=256, shuffle=True, num_workers=16)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
train_mp2vec(model, loader, optimizer)

Epoch: 1, Step: 00021/21, Loss: 1.0767: 100%|██████████| 21/21 [00:01<00:00, 20.91it/s]
Epoch: 2, Step: 00021/21, Loss: 0.9588: 100%|██████████| 21/21 [00:00<00:00, 21.58it/s]
Epoch: 3, Step: 00021/21, Loss: 0.8515: 100%|██████████| 21/21 [00:01<00:00, 20.51it/s]
Epoch: 4, Step: 00021/21, Loss: 0.7653: 100%|██████████| 21/21 [00:01<00:00, 20.97it/s]
Epoch: 5, Step: 00021/21, Loss: 0.6952: 100%|██████████| 21/21 [00:00<00:00, 21.93it/s]
Epoch: 6, Step: 00021/21, Loss: 0.6378: 100%|██████████| 21/21 [00:00<00:00, 22.13it/s]
Epoch: 7, Step: 00021/21, Loss: 0.5883: 100%|██████████| 21/21 [00:00<00:00, 22.39it/s]
Epoch: 8, Step: 00021/21, Loss: 0.5467: 100%|██████████| 21/21 [00:01<00:00, 20.12it/s]
Epoch: 9, Step: 00021/21, Loss: 0.5117: 100%|██████████| 21/21 [00:01<00:00, 20.59it/s]
Epoch: 10, Step: 00021/21, Loss: 0.4796: 100%|██████████| 21/21 [00:00<00:00, 22.28it/s]
Epoch: 11, Step: 00021/21, Loss: 0.4534: 100%|██████████| 21/21 [00:01<00:00, 19.98it/s]
Epoch: 12, Step: 00021/21, Los

MetaPath2Vec(11616, 50)