In [1]:
import dgl
from dgl.data.utils import load_graphs

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import pandas as pd

from model.model import HTGNN, NodePredictor
from utils.pytorchtools import EarlyStopping
from utils.data import load_COVID_data, load_MAG_data

# Seed every random number generator the notebook imports so reruns are repeatable.
SEED = 0
random.seed(SEED)  # Python's own RNG is imported above but was previously left unseeded
dgl.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Two small example vectors used by the loss sanity checks in the following cells.
original_pred = torch.tensor([1.0, 2.0, 3.0, 4.0])
masked_pred = torch.tensor([3.0, 2.0, 5.0, 8.0])

In [9]:
# l1: mean absolute error between the two vectors.
# l2: square root of their mean squared error.
mseloss = nn.MSELoss()
l1 = F.l1_loss(input=original_pred, target=masked_pred)
l2 = mseloss(original_pred, masked_pred).sqrt()

In [3]:
# Quick check of np.mean on a plain Python list.
np.mean([1,2,3,4])

2.5

In [5]:
# Mean absolute error between the two example vectors.
# The cell originally referenced `a` and `b`, which no cell in the notebook defines;
# the recorded result tensor(2.) is exactly the loss of these two tensors.
F.l1_loss(original_pred, masked_pred)

tensor(2.)

In [7]:
# Root-mean-squared error between the two example vectors.
# The cell originally referenced `a` and `b`, which no cell in the notebook defines;
# the recorded result tensor(2.4495) is exactly the RMSE of these two tensors.
mseloss = nn.MSELoss()
mse = mseloss(original_pred, masked_pred)
torch.sqrt(mse)

tensor(2.4495)

In [2]:
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
# `glist` and `label_dict` are reassigned by the following cells, so this load looks
# like leftover exploration; confirm before relying on it.
glist, label_dict = load_graphs('/home/jiazhengli/xdgnn/HTGNN/acm_graph.bin')

In [2]:
# Load the graph list and label dict stored in data/ogbn_graphs.bin
# (the next cell repeats this exact call before building the data splits).
glist, label_dict = load_graphs('data/ogbn_graphs.bin')

In [10]:
# Load the graphs stored in data/ogbn_graphs.bin and build the train/val/test splits.
glist, label_dict = load_graphs('data/ogbn_graphs.bin')
# Prefer the GPU, but fall back to CPU so the cell still runs without a CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
time_window = 3

train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_MAG_data(glist, time_window, device)


loading mp2vec
generating train, val, test sets 


In [21]:
# Display the first element of the training features.
train_feats[0]

Graph(num_nodes={'author': 17764, 'field_of_study': 22866, 'institution': 2276, 'paper': 86186},
      num_edges={('author', 'affiliated_with_t0', 'institution'): 40307, ('author', 'affiliated_with_t1', 'institution'): 40307, ('author', 'affiliated_with_t2', 'institution'): 40307, ('author', 'writes_t0', 'paper'): 101109, ('author', 'writes_t1', 'paper'): 130156, ('author', 'writes_t2', 'paper'): 200760, ('field_of_study', 'has_topic_r_t0', 'paper'): 285074, ('field_of_study', 'has_topic_r_t1', 'paper'): 299817, ('field_of_study', 'has_topic_r_t2', 'paper'): 318040, ('institution', 'affiliated_with_r_t0', 'author'): 40307, ('institution', 'affiliated_with_r_t1', 'author'): 40307, ('institution', 'affiliated_with_r_t2', 'author'): 40307, ('paper', 'cites_r_t0', 'paper'): 19926, ('paper', 'cites_r_t1', 'paper'): 21276, ('paper', 'cites_r_t2', 'paper'): 24575, ('paper', 'cites_t0', 'paper'): 19926, ('paper', 'cites_t1', 'paper'): 21276, ('paper', 'cites_t2', 'paper'): 24575, ('paper', 'ha

In [19]:
# Display the 't0' node data of the first training graph (a dict keyed by node type).
train_feats[0].ndata['t0']

{'author': tensor([[-0.3462, -0.7368, -0.1895,  ..., -0.4297,  0.5344,  0.1376],
         [ 0.1640, -0.2393,  0.4055,  ..., -0.0537,  0.0109,  0.0777],
         [ 0.3127, -0.0188,  0.2037,  ...,  0.3168, -0.4902,  0.7343],
         ...,
         [ 0.1556,  0.0476,  0.1873,  ...,  0.5120, -0.1914,  0.2530],
         [ 0.6975, -0.3847, -0.0367,  ..., -0.0240,  0.6860, -0.4370],
         [ 0.5069, -0.9206,  0.5064,  ..., -0.0856,  0.2594,  0.7022]]),
 'field_of_study': tensor([[-0.1983, -0.1398, -0.9528,  ...,  0.6425, -0.7199, -0.1190],
         [ 0.4992, -0.1825, -0.5634,  ..., -0.2716, -0.2596, -0.5283],
         [ 0.1161, -0.9553, -0.5342,  ...,  0.5143,  0.8805,  0.6736],
         ...,
         [ 0.3607,  0.1622,  0.0878,  ...,  0.4886,  0.4765,  0.3659],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.4505, -0.9270,  0.4386,  ..., -0.1139,  0.2695,  0.7066]]),
 'institution': tensor([[ 0.7366,  0.0565, -0.9512,  ...,  0.0396,  0.2451,  0.3114],
  

In [54]:
# Prefer the GPU, but fall back to CPU so the cell still runs without a CUDA device.
# `device` is not passed to load_COVID_data in this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
glist, _ = load_graphs('data/covid_graphs.bin')
time_window = 7

# Build the train/val/test splits from the loaded graph list.
train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_COVID_data(glist, time_window)


In [56]:
# Display the first graph in the loaded list.
glist[0]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [9]:
# List the IDs of the 'state' nodes in the first training graph.
train_feats[0].nodes('state')

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
       dtype=torch.int32)

In [3]:
# Extract the 2-hop in-subgraph around 'state' node 2 of the first training graph.
sg, inverse_indices = dgl.khop_in_subgraph(train_feats[0], {'state': 2}, k=2, store_ids=True)

In [5]:
# Total number of edges in the extracted subgraph.
sg.num_edges()

13377

In [6]:
# Same extraction for two seed nodes ('state' 1 and 2); rebinds `sg` and `inverse_indices`.
sg, inverse_indices = dgl.khop_in_subgraph(train_feats[0], {'state': [1,2]}, k=2, store_ids=True)

In [14]:
# Give each edge type of the subgraph a (start, end) range of cumulative edge counts,
# walking the canonical edge types in the order the graph reports them.
dic = {}
offset = 0
for canonical_etype in sg.canonical_etypes:
    src_type, etype, dst_type = canonical_etype
    count = sg[canonical_etype].number_of_edges()
    dic[etype] = (offset, offset + count)
    offset += count



In [15]:
# Display the edge-type -> (start, end) mapping built in the previous cell.
dic

{'affiliate_r_t0': (0, 29),
 'affiliate_r_t1': (29, 58),
 'affiliate_r_t2': (58, 87),
 'affiliate_r_t3': (87, 116),
 'affiliate_r_t4': (116, 145),
 'affiliate_r_t5': (145, 174),
 'affiliate_r_t6': (174, 203),
 'nearby_county_t0': (203, 334),
 'nearby_county_t1': (334, 465),
 'nearby_county_t2': (465, 596),
 'nearby_county_t3': (596, 727),
 'nearby_county_t4': (727, 858),
 'nearby_county_t5': (858, 989),
 'nearby_county_t6': (989, 1120),
 'affiliate_t0': (1120, 1149),
 'affiliate_t1': (1149, 1178),
 'affiliate_t2': (1178, 1207),
 'affiliate_t3': (1207, 1236),
 'affiliate_t4': (1236, 1265),
 'affiliate_t5': (1265, 1294),
 'affiliate_t6': (1294, 1323),
 'nearby_state_t0': (1323, 1324),
 'nearby_state_t1': (1324, 1325),
 'nearby_state_t2': (1325, 1326),
 'nearby_state_t3': (1326, 1327),
 'nearby_state_t4': (1327, 1328),
 'nearby_state_t5': (1328, 1329),
 'nearby_state_t6': (1329, 1330)}

In [25]:
# Start every edge type of the subgraph at 0.
c = dict.fromkeys(sg.etypes, 0)

In [27]:
# Toy heterograph: drop 'plays' edges 0 and 1 (keeping the original edge IDs via
# store_ids=True), then list the remaining 'plays' edges with their IDs.
g = dgl.heterograph(
    {
        ('user', 'plays', 'game'): (
            torch.tensor([0, 1, 1, 2]),
            torch.tensor([0, 0, 1, 1]),
        ),
        ('developer', 'develops', 'game'): (
            torch.tensor([0, 1]),
            torch.tensor([0, 1]),
        ),
    }
)
g = dgl.remove_edges(g, torch.tensor([0, 1]), etype='plays', store_ids=True)
g.edges(form='all', etype='plays')

(tensor([1, 2]), tensor([1, 1]), tensor([0, 1]))

In [29]:
# Total edge count of the toy graph across all edge types after the removal.
g.num_edges()

4

In [28]:
# Edge IDs stored under dgl.EID (the removal above was called with store_ids=True).
g.edata[dgl.EID]

{('developer', 'develops', 'game'): tensor([0, 1]),
 ('user', 'plays', 'game'): tensor([2, 3])}

# GNNExplainer

In [2]:
# Load DGL's built-in Cora dataset (the recorded output shows it being downloaded and cached).
dataset = dgl.data.CoraGraphDataset()

Downloading /home/jiazhengli/.dgl/cora_v2.zip from https://data.dgl.ai/dataset/cora_v2.zip...
Extracting file to /home/jiazhengli/.dgl/cora_v2
Finished data loading and preprocessing.
  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done saving data into cached files.


In [13]:
# Node feature matrix of the first graph in the dataset, cast to float.
# NOTE(review): the recorded output is a torch.Size, which this expression does not
# return — the cell was probably edited after it last ran (e.g. a trailing `.shape`
# was removed); confirm.
dataset[0].ndata['feat'].float()

torch.Size([2708, 1433])

In [57]:
# Prefer the GPU, but fall back to CPU so the cell still runs without a CUDA device.
# `device` is not passed to load_COVID_data in this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
glist, _ = load_graphs('data/covid_graphs.bin')
time_window = 7

# Build the train/val/test splits from the loaded graph list.
train_feats, train_labels, val_feats, val_labels, test_feats, test_labels = load_COVID_data(glist, time_window)

In [58]:
# Display the first graph in the loaded list.
glist[0]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [59]:
# Display the second graph in the loaded list for comparison with the first.
glist[1]

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r', 'state'): 3141, ('county', 'nearby_county', 'county'): 22176, ('state', 'affiliate', 'county'): 3141, ('state', 'nearby_state', 'state'): 269},
      metagraph=[('county', 'state', 'affiliate_r'), ('county', 'county', 'nearby_county'), ('state', 'county', 'affiliate'), ('state', 'state', 'nearby_state')])

In [18]:
# Take the first test-set graph as the working graph `g` and display it.
# NOTE(review): this rebinds `g`, which an earlier cell used for the toy user/game heterograph.
g = test_feats[0]
g

Graph(num_nodes={'county': 3223, 'state': 51},
      num_edges={('county', 'affiliate_r_t0', 'state'): 3141, ('county', 'affiliate_r_t1', 'state'): 3141, ('county', 'affiliate_r_t2', 'state'): 3141, ('county', 'affiliate_r_t3', 'state'): 3141, ('county', 'affiliate_r_t4', 'state'): 3141, ('county', 'affiliate_r_t5', 'state'): 3141, ('county', 'affiliate_r_t6', 'state'): 3141, ('county', 'nearby_county_t0', 'county'): 22176, ('county', 'nearby_county_t1', 'county'): 22176, ('county', 'nearby_county_t2', 'county'): 22176, ('county', 'nearby_county_t3', 'county'): 22176, ('county', 'nearby_county_t4', 'county'): 22176, ('county', 'nearby_county_t5', 'county'): 22176, ('county', 'nearby_county_t6', 'county'): 22176, ('state', 'affiliate_t0', 'county'): 3141, ('state', 'affiliate_t1', 'county'): 3141, ('state', 'affiliate_t2', 'county'): 3141, ('state', 'affiliate_t3', 'county'): 3141, ('state', 'affiliate_t4', 'county'): 3141, ('state', 'affiliate_t5', 'county'): 3141, ('state', 'affiliate

In [40]:
# Shape of the 't0' node data for 'county' nodes.
g.ndata['t0']['county'].shape

torch.Size([3223, 1])

In [46]:
# Node type names of g.
g.ntypes

['county', 'state']

In [42]:
# (source type, edge type, destination type) triples of g.
g.canonical_etypes

[('county', 'affiliate_r_t0', 'state'),
 ('county', 'affiliate_r_t1', 'state'),
 ('county', 'affiliate_r_t2', 'state'),
 ('county', 'affiliate_r_t3', 'state'),
 ('county', 'affiliate_r_t4', 'state'),
 ('county', 'affiliate_r_t5', 'state'),
 ('county', 'affiliate_r_t6', 'state'),
 ('county', 'nearby_county_t0', 'county'),
 ('county', 'nearby_county_t1', 'county'),
 ('county', 'nearby_county_t2', 'county'),
 ('county', 'nearby_county_t3', 'county'),
 ('county', 'nearby_county_t4', 'county'),
 ('county', 'nearby_county_t5', 'county'),
 ('county', 'nearby_county_t6', 'county'),
 ('state', 'affiliate_t0', 'county'),
 ('state', 'affiliate_t1', 'county'),
 ('state', 'affiliate_t2', 'county'),
 ('state', 'affiliate_t3', 'county'),
 ('state', 'affiliate_t4', 'county'),
 ('state', 'affiliate_t5', 'county'),
 ('state', 'affiliate_t6', 'county'),
 ('state', 'nearby_state_t0', 'state'),
 ('state', 'nearby_state_t1', 'state'),
 ('state', 'nearby_state_t2', 'state'),
 ('state', 'nearby_state_t3', 'st

In [19]:
# Convert the heterogeneous graph g into a single homogeneous graph and display it.
hg = dgl.to_homogeneous(g)
hg

Graph(num_nodes=3274, num_edges=201089,
      ndata_schemes={'_ID': Scheme(shape=(), dtype=torch.int32), '_TYPE': Scheme(shape=(), dtype=torch.int64)}
      edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int32), '_TYPE': Scheme(shape=(), dtype=torch.int64)})

In [38]:
# Per-node type IDs stored on the homogeneous graph under dgl.NTYPE.
hg.ndata[dgl.NTYPE]

tensor([0, 0, 0,  ..., 1, 1, 1])

# New dataset

In [63]:
from ogb.linkproppred import DglLinkPropPredDataset

# Load ogbl-collab, unpack its edge splits one by one, and keep the first graph.
dataset = DglLinkPropPredDataset(name='ogbl-collab')
split_edge = dataset.get_edge_split()

train_edge = split_edge["train"]
valid_edge = split_edge["valid"]
test_edge = split_edge["test"]

graph = dataset[0]

Downloading http://snap.stanford.edu/ogb/data/linkproppred/collab.zip


Downloaded 0.11 GB: 100%|██████████| 117/117 [00:09<00:00, 11.81it/s]


Extracting dataset/collab.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 44.89it/s]


Converting graphs into DGL objects...


100%|██████████| 1/1 [00:00<00:00, 412.66it/s]

Saving...





In [64]:
# Display the graph taken from the dataset in the previous cell.
graph

Graph(num_nodes=235868, num_edges=2358104,
      ndata_schemes={'feat': Scheme(shape=(128,), dtype=torch.float32)}
      edata_schemes={'year': Scheme(shape=(1,), dtype=torch.int64), 'weight': Scheme(shape=(1,), dtype=torch.int64)})

In [11]:
# Average of the six recorded values.
r = np.array([0.803, 0.841, 0.861, 0.877, 0.897, 0.907])
r.mean()

0.8643333333333333

# Sentence transformer

In [1]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model.
# NOTE(review): this import sits mid-notebook rather than in the top import cell.
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
2023-04-21 00:07:08.698675: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-21 00:07:08.722251: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 3.68MB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 715kB/s]
Downloading: 100%|██████████| 10.6k/10.6k [00:00<00:00, 30.4MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 1.62MB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 446kB/s]
Downloading: 100%|█

In [2]:
# Single example sentence to encode in the next cell.
sentences = ['i love you']

In [14]:
# Encode the example sentences with the loaded model.
sentence_embeddings = sbert_model.encode(sentences)

In [7]:
# Read the pipe-separated movie table; header=None means columns get integer labels 0, 1, 2, ...
movie_info = pd.read_csv('data/Movielens/movie_info.csv',sep='|',header=None)

In [8]:
# Display the loaded table (column 1 holds the title strings used in the next cell).
movie_info

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Build the list of movie names from column 1 by keeping the text before the first
# '(' (the recorded titles end in a "(year)" suffix) and trimming trailing whitespace.
# rstrip() replaces the original fixed [:-1] slice, which also chopped the last real
# character off any title that has no " (" suffix; iterating the column directly
# avoids a per-row .iloc lookup.
names = [title.split('(')[0].rstrip() for title in movie_info[1]]


In [16]:
# Encode the movie names collected in the previous cell with the loaded model.
movie_embeddings = sbert_model.encode(names)