In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.nn as dglnn
import dgl.function as fn
import pandas as pd

Using backend: pytorch


In [2]:
class DotProductPredictor(nn.Module):
    """Edge scorer: dot product of the source and destination node representations."""

    def forward(self, graph, h):
        """Return one score per edge of ``graph``.

        Args:
            graph: DGL graph whose edges are to be scored.
            h: node representations produced by the GNN encoder, one row per
                node of ``graph``.

        Returns:
            The ``'score'`` edge field written by ``fn.u_dot_v``
            (presumably shape (num_edges, 1) -- confirm against the DGL docs).
        """
        # Built-in DGL edge function: dot(src 'h', dst 'h') -> edge field 'score'.
        dot_src_dst = fn.u_dot_v('h', 'h', 'score')
        # Work inside local_scope so the temporary 'h' / 'score' fields are
        # set on a scoped view of the graph (see DGL's local_scope docs).
        with graph.local_scope():
            graph.ndata['h'] = h
            graph.apply_edges(dot_src_dst)
            edge_scores = graph.edata['score']
        return edge_scores

In [3]:
class SAGE(nn.Module):
    """Two-layer GraphSAGE encoder (mean aggregation, ReLU between the layers)."""

    def __init__(self, in_feats, hid_feats, out_feats):
        """Build the two SAGEConv layers.

        Args:
            in_feats: dimension of the input node features.
            hid_feats: dimension of the hidden representation after layer 1.
            out_feats: dimension of the output node representation.
        """
        super().__init__()
        # SAGEConv(in_feats, out_feats, aggregator_type): input dimension,
        # output dimension, and the neighbour aggregation function.
        aggregator = 'mean'
        self.conv1 = dglnn.SAGEConv(in_feats, hid_feats, aggregator)
        self.conv2 = dglnn.SAGEConv(hid_feats, out_feats, aggregator)

    def forward(self, graph, inputs):
        """Encode nodes: ``inputs`` are the node features of ``graph``."""
        hidden = F.relu(self.conv1(graph, inputs))
        return self.conv2(graph, hidden)

In [4]:
class Model(nn.Module):
    """Link-prediction model: a SAGE encoder followed by a dot-product edge scorer."""

    def __init__(self, in_features, hidden_features, out_features):
        """Create the encoder (``self.sage``) and the edge scorer (``self.pred``)."""
        super().__init__()
        self.sage = SAGE(
            in_feats=in_features,
            hid_feats=hidden_features,
            out_feats=out_features,
        )
        self.pred = DotProductPredictor()

    def forward(self, g, neg_g, x):
        """Score the edges of the positive and the negative graph.

        Node representations are computed once, on ``g`` with features ``x``,
        and reused to score the edges of both ``g`` and ``neg_g``.

        Returns:
            Tuple ``(pos_score, neg_score)``.
        """
        node_emb = self.sage(g, x)
        pos_score = self.pred(g, node_emb)
        neg_score = self.pred(neg_g, node_emb)
        return pos_score, neg_score

In [5]:
def construct_negative_graph(graph, k):
    """Build a graph of randomly sampled negative edges.

    For every edge (u, v) of ``graph``, ``k`` edges (u, v') are created where
    v' is drawn uniformly from all node ids. The draw does not exclude real
    neighbours of u, so some sampled edges may coincide with true edges.

    Args:
        graph: DGL graph providing the positive edges.
        k: number of negative destinations sampled per positive edge.

    Returns:
        A DGL graph with the same number of nodes as ``graph`` and
        ``graph.num_edges() * k`` edges; the ``k`` negatives of positive edge
        ``i`` occupy edge ids ``i * k`` .. ``i * k + k - 1``.
    """
    src, _ = graph.edges()
    num_nodes = graph.num_nodes()

    # Each source is repeated k times in place, which gives the layout above.
    neg_src = src.repeat_interleave(k)
    # Sample destinations with the same dtype and on the same device as the
    # sources; torch.randint otherwise defaults to int64 on CPU, which would
    # not match a graph stored on GPU or built with 32-bit ids.
    neg_dst = torch.randint(
        0, num_nodes, (len(src) * k,), dtype=src.dtype, device=src.device)
    return dgl.graph((neg_src, neg_dst), num_nodes=num_nodes)

def compute_loss(pos_score, neg_score, margin=1.0):
    """Margin (hinge) ranking loss between positive and negative edge scores.

    Computes ``mean(max(0, margin - pos_i + neg_ij))`` where ``neg_ij`` are
    the negatives belonging to positive edge ``i``.

    Args:
        pos_score: scores of the positive edges, shape (E,) or (E, 1).
        neg_score: scores of the negative edges, E * k values laid out so that
            the k negatives of positive edge i are contiguous.
        margin: required gap between a positive score and its negatives.

    Returns:
        Scalar tensor holding the mean hinge loss.
    """
    n_edges = pos_score.shape[0]
    # Force positives to exactly (E, 1). Unsqueezing an (E, 1) tensor would
    # yield (E, 1, 1), which broadcasts against the (E, k) negatives to
    # (E, E, k): every positive paired with every edge's negatives.
    pos = pos_score.reshape(n_edges, 1)
    neg = neg_score.reshape(n_edges, -1)
    return (margin - pos + neg).clamp(min=0).mean()

In [6]:
# Directory holding the edge-list CSV splits.
EDGE_CSV_DIR = '../tyc_cm'
# Node count shared by every split so that node ids line up across the graphs.
# NOTE(review): presumably the size of the full node set -- confirm it is
# larger than the highest src_ind / dst_ind in the CSVs.
NUM_NODES = 64424

# Each CSV is read as an edge list with `src_ind` / `dst_ind` columns.
train_df = pd.read_csv(f'{EDGE_CSV_DIR}/train_df.csv')
valid_df = pd.read_csv(f'{EDGE_CSV_DIR}/valid_df.csv')
test_df = pd.read_csv(f'{EDGE_CSV_DIR}/test_df.csv')
test_neg_df = pd.read_csv(f'{EDGE_CSV_DIR}/test_neg_df.csv')

# One graph per split, all defined over the same NUM_NODES nodes.
train_graph = dgl.graph((train_df.src_ind, train_df.dst_ind), num_nodes=NUM_NODES)
valid_graph = dgl.graph((valid_df.src_ind, valid_df.dst_ind), num_nodes=NUM_NODES)
test_graph = dgl.graph((test_df.src_ind, test_df.dst_ind), num_nodes=NUM_NODES)
test_neg_graph = dgl.graph((test_neg_df.src_ind, test_neg_df.dst_ind), num_nodes=NUM_NODES)

In [None]:
# Seed torch's global RNG so the random node features, the weight
# initialisation and the per-epoch negative sampling are reproducible.
torch.manual_seed(0)

FEAT_DIM = 100     # width of the random input node features
HIDDEN_DIM = 100   # hidden size of the SAGE encoder
OUT_DIM = 100      # size of the final node representation
NUM_EPOCHS = 10

# No real node attributes are used: every node gets a random feature vector.
node_features = torch.rand(train_graph.num_nodes(), FEAT_DIM)
n_features = node_features.shape[1]
k = 5  # negative edges sampled per positive edge
model = Model(n_features, HIDDEN_DIM, OUT_DIM)
opt = torch.optim.Adam(model.parameters())
for epoch in range(NUM_EPOCHS):
    # Full-graph training: negatives are re-sampled at every epoch.
    negative_graph = construct_negative_graph(train_graph, k)
    pos_score, neg_score = model(train_graph, negative_graph, node_features)
    loss = compute_loss(pos_score, neg_score)
    opt.zero_grad()
    loss.backward()
    opt.step()
    print(loss.item())

In [1]:
import random

class DataLoader:
    """Toy loader showing that every ``for`` loop over it calls ``__iter__`` again.

    Holds the integers 0..99 and, on each iteration pass, reshuffles them in
    place and yields the first 80 as four batches of 20.
    """

    def __init__(self):
        self.data = [*range(100)]

    def __iter__(self):
        """Generator: reshuffle ``self.data`` in place, then yield 4 slices of 20 items."""
        print('calls __iter__')
        random.shuffle(self.data)
        batch_size = 20
        for start in range(0, 4 * batch_size, batch_size):
            print('yielding:')
            # Slicing copies, so a yielded batch is unaffected by later shuffles.
            yield self.data[start:start + batch_size]

data = DataLoader()

# Two full passes over the same object: each inner `for` starts a new
# iteration, so `__iter__` runs (and reshuffles) once per pass.
for _pass in range(2):
    for batch in data:
        print(batch)

calls __iter__
yielding:
[63, 8, 76, 2, 55, 17, 47, 29, 1, 48, 90, 97, 26, 60, 45, 84, 33, 78, 58, 42]
yielding:
[28, 31, 9, 13, 57, 37, 50, 3, 81, 49, 56, 68, 94, 22, 5, 53, 52, 10, 70, 40]
yielding:
[43, 61, 79, 86, 12, 93, 36, 83, 15, 74, 64, 7, 44, 87, 89, 16, 35, 27, 32, 34]
yielding:
[51, 23, 24, 54, 4, 62, 14, 25, 85, 95, 30, 80, 46, 88, 67, 99, 18, 82, 39, 77]
calls __iter__
yielding:
[5, 10, 75, 98, 37, 41, 16, 33, 83, 7, 96, 92, 20, 95, 42, 99, 60, 31, 24, 74]
yielding:
[70, 23, 63, 55, 79, 64, 0, 38, 72, 65, 73, 78, 25, 34, 68, 9, 88, 13, 94, 61]
yielding:
[2, 62, 43, 27, 40, 53, 97, 86, 8, 89, 36, 50, 26, 59, 15, 46, 56, 76, 52, 14]
yielding:
[6, 85, 80, 81, 18, 29, 77, 47, 4, 12, 17, 51, 93, 30, 82, 69, 91, 21, 28, 58]


In [6]:
import pandas as pd

# Only the edge endpoints are kept from each CSV.
EDGE_COLS = ['src_ind', 'dst_ind']

train = pd.read_csv('../data/train_df_.csv')[EDGE_COLS]
test = pd.read_csv('../data/test_df_.csv')[EDGE_COLS]
test_neg = pd.read_csv('../data/test_neg_df_.csv')[EDGE_COLS]
test_neg_user = pd.read_csv('../data/test_neg_df_user_.csv')[EDGE_COLS]

In [14]:
# Leakage check: edges present in both the train and the test split
# (inner join on both endpoint columns).
train.merge(test, on=['src_ind', 'dst_ind'], how='inner')

Unnamed: 0,src_ind,dst_ind


In [15]:
# Rows returned here are test "negatives" that also occur as training edges.
train.merge(test_neg, on=['src_ind', 'dst_ind'], how='inner')

Unnamed: 0,src_ind,dst_ind
0,32815,55721
1,29818,54122
2,29818,18914
3,13919,10173
4,53694,47483
...,...,...
545,29818,43200
546,39216,54304
547,59164,41775
548,59164,56


In [16]:
# Same check for the second negative set: rows returned are entries of
# test_neg_user that also occur as training edges.
train.merge(test_neg_user, on=['src_ind', 'dst_ind'], how='inner')

Unnamed: 0,src_ind,dst_ind
0,56679,62321
1,13693,30480
2,17633,35366
3,1570,14043
4,39236,17084
...,...,...
89,29818,39242
90,14360,4684
91,18172,41727
92,5656,35871


In [17]:
# Number of training edges, for scale against the overlap counts.
train.shape[0]

65769