# Подготовительный этап

In [1]:
import dgl
import torch

In [2]:
import scipy.io as sio
# Matrix Market file holding the adjacency data (relative to the working directory).
file_path = 'econ-psmigr1.mtx'
mat_contents = sio.mmread(file_path)
# `.toarray()` is the supported way to densify a sparse matrix; the `.A` shorthand
# is not available on sparse matrices in newer SciPy releases. If mmread already
# returned a dense array (no `toarray` method), it is used as is.
A = mat_contents.toarray() if hasattr(mat_contents, "toarray") else mat_contents

In [3]:
import networkx as nx
# Build a NetworkX graph from the dense adjacency array; later cells read the
# "weight" attribute of its edges.
# NOTE(review): with no `create_using` argument this presumably yields an undirected
# nx.Graph, so any asymmetry in A would be lost — confirm this is intended.
G = nx.from_numpy_array(A)

In [4]:
# node_weights[i]: sum of the weights of all edges incident to node i.
# node_mid_neighbor[i]: mean of node_weights over the neighbours of i
# (left at 0 for nodes that have no neighbours).
node_weights = {k: 0 for k in G.nodes}
node_mid_neighbor = {k: 0 for k in G.nodes}
for i, j, w in G.edges(data=True):
    node_weights[i] += w["weight"]
    node_weights[j] += w["weight"]
for i in G.nodes:
    # Materialise the neighbour list once instead of iterating it twice.
    neighbors = list(G.neighbors(i))
    if not neighbors:
        # Isolated node: keep the default 0 rather than dividing by zero.
        continue
    for n in neighbors:
        node_mid_neighbor[i] += node_weights[n]
    node_mid_neighbor[i] /= len(neighbors)

In [5]:
# Pairwise labels: for every ordered pair of distinct nodes (u, v), 1 if the
# edge exists in G and 0 otherwise. The per-pair weight/mean lookups that used
# to follow were never read, so they are dropped.
y = [
    1 if G.has_edge(u, v) else 0
    for u in G.nodes
    for v in G.nodes
    if u != v
]

In [6]:
# Per-node feature pairs (weight sum, mean neighbour weight sum). Both dicts were
# filled in the same node order, so zipping their values lines the features up
# without rebuilding the value lists on every iteration.
X = list(zip(node_weights.values(), node_mid_neighbor.values()))

In [7]:
# Convert the Python lists into tensors: X holds the per-node features,
# y the pairwise edge labels.
X, y = torch.tensor(X), torch.tensor(y)

In [8]:
import dgl.data
# Convert the NetworkX graph into a DGL graph.
# NOTE(review): no `edge_attrs` are passed, so the "weight" edge attribute is
# presumably not carried over to `g` — confirm this is intended.
g = dgl.from_networkx(G)

In [9]:
# Attach the per-node feature tensor as the "feat" node data of the DGL graph.
g.ndata["feat"] = X

In [10]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

In [11]:
u, v = g.edges()

# Shuffle the edge ids and hold out 10% of them as positive test edges.
eids = np.arange(g.number_of_edges())
eids = np.random.permutation(eids)
test_size = int(len(eids) * 0.1)
train_size = g.number_of_edges() - test_size
test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]

# The dense complement of the adjacency matrix gives the candidate negative pairs.
# The explicit shape keeps the matrix n x n even when the highest-numbered nodes
# have no edges; otherwise subtracting np.eye(n) would not line up.
num_nodes = g.number_of_nodes()
adj = sp.coo_matrix((np.ones(len(u)), (u.numpy(), v.numpy())), shape=(num_nodes, num_nodes))
adj_neg = 1 - adj.todense() - np.eye(num_nodes)
# `> 0` rather than `!= 0`: self-loops and repeated edges make entries negative,
# and those pairs are existing edges, not negatives.
neg_u, neg_v = np.where(adj_neg > 0)

# Draw as many negative pairs as there are edges and split them like the positives.
# NOTE(review): np.random.choice samples with replacement and no seed is set here,
# so negatives may repeat and the split differs between runs.
neg_eids = np.random.choice(len(neg_u), g.number_of_edges())
test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]

In [12]:
from dgl.nn import SAGEConv

class GraphSAGE(nn.Module):
    """Two stacked SAGEConv layers ('mean' aggregator) with a ReLU in between."""

    def __init__(self, in_feats, h_feats):
        """Create the layers: in_feats -> h_feats, then h_feats -> h_feats."""
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        """Run both layers on graph `g` and return the second layer's output."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [13]:
# Wrap each edge set in its own graph. Every graph is given the node count of `g`,
# so node ids mean the same thing in all of them.
n_nodes = g.number_of_nodes()
train_pos_g = dgl.graph((train_pos_u, train_pos_v), num_nodes=n_nodes)
train_neg_g = dgl.graph((train_neg_u, train_neg_v), num_nodes=n_nodes)

test_pos_g = dgl.graph((test_pos_u, test_pos_v), num_nodes=n_nodes)
test_neg_g = dgl.graph((test_neg_u, test_neg_v), num_nodes=n_nodes)

In [14]:
import dgl.function as fn

class DotPredictor(nn.Module):
    """Scores each edge with the dot product of its endpoint embeddings."""

    def forward(self, g, h):
        """Return one score per edge of `g`, given node embeddings `h`."""
        with g.local_scope():
            g.ndata['h'] = h
            # Writes the source/destination dot product into edata['score'].
            g.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            edge_scores = g.edata['score']
        # Drop the trailing axis of the per-edge result.
        return edge_scores[:, 0]

In [15]:
class MLPPredictor(nn.Module):
    """Scores an edge with a two-layer MLP over the concatenated endpoint embeddings."""

    def __init__(self, h_feats, activation=F.relu):
        """Create the MLP.

        Args:
            h_feats: Size of each node embedding; the first layer takes 2 * h_feats inputs.
            activation: Callable applied between the two linear layers. Defaults to
                ReLU, so existing `MLPPredictor(h_feats)` calls behave as before.
        """
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)
        self.activation = activation

    def apply_edges(self, edges):
        """Edge UDF: return {'score': tensor} with one score per edge in the batch."""
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        # squeeze(1) removes the size-1 output axis of the last linear layer.
        return {'score': self.W2(self.activation(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        """Return the score of every edge of `g`, given node embeddings `h`."""
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [16]:
# Graph used for message passing during training: `g` without the held-out test edges.
# NOTE(review): if `g` stores both directions of every undirected edge, the reverse
# of a removed test edge may still be present here — verify there is no leakage.
train_g = dgl.remove_edges(g, eids[:test_size])

In [17]:
# Encoder and MLP edge scorer with a hidden size of 16; the input width is read
# from the node features. The grid-search cell further down rebinds `model` and
# `pred` for every configuration.
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
pred = MLPPredictor(16)


def compute_loss(pos_score, neg_score):
    """Binary cross-entropy over positive and negative edge logits.

    Args:
        pos_score: Tensor of logits for existing edges (target 1).
        neg_score: Tensor of logits for sampled non-edges (target 0).

    Returns:
        Scalar tensor with the mean BCE-with-logits loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # Build the targets from the score tensors so they share dtype and device.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_metrics(pos_score, neg_score, threshold=None):
    """Classification metrics for positive vs. negative edge scores.

    Args:
        pos_score: Tensor of scores for existing edges (label 1).
        neg_score: Tensor of scores for sampled non-edges (label 0).
        threshold: Scores >= threshold are predicted as edges. When None (the
            default, matching the previous behaviour) the midpoint between the
            mean positive and mean negative score is used. That midpoint is
            derived from the labelled groups themselves, so pass a fixed value
            (e.g. 0.0 for logits) for a label-independent threshold.

    Returns:
        Tuple (accuracy, balanced accuracy, F1, precision, ROC AUC). The ROC AUC
        is computed on the raw scores rather than on the thresholded predictions.
    """
    pos = pos_score.detach().cpu()
    neg = neg_score.detach().cpu()
    raw_scores = torch.cat([pos, neg])
    if threshold is None:
        threshold = (torch.mean(pos) + torch.mean(neg)) / 2
    predictions = torch.where(raw_scores >= threshold, 1, 0).numpy()
    labels = torch.cat(
        [torch.ones(pos.shape[0]), torch.zeros(neg.shape[0])]).numpy()
    return (
        accuracy_score(labels, predictions),
        balanced_accuracy_score(labels, predictions),
        f1_score(labels, predictions),
        precision_score(labels, predictions),
        # Ranking metric: needs the continuous scores, not the hard 0/1 predictions.
        roc_auc_score(labels, raw_scores.numpy()),
    )

In [18]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score, balanced_accuracy_score

## Модель I

In [19]:
%%time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score

df_result = pd.DataFrame({"lr", "hidden_dim", "accuracy_score", "balanced_accuracy_score", "f1_score", "precision_score", "roc_auc_score"})

for hidden_dim in [16, 64, 256]:
    for lr in [0.005, 0.01]:
        model = GraphSAGE(train_g.ndata['feat'].shape[1], hidden_dim)
        pred = MLPPredictor(hidden_dim)

        def compute_loss(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score])
            labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
            return F.binary_cross_entropy_with_logits(scores, labels)

        def compute_auc(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score]).numpy()
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return roc_auc_score(labels, scores)

        def compute_metrics(pos_score, neg_score):
            scores = torch.where(torch.cat([pos_score, neg_score]) >= (torch.mean(pos_score) + torch.mean(neg_score)) / 2, 1, 0)
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return accuracy_score(labels, scores), balanced_accuracy_score(labels, scores), f1_score(labels, scores), precision_score(labels, scores), roc_auc_score(labels, scores)

        optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=lr)

        all_logits = []
        for e in range(50):
            h = model(train_g, train_g.ndata['feat'])
            pos_score = pred(train_pos_g, h)
            neg_score = pred(train_neg_g, h)
            loss = compute_loss(pos_score, neg_score)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            accuracy_scor, balanced_accuracy_scor, f1_scor, precision_scor, _ = compute_metrics(pos_score, neg_score)
            roc_auc_scor = compute_auc(pos_score, neg_score)
            df_result = df_result.append({"lr": lr, "hidden_dim": hidden_dim, "accuracy_score": accuracy_scor, "balanced_accuracy_score": balanced_accuracy_scor, "f1_score": f1_scor, "precision_score": precision_scor, "roc_auc_score": roc_auc_scor}, ignore_index=True)

Wall time: 18min 1s


In [20]:
df_result.iloc[7:, 1:]

Unnamed: 0,accuracy_score,balanced_accuracy_score,f1_score,hidden_dim,lr,precision_score,roc_auc_score
7,0.679611,0.679611,0.617567,16.0,0.005,0.765881,0.747525
8,0.633091,0.633091,0.456604,16.0,0.01,0.879792,0.758468
9,0.713217,0.713217,0.640863,64.0,0.005,0.857108,0.810931
10,0.711362,0.711362,0.632943,64.0,0.01,0.869051,0.8189
11,0.607857,0.607857,0.3757,256.0,0.005,0.920886,0.819918
12,0.533297,0.533297,0.13086,256.0,0.01,0.950312,0.702599


Наилучшие результаты показала модель с lr = 0.005 и hidden_dim = 64. Некоторые из предыдущих моделей имеют очень плохие показатели, что может быть связано с переобучением.

## Модель II

In [21]:
# Encoder with a hidden size of 16 paired with the dot-product edge scorer.
# The grid-search cell further down rebinds `model` and `pred` for every configuration.
model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
pred = DotPredictor()

def compute_loss(pos_score, neg_score):
    """Binary cross-entropy over positive and negative edge logits.

    Args:
        pos_score: Tensor of logits for existing edges (target 1).
        neg_score: Tensor of logits for sampled non-edges (target 0).

    Returns:
        Scalar tensor with the mean BCE-with-logits loss.
    """
    scores = torch.cat([pos_score, neg_score])
    # Build the targets from the score tensors so they share dtype and device.
    labels = torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_metrics(pos_score, neg_score, threshold=None):
    """Classification metrics for positive vs. negative edge scores.

    Args:
        pos_score: Tensor of scores for existing edges (label 1).
        neg_score: Tensor of scores for sampled non-edges (label 0).
        threshold: Scores >= threshold are predicted as edges. When None (the
            default, matching the previous behaviour) the midpoint between the
            mean positive and mean negative score is used. That midpoint is
            derived from the labelled groups themselves, so pass a fixed value
            (e.g. 0.0 for logits) for a label-independent threshold.

    Returns:
        Tuple (accuracy, balanced accuracy, F1, precision, ROC AUC). The ROC AUC
        is computed on the raw scores rather than on the thresholded predictions.
    """
    pos = pos_score.detach().cpu()
    neg = neg_score.detach().cpu()
    raw_scores = torch.cat([pos, neg])
    if threshold is None:
        threshold = (torch.mean(pos) + torch.mean(neg)) / 2
    predictions = torch.where(raw_scores >= threshold, 1, 0).numpy()
    labels = torch.cat(
        [torch.ones(pos.shape[0]), torch.zeros(neg.shape[0])]).numpy()
    return (
        accuracy_score(labels, predictions),
        balanced_accuracy_score(labels, predictions),
        f1_score(labels, predictions),
        precision_score(labels, predictions),
        # Ranking metric: needs the continuous scores, not the hard 0/1 predictions.
        roc_auc_score(labels, raw_scores.numpy()),
    )

In [22]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score, balanced_accuracy_score

In [23]:
%%time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score

df_result = pd.DataFrame({"lr", "hidden_dim", "accuracy_score", "balanced_accuracy_score", "f1_score", "precision_score", "roc_auc_score"})

for hidden_dim in [16, 64, 256]:
    for lr in [0.005, 0.01]:
        model = GraphSAGE(train_g.ndata['feat'].shape[1], hidden_dim)
        pred = DotPredictor()

        def compute_loss(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score])
            labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
            return F.binary_cross_entropy_with_logits(scores, labels)

        def compute_auc(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score]).numpy()
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return roc_auc_score(labels, scores)

        def compute_metrics(pos_score, neg_score):
            scores = torch.where(torch.cat([pos_score, neg_score]) >= (torch.mean(pos_score) + torch.mean(neg_score)) / 2, 1, 0)
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return accuracy_score(labels, scores), balanced_accuracy_score(labels, scores), f1_score(labels, scores), precision_score(labels, scores), roc_auc_score(labels, scores)

        optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=lr)

        all_logits = []
        for e in range(300):
            h = model(train_g, train_g.ndata['feat'])
            pos_score = pred(train_pos_g, h)
            neg_score = pred(train_neg_g, h)
            loss = compute_loss(pos_score, neg_score)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            accuracy_scor, balanced_accuracy_scor, f1_scor, precision_scor, _ = compute_metrics(pos_score, neg_score)
            roc_auc_scor = compute_auc(pos_score, neg_score)
            df_result = df_result.append({"lr": lr, "hidden_dim": hidden_dim, "accuracy_score": accuracy_scor, "balanced_accuracy_score": balanced_accuracy_scor, "f1_score": f1_scor, "precision_score": precision_scor, "roc_auc_score": roc_auc_scor}, ignore_index=True)

Wall time: 11min 45s


In [24]:
df_result.iloc[7:, 1:]

Unnamed: 0,accuracy_score,balanced_accuracy_score,f1_score,hidden_dim,lr,precision_score,roc_auc_score
7,0.630872,0.630872,0.429252,16.0,0.005,0.945918,0.747216
8,0.611677,0.611677,0.418499,16.0,0.01,0.832779,0.723422
9,0.607106,0.607106,0.374225,64.0,0.005,0.918864,0.746405
10,0.586832,0.586832,0.30491,64.0,0.01,0.959864,0.745954
11,0.581011,0.581011,0.306771,256.0,0.005,0.887979,0.738173
12,0.409791,0.409791,0.180088,256.0,0.01,0.294835,0.435153


## Модель III

In [25]:
class MLPPredictor(nn.Module):
    """Scores an edge with a two-layer MLP (tanh by default) over the concatenated endpoint embeddings."""

    def __init__(self, h_feats, activation=torch.tanh):
        """Create the MLP.

        Args:
            h_feats: Size of each node embedding; the first layer takes 2 * h_feats inputs.
            activation: Callable applied between the two linear layers. Defaults to
                tanh, so existing `MLPPredictor(h_feats)` calls behave as before.
        """
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)
        self.activation = activation

    def apply_edges(self, edges):
        """Edge UDF: return {'score': tensor} with one score per edge in the batch."""
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        # torch.tanh (the default) replaces the deprecated F.tanh alias;
        # squeeze(1) removes the size-1 output axis of the last linear layer.
        return {'score': self.W2(self.activation(self.W1(h))).squeeze(1)}

    def forward(self, g, h):
        """Return the score of every edge of `g`, given node embeddings `h`."""
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']

In [26]:
%%time
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, roc_auc_score

model = GraphSAGE(train_g.ndata['feat'].shape[1], 16)
pred = DotPredictor()
df_result = pd.DataFrame({"lr", "hidden_dim", "accuracy_score", "balanced_accuracy_score", "f1_score", "precision_score", "roc_auc_score"})

for hidden_dim in [16, 64, 256]:
    for lr in [0.005, 0.01]:
        model = GraphSAGE(train_g.ndata['feat'].shape[1], hidden_dim)
        pred = MLPPredictor(hidden_dim)

        def compute_loss(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score])
            labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
            return F.binary_cross_entropy_with_logits(scores, labels)

        def compute_auc(pos_score, neg_score):
            scores = torch.cat([pos_score, neg_score]).numpy()
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return roc_auc_score(labels, scores)

        def compute_metrics(pos_score, neg_score):
            scores = torch.where(torch.cat([pos_score, neg_score]) >= (torch.mean(pos_score) + torch.mean(neg_score)) / 2, 1, 0)
            labels = torch.cat(
                [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
            return accuracy_score(labels, scores), balanced_accuracy_score(labels, scores), f1_score(labels, scores), precision_score(labels, scores), roc_auc_score(labels, scores)

        optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=lr)

        all_logits = []
        for e in range(50):
            h = model(train_g, train_g.ndata['feat'])
            pos_score = pred(train_pos_g, h)
            neg_score = pred(train_neg_g, h)
            loss = compute_loss(pos_score, neg_score)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            pos_score = pred(test_pos_g, h)
            neg_score = pred(test_neg_g, h)
            accuracy_scor, balanced_accuracy_scor, f1_scor, precision_scor, _ = compute_metrics(pos_score, neg_score)
            roc_auc_scor = compute_auc(pos_score, neg_score)
            df_result = df_result.append({"lr": lr, "hidden_dim": hidden_dim, "accuracy_score": accuracy_scor, "balanced_accuracy_score": balanced_accuracy_scor, "f1_score": f1_scor, "precision_score": precision_scor, "roc_auc_score": roc_auc_scor}, ignore_index=True)

Wall time: 19min 17s


In [27]:
df_result.iloc[7:, 1:]

Unnamed: 0,accuracy_score,balanced_accuracy_score,f1_score,hidden_dim,lr,precision_score,roc_auc_score
7,0.742597,0.742597,0.727301,16.0,0.005,0.773252,0.802886
8,0.746253,0.746253,0.719929,16.0,0.01,0.803261,0.810289
9,0.753425,0.753425,0.729343,64.0,0.005,0.808287,0.823507
10,0.754984,0.754984,0.737978,64.0,0.01,0.793019,0.822192
11,0.747769,0.747769,0.714827,256.0,0.005,0.822209,0.822476
12,0.720377,0.720377,0.652476,256.0,0.01,0.861732,0.814196
