# Imports 🐟

In [1]:
!pip install -qq torch_geometric

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/661.6 kB[0m [31m9.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m655.4/661.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m655.4/661.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
import random

import networkx as nx
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import plotly.express as px

from itertools import combinations
from collections import defaultdict
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, TransformerConv, SAGEConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from IPython.display import Javascript

In [3]:
random.seed(2023)

# SBM graphs generation 🫐

In [4]:
AMOUNT_GRAPHS = 120
VAL_TEST_PART = 0.4

MIN_CLUSTER_SIZE = 30
MAX_CLUSTER_SIZE = 200

In [5]:
PROBS_3_CLUSTERS = [[[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]],
                    [[0.2, 0.001, 0.05], [0.001, 0.35, 0.075], [0.05, 0.075, 0.40]],
                    [[0.25, 0.05, 0.02], [0.05, 0.35, 0.07], [0.02, 0.07, 0.40]],
                    [[0.65, 0.05, 0.001], [0.05, 0.7, 0.15], [0.001, 0.15, 0.85]],
                    [[0.35, 0.15, 0.0002], [0.15, 0.45, 0.0075], [0.0002, 0.0075, 0.50]],
                    [[0.1, 0.60, 0.55], [0.60, 0.25, 0.80], [0.55, 0.80, 0.30]]]

In [6]:
PROBS_5_CLUSTERS = [[[0.75, 0.05, 0.02, 0.01, 0.05], [0.05, 0.85, 0.07, 0.08, 0.01], [0.02, 0.07, 0.90, 0.03, 0.04], [0.01, 0.08, 0.03, 0.65, 0.10], [0.05, 0.01, 0.04, 0.10, 0.95]],
                    [[0.25, 0.15, 0.35, 0.10, 0.55], [0.15, 0.45, 0.25, 0.10, 0.17], [0.35, 0.25, 0.50, 0.25, 0.25], [0.10, 0.10, 0.25, 0.65, 0.15], [0.55, 0.17, 0.25, 0.15, 0.35]],
                    [[0.05, 0.45, 0.35, 0.55, 0.65], [0.45, 0.15, 0.37, 0.88, 0.20], [0.35, 0.37, 0.20, 0.40, 0.50], [0.55, 0.88, 0.40, 0.35, 0.80], [0.65, 0.20, 0.50, 0.80, 0.17]],
                    [[0.95, 0.15, 0.05, 0.03, 0.19], [0.15, 0.45, 0.07, 0.18, 0.40], [0.05, 0.07, 0.80, 0.02, 0.15], [0.03, 0.18, 0.02, 0.75, 0.01], [0.19, 0.40, 0.15, 0.01, 0.57]],
                    [[0.55, 0.33, 0.25, 0.07, 0.39], [0.33, 0.38, 0.17, 0.58, 0.25], [0.25, 0.17, 0.63, 0.35, 0.40], [0.07, 0.58, 0.35, 0.48, 0.28], [0.39, 0.25, 0.40, 0.28, 0.98]],
                    [[1.00, 0.75, 0.55, 0.30, 0.65], [0.75, 0.88, 0.77, 0.98, 0.56], [0.55, 0.77, 0.93, 0.62, 0.45], [0.30, 0.98, 0.62, 0.56, 0.67], [0.65, 0.56, 0.45, 0.67, 0.86]]]

In [7]:
PROBS_10_CLUSTERS = [[[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
                     [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
                     [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
                     [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
                     [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
                     [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
                     [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
                     [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
                     [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
                     [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]],
                     # 2
                     [[0.99, 0.15, 0.25, 0.05, 0.55, 0.375, 0.2, 0.03, 0.125, 0.001], 
                     [0.15, 0.57, 0.47, 0.89, 0.31, 0.11, 0.13, 0.002, 0.14, 0.122], 
                     [0.25, 0.47, 0.33, 0.06, 0.45, 0.023, 0.099, 0.27, 0.033, 0.70],
                     [0.05, 0.89, 0.06, 0.95, 0.40, 0.075, 0.101, 0.25, 0.26, 0.031],
                     [0.55, 0.31, 0.45, 0.40, 0.53, 0.105, 0.47, 0.869, 0.177, 0.099],
                     [0.375, 0.11, 0.023, 0.075, 0.105, 0.748, 0.023, 0.019, 0.0089, 0.25],
                     [0.2, 0.13, 0.099, 0.101, 0.47, 0.023, 0.86, 0.33, 0.49, 0.0502],
                     [0.03, 0.002, 0.27, 0.25, 0.869, 0.019, 0.33, 0.678, 0.501, 0.15],
                     [0.125, 0.14, 0.033, 0.26, 0.177, 0.0089, 0.49, 0.501, 0.745, 0.001],
                     [0.001, 0.122, 0.70, 0.031, 0.099, 0.25, 0.0502, 0.15, 0.001, 0.89995]]]

In [8]:
def get_clusters_probs(amount_clusters):
  if amount_clusters == 3:
    return PROBS_3_CLUSTERS
  elif amount_clusters == 5:
    return PROBS_5_CLUSTERS
  elif amount_clusters == 10:
    return PROBS_10_CLUSTERS
  else:
    return []

In [9]:
def generate_graphs_with_random_props(amount_clusters):
  graphs = []
  clusters_probs = get_clusters_probs(amount_clusters)
  amount_graphs = int(AMOUNT_GRAPHS / 10) if amount_clusters == 10 else AMOUNT_GRAPHS

  for i in range(amount_graphs):
    cluster_sizes = [random.randint(MIN_CLUSTER_SIZE, MAX_CLUSTER_SIZE) for j in range(amount_clusters)]
    probs_index = random.randint(0, len(clusters_probs) - 1)

    G = nx.stochastic_block_model(cluster_sizes, clusters_probs[probs_index])
    graphs.append(G)

  return graphs

In [10]:
def get_graph_true_clusters(G):
  clusters_dict = {}

  for i in range(len(G.graph["partition"])):
    for node in G.graph["partition"][i]:
      clusters_dict[node] = i

  return clusters_dict

# Prepare graphs data ⛲

In [11]:
def get_graph_node_pairs(G):
  clusters_dict = get_graph_true_clusters(G)
  degrees_dict = dict(G.degree())

  combination_target = []
  initial_embeddings = []

  for nodes_pair in combinations(G.nodes(), 2):
    first_node = nodes_pair[0]
    second_node = nodes_pair[1]

    degrees_pair = [degrees_dict[first_node], degrees_dict[second_node]]
    initial_embeddings.append(degrees_pair)
    
    # вершины в одном кластере
    if clusters_dict[first_node] == clusters_dict[second_node]:
      combination_target.append(1)
    else:
      combination_target.append(0)

  embeddings = np.array(initial_embeddings)
  labels = np.array(combination_target)

  return embeddings, labels

In [12]:
def get_graphs_embeddings_labels(graphs):
  embeddings_all = []
  labels_all = []

  for graph in graphs:
    embeddings, labels = get_graph_node_pairs(graph)
    embeddings_all.append(embeddings)
    labels_all.append(labels)

  return embeddings_all, labels_all

# Dataset creation 🍚

In [13]:
BATCH_SIZE = 8

In [14]:
class GraphDataset(InMemoryDataset):
    def __init__(self, graphs, embeddings, labels):
        super(GraphDataset, self).__init__('.', None, None, None)

        data_graphs = []

        for index, graph in enumerate(graphs):
          adj = nx.to_scipy_sparse_array(graph).tocoo()
          row = torch.from_numpy(adj.row.astype(np.int64)).to(torch.long)
          col = torch.from_numpy(adj.col.astype(np.int64)).to(torch.long)
          edge_index = torch.stack([row, col], dim=0)

          x = torch.from_numpy(embeddings[index]).type(torch.float32)
          y = torch.from_numpy(labels[index]).type(torch.long)

          data = Data(edge_index=edge_index,
                      num_nodes=graph.number_of_nodes(),
                      x=x,
                      y=y.clone().detach(),
                      num_classes=2)
          
          data_graphs.append(data)

        self.data, self.slices = self.collate([data_graphs])

    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)

In [15]:
def get_dataset(graphs, embeddings, labels):
  dataset = GraphDataset(graphs, embeddings, labels)

  dataset = dataset.shuffle()

  return dataset

In [16]:
def get_train_val_test_dataset(graphs, embeddings, labels):
  amount_graphs = len(graphs)
  TRAIN_MAX = int(amount_graphs - amount_graphs * VAL_TEST_PART * 2)
  VAL_MAX = int(amount_graphs - amount_graphs * VAL_TEST_PART)

  train_dataset = get_dataset(graphs[:TRAIN_MAX], 
                                embeddings[:TRAIN_MAX], 
                                labels[:TRAIN_MAX])
  val_dataset = get_dataset(graphs[TRAIN_MAX:VAL_MAX], 
                              embeddings[TRAIN_MAX:VAL_MAX], 
                              labels[TRAIN_MAX:VAL_MAX])
  test_dataset = get_dataset(graphs[VAL_MAX:], 
                               embeddings[VAL_MAX:], 
                               labels[VAL_MAX:])
  
  return train_dataset, val_dataset, test_dataset

In [17]:
def get_train_val_test_loaders(train, val, test):
  train_loader = DataLoader(train, batch_size=BATCH_SIZE)
  val_loader = DataLoader(val, batch_size=BATCH_SIZE)
  test_loader = DataLoader(test, batch_size=BATCH_SIZE)

  return train_loader, val_loader, test_loader

# Nets 🍄

In [18]:
class ConvNet(torch.nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = GCNConv(2, 64)
        self.conv2 = GCNConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

In [19]:
class GATNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(2, 64)
        self.conv2 = GATConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [20]:
class TransformerNet(torch.nn.Module):
  def __init__(self):
        super().__init__()
        self.conv1 = TransformerConv(2, 256, dropout=0.6) 
        self.conv2 = TransformerConv(256, 2, dropout=0.5) 

  def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [21]:
class SageNet(torch.nn.Module):
  def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(2, 64) 
        self.conv2 = SAGEConv(64, 2)

  def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

# Train helpers 🍏

In [22]:
EPOCHS = 30

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [24]:
def train_epoch(model, train_loader):
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=0.01,
                               weight_decay=5e-4)

  criterion = torch.nn.CrossEntropyLoss()

  model.train()

  losses = []
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []

  for train_loader_ in train_loader:
    for data in train_loader_:
      data = data.to(device)
      optimizer.zero_grad()
      output = model(data)
      _, preds = torch.max(output, dim=1)
      label = data.y.to(device)
      loss = criterion(output, label.long())
      loss.backward()
      optimizer.step()

      precision_scores.append(precision_score(label, preds, zero_division=0))
      recall_scores.append(recall_score(label, preds, zero_division=0))
      roc_auc_scores.append(roc_auc_score(label, preds))
      losses.append(loss.item())

  return np.mean(precision_scores), np.mean(recall_scores), np.mean(roc_auc_scores), np.mean(losses)

In [25]:
def evaluate(model, loader):
  criterion = torch.nn.CrossEntropyLoss()

  model.eval()

  losses = []
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []

  with torch.no_grad():
    for loader_ in loader:
      for data in loader_:
        data = data.to(device)
        output = model(data)
        _, preds = torch.max(output, dim=1)
        label = data.y.to(device)
        loss = criterion(output, label.long())

        precision_scores.append(precision_score(label, preds, zero_division=0))
        recall_scores.append(recall_score(label, preds, zero_division=0))
        roc_auc_scores.append(roc_auc_score(label, preds))
        losses.append(loss.item())
    
    return np.mean(precision_scores), np.mean(recall_scores), np.mean(roc_auc_scores), np.mean(losses)

In [26]:
def run_epochs(model, train_loader, val_loader, test_loader):
  history = defaultdict(list)

  for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_precision, train_recall, train_roc_auc, train_loss = train_epoch(model, train_loader)
    print(f'Train loss {train_loss} precision {train_precision} recall {train_recall} roc auc {train_roc_auc}')

    val_precision, val_recall, val_roc_auc, val_loss = evaluate(model,val_loader)
    print(f'Val loss {val_loss} precision {val_precision} recall {val_recall} roc auc {val_roc_auc}')

    test_precision, test_recall, test_roc_auc, test_loss = evaluate(model, test_loader)
    print(f'Test loss {test_loss} precision {test_precision} recall {test_recall} roc auc {test_roc_auc}')
    print()

    history['train_precision'].append(train_precision)
    history['train_recall'].append(train_recall)
    history['train_roc_auc'].append(train_roc_auc)
    history['train_loss'].append(train_loss)

    history['val_precision'].append(val_precision)
    history['val_recall'].append(val_recall)
    history['val_roc_auc'].append(val_roc_auc)
    history['val_loss'].append(val_loss)

    history['test_precision'].append(test_precision)
    history['test_recall'].append(test_recall)
    history['test_roc_auc'].append(test_roc_auc)
    history['test_loss'].append(test_loss)

  return history

In [27]:
def plot_accuracy_scores(history, metric):
  accuracy_scores = pd.DataFrame([history[f'train_{metric}'], 
                                  history[f'val_{metric}'],
                                  history[f'test_{metric}']], 
                                 index=['Train', 'Val', 'Test']).T
  accuracy_scores['index'] = [i + 1 for i in range(accuracy_scores.shape[0])]

  accuracy_scores_melt = accuracy_scores.melt(id_vars='index', value_vars=['Train', 'Val', 'Test'])
      
  fig = px.line(accuracy_scores_melt, 
                x='index', y='value', 
                title='Training history', 
                color='variable', labels={'value': 'Score', 'index': 'Epoch'})
  fig.show()

# Test helpers 🍋

In [28]:
CLUSTERS_PROBS_TEST_3 = [[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]]

In [29]:
CLUSTERS_SIZES_TEST_3 = [[10, 10, 10], 
                         [30, 10, 20],
                         [20, 50, 30],
                         [10, 100, 50],
                         [100, 100, 10],
                         [150, 40, 110],
                         [300, 150, 50],
                         [200, 80, 260],
                         [250, 250, 100],
                         [150, 450, 90]]

In [30]:
CLUSTERS_PROBS_TEST_5 = [[0.75, 0.05, 0.02, 0.01, 0.05], 
             [0.05, 0.85, 0.07, 0.08, 0.01], 
             [0.02, 0.07, 0.90, 0.03, 0.04],
             [0.01, 0.08, 0.03, 0.65, 0.10],
             [0.05, 0.01, 0.04, 0.10, 0.95]]

In [31]:
CLUSTERS_SIZES_TEST_5 = [[5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5],
                    [20, 50, 30, 40, 10],
                    [10, 100, 15, 40, 15],
                    [100, 70, 10, 30, 20],
                    [140, 20, 100, 40, 50],
                    [200, 100, 150, 50, 90],
                    [220, 50, 260, 120, 70],
                    [200, 250, 100, 300, 90],
                    [100, 440, 90, 210, 140]]

In [32]:
CLUSTERS_PROBS_TEST_10 = [[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
            [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
            [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
            [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
            [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
            [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
            [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
            [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
            [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
            [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]]

In [33]:
CLUSTERS_SIZES_TEST_10 = [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20, 10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5, 10, 40, 30, 50, 10],
                    [20, 50, 30, 40, 10, 80, 70, 5, 60, 100],
                    [10, 100, 15, 40, 15, 90, 60, 50, 110, 20],
                    [140, 20, 100, 40, 50, 10, 50, 50, 80, 95],
                    [100, 70, 10, 30, 20, 150, 90, 80, 170, 50],
                    [200, 100, 150, 50, 90, 10, 70, 40, 90, 100],
                    [220, 50, 260, 120, 70, 40, 105, 15, 25, 75],
                    [200, 250, 100, 300, 90, 120, 90, 50, 100, 30],
                    [100, 240, 90, 210, 140, 195, 45, 175, 45, 110]
                    ]

In [34]:
def get_nn_metrics_df(precision, recall, roc_auc, graphs_nodes_amount):
  return pd.DataFrame(np.concatenate((np.transpose([precision]), np.transpose([recall]), np.transpose([roc_auc])), axis=1), 
                      index=graphs_nodes_amount,
                      columns=['Precision', 'Recall', 'Roc AUC'])

In [35]:
def perform_model_testing(model, sizes, probs):
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []

  graphs_test = []

  for size in sizes:
    graph = nx.stochastic_block_model(size, probs, seed=0)
    graphs_test.append(graph)

    embeddings, labels = get_graphs_embeddings_labels([graph])

    dataset = get_dataset([graph], embeddings, labels)

    dataset_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

    precision, recall, roc_auc, _ = evaluate(model, dataset_loader)

    precision_scores.append(precision)
    recall_scores.append(recall)
    roc_auc_scores.append(roc_auc)

  graphs_nodes_amount = [len(graph.nodes()) for graph in graphs_test]

  metrics_df = get_nn_metrics_df(precision_scores,
                                 recall_scores,
                                 roc_auc_scores,
                                 graphs_nodes_amount)

  return metrics_df

# Create data for Nets 🦛

In [None]:
graphs_3 = generate_graphs_with_random_props(3)
graphs_5 = generate_graphs_with_random_props(5)

graphs_all = graphs_3 + graphs_5

In [None]:
embeddings_all, labels_all = get_graphs_embeddings_labels(graphs_all)

In [None]:
train_dataset_all, val_dataset_all, test_dataset_all = get_train_val_test_dataset(graphs_all, 
                                                                                  embeddings_all, 
                                                                                  labels_all)

In [None]:
train_loader_all, val_loader_all, test_loader_all = get_train_val_test_loaders(train_dataset_all, 
                                                                               val_dataset_all, 
                                                                               test_dataset_all)

In [None]:
del graphs_all
del embeddings_all
del labels_all
del train_dataset_all
del val_dataset_all
del test_dataset_all

# Convolution network 🌀

In [None]:
conv_model = ConvNet().to(device)

## Training

In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history = run_epochs(conv_model, train_loader_all, val_loader_all, test_loader_all)

<IPython.core.display.Javascript object>

Epoch 1/30
----------
Train loss 2.4329566160837808 precision 0.5201373607427779 recall 0.5455330896302092 roc auc 0.6132965918934068
Val loss 0.473647718783468 precision 0.6588601888035214 recall 0.5779341098575402 roc auc 0.697467538049393
Test loss 0.45942750045408803 precision 0.48113157602049866 recall 0.657439539050066 roc auc 0.698398934609966

Epoch 2/30
----------
Train loss 0.5115990142027537 precision 0.644639716786947 recall 0.6528607365571254 roc auc 0.7078101732667929
Val loss 0.4661730535638829 precision 0.6259195158425032 recall 0.7470492281214479 roc auc 0.7390746672559061
Test loss 0.45146849332377315 precision 0.4672740147870063 recall 0.784500390056872 roc auc 0.736284681939931

Epoch 3/30
----------
Train loss 0.4906442022571961 precision 0.653415984017988 recall 0.7485412183699683 roc auc 0.7435263537280509
Val loss 0.4995042846227686 precision 0.6007608963603318 recall 0.6318619962852875 roc auc 0.6833118354571172
Test loss 0.5092953802086413 precision 0.38897925

In [None]:
plot_accuracy_scores(history, 'recall')

In [None]:
plot_accuracy_scores(history, 'precision')

In [None]:
plot_accuracy_scores(history, 'roc_auc')

In [None]:
torch.save(conv_model.state_dict(), '/content/drive/MyDrive/data/conv_model')

## Testing

### Clusters = 3

In [38]:
metrcis_df_3 = perform_model_testing(conv_model, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [39]:
metrcis_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.319905,1.0,0.521667
60,0.550133,0.925373,0.732232
100,0.977398,0.958378,0.972576
160,1.0,0.959968,0.979984
210,0.709902,0.921267,0.804633
300,0.53569,0.921894,0.694349
500,0.99998,0.87579,0.937888
540,0.999981,0.928486,0.964237
600,0.950041,0.902991,0.937313
690,0.99769,0.862183,0.930137


In [40]:
fig = px.line(metrcis_df_3, title="Conv model: 3 кластера").update_layout(xaxis_title="Количество узлов",
                                                                          yaxis_title="Значение метрики")   
fig.show()

### Clusters = 5

In [41]:
metrcis_df_5 = perform_model_testing(conv_model, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [42]:
metrcis_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.173759,0.98,0.524
60,0.393182,0.935135,0.776853
80,0.446021,0.921019,0.771457
150,0.57655,0.88972,0.842036
180,0.939549,0.919298,0.942168
230,0.553075,0.881892,0.784498
350,0.545978,0.910933,0.810863
590,0.490232,0.882002,0.79822
720,0.871246,0.882915,0.91796
940,0.520814,0.805352,0.787257


In [43]:
fig = px.line(metrcis_df_5, title="Conv model: 5 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                           yaxis_title="Значение метрики")   
fig.show()

### Clusters = 10

In [44]:
metrcis_df_10 = perform_model_testing(conv_model, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [45]:
metrcis_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.094923,0.86,0.565556
120,0.168322,0.847297,0.681617
220,0.390044,0.848567,0.818411
465,0.39218,0.867089,0.825197
510,0.404565,0.877465,0.826836
635,0.38682,0.848453,0.820076
770,0.376284,0.810366,0.794264
900,0.395571,0.835435,0.82055
980,0.625247,0.845639,0.873642
1330,0.413408,0.793604,0.806131


In [46]:
fig = px.line(metrcis_df_10, title="Conv model: 10 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                             yaxis_title="Значение метрики")   
fig.show()

# Attention network 🍈

In [None]:
gat_model = GATNet().to(device)

## Training

In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history_gat = run_epochs(gat_model, train_loader_all, val_loader_all, test_loader_all)

<IPython.core.display.Javascript object>

Epoch 1/30
----------
Train loss 3.509954566756884 precision 0.4695148091702008 recall 0.491307097559869 roc auc 0.5710666705290682
Val loss 0.48384990356862545 precision 0.6897201016012495 recall 0.520437079946405 roc auc 0.6877168451534468
Test loss 0.44957825712238747 precision 0.5149823219546317 recall 0.6871065024267405 roc auc 0.7249941642236438

Epoch 2/30
----------
Train loss 0.5643185190856457 precision 0.43968859111035224 recall 0.353137082309674 roc auc 0.6015827958028737
Val loss 0.5024797801549236 precision 0.5607756312475887 recall 0.9433586644086919 roc auc 0.757830640296851
Test loss 0.539443323854357 precision 0.39635207822403506 recall 0.9922968514289354 roc auc 0.7347379640443896

Epoch 3/30
----------
Train loss 0.49458719603717327 precision 0.6380182761389492 recall 0.7712579643399732 roc auc 0.7410069123010231
Val loss 0.4917348933716615 precision 0.6040577053183386 recall 0.6954760054192928 roc auc 0.7070102806345077
Test loss 0.4902336091424028 precision 0.4051

In [None]:
plot_accuracy_scores(history_gat, 'recall')

In [None]:
plot_accuracy_scores(history_gat, 'precision')

In [None]:
plot_accuracy_scores(history_gat, 'roc_auc')

In [None]:
torch.save(gat_model.state_dict(), '/content/drive/MyDrive/data/gat_model')

## Testing

### Clusters = 3

In [48]:
metrcis_gat_df_3 = perform_model_testing(gat_model, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [49]:
metrcis_gat_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.34264,1.0,0.568333
60,0.537866,0.932836,0.722327
100,0.952948,0.952432,0.962184
160,1.0,0.958842,0.979421
210,0.712366,0.916943,0.805055
300,0.520071,0.933983,0.679427
500,0.99915,0.944541,0.97193
540,0.999982,0.969787,0.984888
600,0.877247,0.962426,0.940991
690,0.995464,0.974545,0.985149


In [50]:
fig = px.line(metrcis_gat_df_3, title="Attention model: 3 кластера").update_layout(xaxis_title="Количество узлов",
                                                                                   yaxis_title="Значение метрики")   
fig.show()

### Clusters = 5

In [51]:
metrcis_gat_df_5 = perform_model_testing(gat_model, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [55]:
metrcis_gat_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.176259,0.98,0.532
60,0.393143,0.92973,0.775222
80,0.433951,0.924841,0.763052
150,0.553605,0.884112,0.82988
180,0.937841,0.920134,0.942042
230,0.531248,0.887617,0.771868
350,0.528284,0.912593,0.800742
590,0.496225,0.92081,0.814873
720,0.876051,0.948191,0.949936
940,0.530154,0.896776,0.824591


In [52]:
fig = px.line(metrcis_gat_df_5, title="Attention model: 5 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                    yaxis_title="Значение метрики")   
fig.show()

### Clusters = 10

In [53]:
metrcis_gat_df_10 = perform_model_testing(gat_model, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [56]:
metrcis_gat_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.096663,0.84,0.571111
120,0.168317,0.85,0.682188
220,0.393841,0.844947,0.818719
465,0.390987,0.864686,0.823753
510,0.390479,0.88362,0.822303
635,0.365672,0.867346,0.817161
770,0.377084,0.87349,0.817593
900,0.370562,0.890215,0.830008
980,0.626728,0.918612,0.906222
1330,0.40829,0.888282,0.840483


In [54]:
fig = px.line(metrcis_gat_df_10, title="Attention model: 10 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                      yaxis_title="Значение метрики")   
fig.show()

# Transformer network 🚚

In [None]:
transformer_model = TransformerNet().to(device)

## Training

In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history_transformer = run_epochs(transformer_model, train_loader_all, val_loader_all, test_loader_all)

<IPython.core.display.Javascript object>

Epoch 1/30
----------
Train loss 12.941303960978985 precision 0.5626717723774861 recall 0.4942322173342388 roc auc 0.6105520797115502
Val loss 8.546822026371956 precision 0.0 recall 0.0 roc auc 0.5
Test loss 9.307845289508501 precision 0.0 recall 0.0 roc auc 0.5

Epoch 2/30
----------
Train loss 5.009331771483024 precision 0.6406174218545618 recall 0.5826067808112609 roc auc 0.6732576698799161
Val loss 4.701501826445262 precision 0.0 recall 0.0 roc auc 0.5
Test loss 5.255843209723632 precision 0.0 recall 0.0 roc auc 0.5

Epoch 3/30
----------
Train loss 3.058244079894697 precision 0.5929660044596763 recall 0.5995692064945192 roc auc 0.6722585299888252
Val loss 0.5264393125350276 precision 0.6135662347525985 recall 0.03213224716582903 roc auc 0.508728178192687
Test loss 0.46761110983788967 precision 0.3459287784777964 recall 0.0019520975726100632 roc auc 0.5002889874689709

Epoch 4/30
----------
Train loss 0.7156224728872379 precision 0.6401429770516592 recall 0.6484938733487513 roc auc

In [None]:
plot_accuracy_scores(history_transformer, 'recall')

In [None]:
plot_accuracy_scores(history_transformer, 'precision')

In [None]:
plot_accuracy_scores(history_transformer, 'roc_auc')

In [None]:
torch.save(transformer_model.state_dict(), '/content/drive/MyDrive/data/transformer_model')

## Testing

### Clusters = 3

In [58]:
metrcis_transformer_df_3 = perform_model_testing(transformer_model, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [59]:
metrcis_transformer_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.376437,0.97037,0.623519
60,0.533276,0.920896,0.714993
100,0.955176,0.967568,0.970235
160,1.0,0.977331,0.988666
210,0.660091,0.952137,0.772902
300,0.539741,0.96546,0.708046
500,1.0,0.988559,0.994279
540,1.0,0.989247,0.994624
600,0.742465,0.990893,0.892793
690,0.996417,0.993116,0.99485


In [60]:
fig = px.line(metrcis_transformer_df_3, title="Transformer model: 3 кластера").update_layout(xaxis_title="Количество узлов",
                                                                                             yaxis_title="Значение метрики")   
fig.show()

### Clusters = 5

In [61]:
metrcis_transformer_df_5 = perform_model_testing(transformer_model, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [62]:
metrcis_transformer_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.190083,0.92,0.568
60,0.401949,0.891892,0.770589
80,0.425684,0.912102,0.752683
150,0.55326,0.900935,0.835997
180,0.939194,0.949708,0.956681
230,0.546688,0.913006,0.790301
350,0.548786,0.952237,0.826662
590,0.508931,0.964853,0.837491
720,0.871258,0.979968,0.963907
940,0.561588,0.959725,0.86316


In [63]:
fig = px.line(metrcis_transformer_df_5, title="Transformer model: 5 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                              yaxis_title="Значение метрики")   
fig.show()

### Clusters = 10

In [64]:
metrcis_transformer_df_10 = perform_model_testing(transformer_model, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [65]:
metrcis_transformer_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.098684,0.75,0.570556
120,0.171064,0.832432,0.683013
220,0.382942,0.86546,0.821466
465,0.370213,0.89773,0.825738
510,0.396879,0.924726,0.840605
635,0.384963,0.924467,0.847859
770,0.380467,0.931181,0.840382
900,0.399238,0.94537,0.864402
980,0.631575,0.962083,0.926589
1330,0.414338,0.9532,0.868111


In [66]:
fig = px.line(metrcis_transformer_df_10, title="Transformer model: 10 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                        yaxis_title="Значение метрики")   
fig.show()

# GraphSAGE network 🐒

In [None]:
sage_model = SageNet().to(device)

## Training

In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history_sage = run_epochs(sage_model, train_loader_all, val_loader_all, test_loader_all)

In [None]:
plot_accuracy_scores(history_sage, 'recall')

In [None]:
plot_accuracy_scores(history_sage, 'precision')

In [None]:
plot_accuracy_scores(history_sage, 'roc_auc')

In [None]:
torch.save(sage_model.state_dict(), '/content/drive/MyDrive/data/sage_model')

## Testing

### Clusters = 3

In [68]:
metrcis_sage_df_3 = perform_model_testing(sage_model, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [69]:
metrcis_sage_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.377907,0.962963,0.624815
60,0.539654,0.883582,0.712246
100,0.974359,0.965405,0.975122
160,1.0,0.983601,0.991801
210,0.632401,0.956963,0.747982
300,0.528985,0.967409,0.696307
500,1.0,0.988017,0.994009
540,1.0,0.991415,0.995708
600,0.728503,0.991696,0.885466
690,0.99449,0.994054,0.994393


In [70]:
fig = px.line(metrcis_sage_df_3, title="GraphSAGE model: 3 кластера").update_layout(xaxis_title="Количество узлов",
                                                                                    yaxis_title="Значение метрики")   
fig.show()

### Clusters = 5

In [71]:
metrcis_sage_df_5 = perform_model_testing(sage_model, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [72]:
metrcis_sage_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.188285,0.9,0.562
60,0.405743,0.878378,0.769189
80,0.442812,0.882803,0.757822
150,0.561708,0.894953,0.837594
180,0.943331,0.959566,0.962746
230,0.540497,0.920224,0.788363
350,0.539812,0.956563,0.822614
590,0.507704,0.972876,0.839578
720,0.8651,0.983732,0.964241
940,0.545416,0.96356,0.856686


In [73]:
fig = px.line(metrcis_sage_df_5, title="GraphSAGE model: 5 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                     yaxis_title="Значение метрики")   
fig.show()

### Clusters = 10

In [74]:
metrcis_sage_df_10 = perform_model_testing(sage_model, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [75]:
metrcis_sage_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.101671,0.73,0.578333
120,0.174296,0.802703,0.681508
220,0.382494,0.866063,0.821479
465,0.357939,0.913151,0.824514
510,0.38872,0.941628,0.842515
635,0.366305,0.940804,0.844366
770,0.377362,0.944525,0.843573
900,0.371185,0.955323,0.854473
980,0.617207,0.968048,0.925771
1330,0.398845,0.956406,0.86213


In [76]:
fig = px.line(metrcis_sage_df_10, title="GraphSAGE model: 10 кластеров").update_layout(xaxis_title="Количество узлов",
                                                                                       yaxis_title="Значение метрики")   
fig.show()