# Import 🚀

In [1]:
!pip install -qq torch_geometric

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/661.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m655.4/661.6 kB[0m [31m19.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
import torch
import random

import networkx as nx
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import plotly.express as px

from itertools import combinations, permutations
from collections import defaultdict
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATConv, TransformerConv, SAGEConv
from sklearn.metrics import f1_score, rand_score

In [3]:
random.seed(2023)

# Clustering helpers 🐆

In [4]:
# Изменение значений кластеров на 1, кроме кластера-исключения
def update_rest_values_by_one(values_dict, except_key, add=False):
  increment = 1 if add else -1
  for key, value in values_dict.items():
    if key != except_key:
      if value == 0 and increment > 0 or value > 0:
        values_dict[key] = value + increment

In [5]:
# Поиск кластера с наибольшим числом голосов. Если таких кластеров несколько, вернуть None и список этих кластеров
def find_cluster_assign(values_dict):
  if len(values_dict) == 1:
    # if values_dict[next(iter(values_dict))] == 0:
    #   return None, []
    return max(values_dict, key=values_dict.get), []
  max_value = max(values_dict.values())
  keys_with_max = [key for key, value in values_dict.items() if value == max_value]
  if len(keys_with_max) > 1:
    return None, keys_with_max
  else:
    if max_value == 0:
      return None, keys_with_max
    return keys_with_max[0], []

In [6]:
# Получение кластера для не рассмотренного узла из рассмотренного. Если кластеров с максимум несколько, вернуть их
def get_existing_node_cluster(values_dict):
  node_cluster, keys_with_max = find_cluster_assign(values_dict)
  if node_cluster == None:
    return {key: values_dict[key] for key in keys_with_max}
  else:
    return {node_cluster: 1}

In [7]:
# После посещения ранее не рассмторенного узла, необходимо увеличить вес его кластера
def update_weight_after_adding_unseen_node(values_dict, node_cluster_dict):
  if len(node_cluster_dict) == 1:
    node_cluster = next(iter(node_cluster_dict))
    values_dict[node_cluster] += 1

In [8]:
# Добавить в кластеры назначение для расмотренной и не рассмотренной вершин
def add_nodes_pair_seen_unseen(cluster_assign, first_node, second_node):
  second_node_cluster = get_existing_node_cluster(cluster_assign[first_node])
  cluster_assign[second_node] = second_node_cluster
  update_weight_after_adding_unseen_node(cluster_assign[first_node], second_node_cluster)

In [9]:
# Добавить в кластеры назначение для расмотренной и не рассмотренной вершин не из одного кластера
def add_nodes_pair_seen_unseen_not_in(cluster_assign, first_node, second_node, current_cluster):
  node_cluster, _ = find_cluster_assign(cluster_assign[first_node])
  if node_cluster != None:
    cluster_assign[first_node][node_cluster] += 1
  cluster_assign[second_node] = {current_cluster: 1}

In [10]:
# Подтвердить назначение кластеру, увеличив его на 1, а остальные назначения уменьшив на 1
def accept_assign_to_cluster(cluster_assign, node, node_cluster):
  cluster_assign[node][node_cluster] += 1
  update_rest_values_by_one(cluster_assign[node], node_cluster)

In [11]:
# Опровергнуть назначение кластеру, уменьшив его на 1, а остальные назначения увеличив на 1
def discard_assign_to_cluster(cluster_assign, node, node_cluster):
  if cluster_assign[node][node_cluster] > 0:
    cluster_assign[node][node_cluster] -= 1
  update_rest_values_by_one(cluster_assign[node], node_cluster, add=True)

In [12]:
# Добавить новый кластер или увеличить его значение, а остальные назначения уменьшить на 1
# Если передана еще одна вершина, для нее значение этого кластера уменьшить на 1
def add_new_cluster(cluster_assign, node, new_cluster, another_node=None):
  if another_node != None and cluster_assign[another_node][new_cluster] > 0:
    cluster_assign[another_node][new_cluster] -= 1
  if new_cluster in cluster_assign[node]:
    cluster_assign[node][new_cluster] += 1
  else:
    cluster_assign[node][new_cluster] = 1
  update_rest_values_by_one(cluster_assign[node], new_cluster)

In [13]:
# Уменьшает значение кластера для одной вершины, а другой увеличивает
def downvote_cluster(cluster_assign, node, new_cluster, another_node=None):
  cluster_assign[another_node][new_cluster] += 1
  if new_cluster in cluster_assign[node] and cluster_assign[node][new_cluster] > 0:
    cluster_assign[node][new_cluster] -= 1
  update_rest_values_by_one(cluster_assign[node], new_cluster, add=True)

In [14]:
# Увеличивает значение кластера той вершине, где он максимален.
# Для другой убираем остальные назначения и оставляем только этот кластер.
def assign_to_max_cluster(cluster_assign, first_node, second_node, max_cluster):
  cluster_assign[first_node][max_cluster] += 1
  if max_cluster in cluster_assign[second_node]:
    cluster_assign[second_node][max_cluster] += 1
  else:
    cluster_assign[second_node][max_cluster] = 1
  cluster_value = cluster_assign[second_node][max_cluster]
  cluster_assign[second_node].clear()
  cluster_assign[second_node][max_cluster] = cluster_value

In [15]:
def perform_clustering(G, pairs_classification, with_permutations):
  cluster_assign = {}
  current_cluster = 0

  for index, pair in enumerate(permutations(G.nodes(), 2) if with_permutations else combinations(G.nodes(), 2)):
    first_node = pair[0]
    second_node = pair[1]

    # Обе вершины классифицированы как принадлежащие одному кластеру
    if pairs_classification[index] == 1:
      # Обе вершины уже были встречены ранее
      if first_node in cluster_assign and second_node in cluster_assign:
        # Получим их текущие назначения кластеров
        first_node_cluster, _ = find_cluster_assign(cluster_assign[first_node])
        second_node_cluster, _ = find_cluster_assign(cluster_assign[second_node])
        # Если назначения кластеров равны и они оба не None, то мы подтверждаем назначение этому кластеру,
        # то есть увеличиваем его на 1, а остальные назначения уменьшаем на 1
        if first_node_cluster == second_node_cluster and first_node_cluster != None:
          accept_assign_to_cluster(cluster_assign, first_node, first_node_cluster)
          accept_assign_to_cluster(cluster_assign, second_node, second_node_cluster)
        # Назначения кластерам не равны
        if first_node_cluster != second_node_cluster:
          # Оба кластера известны, тогда assign_to_max_cluster: Отнести вершину к кластеру с максимумом
          if first_node_cluster != None and second_node_cluster != None:
            if cluster_assign[first_node][first_node_cluster] > cluster_assign[second_node][second_node_cluster]:
              assign_to_max_cluster(cluster_assign, first_node, second_node, first_node_cluster)
            elif cluster_assign[first_node][first_node_cluster] < cluster_assign[second_node][second_node_cluster]:
              assign_to_max_cluster(cluster_assign, second_node, first_node, second_node_cluster)
          # Один кластер известен, а другой нет. Тогда для вершины с известным кластером вес уменьшаем, 
          # так как эта ситуация не дает нам уверенности в принадлежности к этому кластеру. 
          # Для вершины с неизвестным кластером увеличиваем (или добавляем) вес известному кластеру, остальные уменьшаем
          if first_node_cluster != None and second_node_cluster == None:
            add_new_cluster(cluster_assign, second_node, first_node_cluster, first_node)
          if first_node_cluster == None and second_node_cluster != None:
            add_new_cluster(cluster_assign, first_node, second_node_cluster, second_node)
      # Обеим вершинам не были назначены кластеры
      if first_node not in cluster_assign and second_node not in cluster_assign:
        cluster_assign[first_node] = {current_cluster: 1}
        cluster_assign[second_node] = {current_cluster: 1}
        current_cluster += 1
      # Первой вершине был назначен кластер, а второй еще нет
      if first_node in cluster_assign and second_node not in cluster_assign:
        add_nodes_pair_seen_unseen(cluster_assign, first_node, second_node)
      # Второй вершине был назначен кластер, а первой еще нет
      if second_node in cluster_assign and first_node not in cluster_assign:
        add_nodes_pair_seen_unseen(cluster_assign, second_node, first_node)

    # Вершины не были классифицированы как принадлежащие одному кластеру
    if pairs_classification[index] == 0:
      # Обе вершины уже были встречены ранее
      if first_node in cluster_assign and second_node in cluster_assign:
        # Получим их текущие назначения кластеров
        first_node_cluster, _ = find_cluster_assign(cluster_assign[first_node])
        second_node_cluster, _ = find_cluster_assign(cluster_assign[second_node])
        # Если назначения кластеров равны и они оба не None, то мы уменьшаем назначение этому кластеру, 
        # а остальные назначения увеличиваем на 1. При этом второй вершине назначаем следующий
        # после наибольшего для первой вершины кластер
        if first_node_cluster == second_node_cluster and first_node_cluster != None:
          discard_assign_to_cluster(cluster_assign, first_node, first_node_cluster)
          discard_assign_to_cluster(cluster_assign, second_node, second_node_cluster)
          if first_node_cluster + 1 not in cluster_assign[second_node]:
            cluster_assign[second_node][first_node_cluster + 1] = 1
          else:
            cluster_assign[second_node][first_node_cluster + 1] += 1
        # Назначения кластерам не равны
        if first_node_cluster != second_node_cluster:
          # Один кластер известен, а другой нет. Тогда для вершины с известным кластером вес увеличиваем.
          # Для вершины с неизвестным кластером уменьшаем вес известному кластеру, остальные увеличиваем
          if first_node_cluster != None and second_node_cluster == None:
            downvote_cluster(cluster_assign, second_node, first_node_cluster, first_node)
          if first_node_cluster == None and second_node_cluster != None:
            downvote_cluster(cluster_assign, first_node, second_node_cluster, second_node)
      # Обеим вершинам не были назначены кластеры
      if first_node not in cluster_assign and second_node not in cluster_assign:
        cluster_assign[first_node] = {current_cluster: 1}
        cluster_assign[second_node] = {current_cluster + 1: 1}
        current_cluster += 2
      # Первой вершине был назначен кластер, а второй еще нет, 
      # назначить вершине следующий кластер после макимального для первой
      if first_node in cluster_assign and second_node not in cluster_assign:
        first_node_cluster, _ = find_cluster_assign(cluster_assign[first_node])
        if first_node_cluster != None:
          cluster_assign[second_node] = {first_node_cluster + 1: 1}
        else:
          cluster_assign[second_node] = {current_cluster: 1}
          current_cluster += 1
      # Второй вершине был назначен кластер, а первой еще нет
      if second_node in cluster_assign and first_node not in cluster_assign:
        second_node_cluster, _ = find_cluster_assign(cluster_assign[second_node])
        if second_node_cluster != None:
          cluster_assign[first_node] = {second_node_cluster + 1: 1}
        else:
          cluster_assign[first_node] = {current_cluster: 1}
          current_cluster += 1


  return cluster_assign

In [16]:
def get_nodes_clusters(cluster_assign):
  nodes_clusters = {}
  for node, assign in cluster_assign.items():
    nodes_clusters[node] = max(assign, key=assign.get)

  return nodes_clusters

In [17]:
def restore_missing_clusters(cluster_assign):
  clusters = list(cluster_assign.values())
  full_sequence = set(range(0, clusters[-1] + 1))
  clusters_set = set(clusters)
  missing_clusters = list(clusters_set ^ full_sequence)

  for key, value in cluster_assign.items():
    for missing in missing_clusters:
      if value > missing:
        cluster_assign[key] -= 1
      else:
        break

In [18]:
def clustering_pipeline(G, pairs_classification, with_permutations):
  cluster_assign = perform_clustering(G, pairs_classification, with_permutations)
  nodes_clusters = get_nodes_clusters(cluster_assign)
  restore_missing_clusters(nodes_clusters)
  return nodes_clusters

# Prepare graphs data ⛲

In [19]:
def get_graph_true_clusters(G):
  clusters_dict = {}

  for i in range(len(G.graph["partition"])):
    for node in G.graph["partition"][i]:
      clusters_dict[node] = i

  return clusters_dict

In [20]:
def get_graph_node_pairs(G, with_permutations):
  degrees_dict = dict(G.degree())

  initial_embeddings = []

  nodes_combinations = permutations(G.nodes(), 2) if with_permutations else combinations(G.nodes(), 2)

  for nodes_pair in nodes_combinations:
    first_node = nodes_pair[0]
    second_node = nodes_pair[1]

    degrees_pair = [degrees_dict[first_node], degrees_dict[second_node]]

    initial_embeddings.append(degrees_pair)

  embeddings = np.array(initial_embeddings)

  return embeddings

In [21]:
def get_graphs_embeddings(graphs, with_permutations):
  embeddings_all = []

  for graph in graphs:
    embeddings = get_graph_node_pairs(graph, with_permutations)
    embeddings_all.append(embeddings)

  return embeddings_all

# Dataset creation 🍚

In [22]:
BATCH_SIZE = 8

In [23]:
class GraphDataset(InMemoryDataset):
    def __init__(self, graphs, embeddings):
        super(GraphDataset, self).__init__('.', None, None, None)

        data_graphs = []

        for index, graph in enumerate(graphs):
          adj = nx.to_scipy_sparse_array(graph).tocoo()
          row = torch.from_numpy(adj.row.astype(np.int64)).to(torch.long)
          col = torch.from_numpy(adj.col.astype(np.int64)).to(torch.long)
          edge_index = torch.stack([row, col], dim=0)

          x = torch.from_numpy(embeddings[index]).type(torch.float32)

          data = Data(edge_index=edge_index,
                      num_nodes=graph.number_of_nodes(),
                      x=x,
                      num_classes=2)
          
          data_graphs.append(data)

        self.data, self.slices = self.collate([data_graphs])

    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)

# Load model 🧴

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [25]:
# GCN model with 2 layers 
class ConvNet(torch.nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.conv1 = GCNConv(2, 64)
        self.conv2 = GCNConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [26]:
class GATNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GATConv(2, 64)
        self.conv2 = GATConv(64, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [27]:
class TransformerNet(torch.nn.Module):
  def __init__(self):
        super().__init__()
        self.conv1 = TransformerConv(2, 256, dropout=0.6) 
        self.conv2 = TransformerConv(256, 2, dropout=0.5) 

  def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

In [28]:
class SageNet(torch.nn.Module):
  def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(2, 64) 
        self.conv2 = SAGEConv(64, 2)

  def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [29]:
def evaluate(model, loader):
  model.eval()

  with torch.no_grad():
    for loader_ in loader:
      for data in loader_:
        data = data.to(device)
        output = model(data)
        _, preds = torch.max(output, dim=1)
    
    return preds

In [30]:
conv = ConvNet()
conv.load_state_dict(torch.load('/content/drive/MyDrive/data/conv_model'))

<All keys matched successfully>

In [31]:
gat = GATNet()
gat.load_state_dict(torch.load('/content/drive/MyDrive/data/gat_model'))

<All keys matched successfully>

In [32]:
transformer = TransformerNet()
transformer.load_state_dict(torch.load('/content/drive/MyDrive/data/transformer_model'))

<All keys matched successfully>

In [33]:
sage = SageNet()
sage.load_state_dict(torch.load('/content/drive/MyDrive/data/sage_model'))

<All keys matched successfully>

In [34]:
models_dict = {
    'GCNConv': conv,
    'GATConv': gat,
    'TransformerConv': transformer,
    'SAGEConv': sage
}

In [36]:
def get_graph_data_loader(size, probs, with_permutations):
  graph = nx.stochastic_block_model(size, probs, seed=0)

  embeddings = get_graphs_embeddings([graph], with_permutations)

  dataset = GraphDataset([graph], embeddings)

  dataset_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

  return graph, dataset_loader

# Perform clustering 📖

## Helpers


In [37]:
CLUSTERS_PROBS_TEST_3 = [[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]]

In [38]:
CLUSTERS_SIZES_TEST_3 = [[20, 50, 30],
                         [150, 40, 110],
                         [300, 150, 50],
                         [150, 450, 90],
                         [450, 300, 200],
                         [500, 400, 300]]

In [39]:
CLUSTERS_PROBS_TEST_5 = [[0.75, 0.05, 0.02, 0.01, 0.05], 
             [0.05, 0.85, 0.07, 0.08, 0.01], 
             [0.02, 0.07, 0.90, 0.03, 0.04],
             [0.01, 0.08, 0.03, 0.65, 0.10],
             [0.05, 0.01, 0.04, 0.10, 0.95]]

In [40]:
CLUSTERS_SIZES_TEST_5 = [
                    [10, 100, 15, 40, 15],
                    [100, 70, 10, 30, 20],
                    [200, 100, 150, 50, 90],
                    [220, 50, 260, 120, 70],
                    [200, 250, 100, 300, 90],
                    [200, 450, 250, 250, 350]
                    ]

In [41]:
CLUSTERS_PROBS_TEST_10 = [[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
            [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
            [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
            [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
            [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
            [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
            [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
            [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
            [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
            [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]]

In [42]:
CLUSTERS_SIZES_TEST_10 = [
                    [30, 10, 20, 15, 5, 10, 40, 30, 50, 10],
                    [10, 100, 15, 40, 15, 90, 60, 50, 110, 20],
                    [100, 70, 10, 30, 20, 150, 90, 80, 170, 50],
                    [220, 50, 260, 120, 70, 40, 105, 15, 25, 75],
                    [200, 100, 100, 100, 110, 90, 80, 120, 150, 50],
                    [300, 100, 200, 150, 50, 250, 150, 100, 50, 50]
                    ]

In [43]:
def evaluate_graphs_nodes_clustering(models, sizes, probs, with_permutations=False):
  metrics_f1 = {
    'GCNConv': [],
    'GATConv': [],
    'TransformerConv': [],
    'SAGEConv': []
  }

  metrics_rand = {
    'GCNConv': [],
    'GATConv': [],
    'TransformerConv': [],
    'SAGEConv': []
  }

  graphs_test = []

  for size in sizes:
    graph, dataset_loader = get_graph_data_loader(size, probs, with_permutations)
    graphs_test.append(graph)
    for model_name, model in models.items():     
      nodes_classification = evaluate(model, dataset_loader)
      pred_clusters = list(clustering_pipeline(graph, nodes_classification.numpy(), with_permutations).values())
      true_clusters = list(get_graph_true_clusters(graph).values())

      f1 = f1_score(true_clusters, pred_clusters, average='weighted')
      rand_ = rand_score(true_clusters, pred_clusters)

      metrics_f1[model_name].append(f1)
      metrics_rand[model_name].append(rand_)

  graphs_nodes_amount = [len(graph.nodes()) for graph in graphs_test]

  metrics_f1_df = pd.DataFrame(metrics_f1, index=graphs_nodes_amount)
  metrics_rand_df = pd.DataFrame(metrics_rand, index=graphs_nodes_amount)

  return metrics_f1_df, metrics_rand_df

## Clusters = 3

In [44]:
metrics_f1_3, metrics_rand_3 = evaluate_graphs_nodes_clustering(models_dict, 
                                                                CLUSTERS_SIZES_TEST_3, 
                                                                CLUSTERS_PROBS_TEST_3,
                                                                True)

In [45]:
metrics_f1_3

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
100,0.956698,0.942866,0.938826,0.952658
300,0.41008,0.420837,0.420837,0.462676
500,1.0,1.0,1.0,0.003987
690,0.559006,0.005721,1.0,1.0
950,1.0,0.355263,0.998947,0.39325
1200,0.297619,0.297619,0.297619,0.456949


In [46]:
metrics_rand_3

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
100,0.965051,0.946869,0.954949,0.974343
300,0.598239,0.600201,0.600201,0.613913
500,1.0,1.0,1.0,0.997603
690,0.716035,0.998755,1.0,1.0
950,1.0,0.700516,0.998338,0.700185
1200,0.721991,0.721991,0.721991,0.721853


In [47]:
fig = px.line(metrics_f1_3, 
              title="Метрика F1 score моделей для графов с 3 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                        yaxis_title="Значение метрики")   
fig.show()

In [48]:
fig = px.line(metrics_rand_3, 
              title="Метрика Rand index моделей для графов с 3 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                          yaxis_title="Значение метрики")   
fig.show()

## Clusters = 5

In [49]:
metrics_f1_5, metrics_rand_5 = evaluate_graphs_nodes_clustering(models_dict, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [50]:
metrics_f1_5

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
180,0.014242,0.013226,0.776064,0.473098
230,0.184679,0.315372,0.196726,0.18664
590,0.341104,0.248331,0.246533,0.090369
720,0.009218,0.677072,0.689522,0.13284
940,0.256419,0.450873,0.451616,0.0068
1500,0.002369,0.11321,0.460444,0.113655


In [51]:
metrics_rand_5

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
180,0.672377,0.680323,0.955804,0.663563
230,0.692538,0.663831,0.380786,0.355269
590,0.768588,0.773146,0.775546,0.769117
720,0.923431,0.918131,0.930386,0.738607
940,0.81798,0.806276,0.807969,0.71711
1500,0.714216,0.803516,0.804314,0.80352


In [52]:
fig = px.line(metrics_f1_5, 
              title="Метрика F1 score моделей для графов с 5 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                        yaxis_title="Значение метрики")   
fig.show()

In [53]:
fig = px.line(metrics_rand_5, 
              title="Метрика Rand index моделей для графов с 5 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                          yaxis_title="Значение метрики")   
fig.show()

## Clusters = 10

In [54]:
metrics_f1_10, metrics_rand_10 = evaluate_graphs_nodes_clustering(models_dict, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [55]:
metrics_f1_10

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
220,0.062862,0.051715,0.017269,0.043729
510,0.004257,0.03007,0.014656,0.015759
770,0.009363,0.055976,0.009403,0.005137
980,0.009753,0.21392,0.406458,0.007847
1100,0.036884,0.025422,0.149253,0.039846
1400,0.005674,0.015106,0.382053,0.315261


In [56]:
metrics_rand_10

Unnamed: 0,GCNConv,GATConv,TransformerConv,SAGEConv
220,0.61785,0.619386,0.532877,0.569157
510,0.479733,0.666559,0.443376,0.765838
770,0.718998,0.734359,0.729563,0.733228
980,0.797255,0.629858,0.708974,0.799823
1100,0.713331,0.613611,0.624138,0.754147
1400,0.71936,0.548023,0.754241,0.750875


In [57]:
fig = px.line(metrics_f1_10, 
              title="Метрика F1 score моделей для графов с 10 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                         yaxis_title="Значение метрики")   
fig.show()

In [58]:
fig = px.line(metrics_rand_10, 
              title="Метрика Rand index моделей для графов с 10 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                           yaxis_title="Значение метрики")   
fig.show()

# CD Lib comparison 🍃

In [59]:
!pip install -qq cdlib

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.6/228.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.3/14.3 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m76.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/174.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyclustering (se

In [60]:
from cdlib import algorithms

Note: to be able to use all crisp methods, you need to install some additional packages:  {'leidenalg', 'wurlitzer', 'karateclub', 'infomap', 'graph_tool'}
Note: to be able to use all overlapping methods, you need to install some additional packages:  {'karateclub', 'ASLPAw'}
Note: to be able to use all bipartite methods, you need to install some additional packages:  {'leidenalg', 'infomap', 'wurlitzer'}


In [61]:
cd_lib_models = {
    'louvain': algorithms.louvain,
    'greedy_modularity': algorithms.greedy_modularity,
    'walktrap': algorithms.walktrap
}

In [62]:
def get_cd_lib_pred_clusters(com_partition):
  clusters_dict = {}

  for i in range(len(com_partition)):
    for node in com_partition[i]:
      clusters_dict[node] = i

  return clusters_dict

In [63]:
def perform_cd_lib_clustering(models, sizes, probs):
  metrics_f1 = {
    'louvain': [],
    'greedy_modularity': [],
    'walktrap': []
  }

  metrics_rand = {
    'louvain': [],
    'greedy_modularity': [],
    'walktrap': []
  }

  graphs_test = []

  for size in sizes:
    graph = nx.stochastic_block_model(size, probs, seed=0)
    graphs_test.append(graph)
    for model_name, model in models.items():     
      communities = model(graph)
      pred_clusters = list(get_cd_lib_pred_clusters(communities.communities).values())
      true_clusters = list(get_graph_true_clusters(graph).values())

      f1 = f1_score(true_clusters, pred_clusters, average='weighted')
      rand_ = rand_score(true_clusters, pred_clusters)

      metrics_f1[model_name].append(f1)
      metrics_rand[model_name].append(rand_)

  graphs_nodes_amount = [len(graph.nodes()) for graph in graphs_test]

  metrics_f1_df = pd.DataFrame(metrics_f1, index=graphs_nodes_amount)
  metrics_rand_df = pd.DataFrame(metrics_rand, index=graphs_nodes_amount)

  return metrics_f1_df, metrics_rand_df

## Clusters = 3

In [64]:
cd_lib_metrics_f1_3, cd_lib_metrics_rand_3 = perform_cd_lib_clustering(cd_lib_models, 
                                                                       CLUSTERS_SIZES_TEST_3, 
                                                                       CLUSTERS_PROBS_TEST_3)

In [65]:
cd_lib_metrics_f1_3

Unnamed: 0,louvain,greedy_modularity,walktrap
100,0.604286,0.604286,0.604286
300,0.766667,0.766667,0.766667
500,1.0,1.0,1.0
690,0.565217,0.565217,0.565217
950,1.0,1.0,1.0
1200,1.0,1.0,1.0


In [66]:
cd_lib_metrics_rand_3

Unnamed: 0,louvain,greedy_modularity,walktrap
100,0.676768,0.676768,0.676768
300,0.875139,0.875139,0.875139
500,1.0,1.0,1.0
690,0.621379,0.621379,0.621379
950,1.0,1.0,1.0
1200,1.0,1.0,1.0


In [None]:
fig = px.line(cd_lib_metrics_f1_3, 
              title="Метрика F1 score CD Lib моделей для графов с 3 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                               yaxis_title="Значение метрики")   
fig.show()

In [67]:
fig = px.line(cd_lib_metrics_rand_3, 
              title="Метрика Rand index CD Lib моделей для графов с 3 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                                 yaxis_title="Значение метрики")   
fig.show()

## Clusters = 5

In [69]:
cd_lib_metrics_f1_5, cd_lib_metrics_rand_5 = perform_cd_lib_clustering(cd_lib_models, 
                                                                       CLUSTERS_SIZES_TEST_5, 
                                                                       CLUSTERS_PROBS_TEST_5)

In [70]:
cd_lib_metrics_f1_5

Unnamed: 0,louvain,greedy_modularity,walktrap
180,0.081786,0.009259,0.236941
230,0.753623,0.718841,0.753623
590,0.661212,0.661212,0.661212
720,0.365484,0.330021,0.330021
940,0.585106,0.460052,0.585106
1500,0.535082,0.535082,0.535082


In [71]:
cd_lib_metrics_rand_5

Unnamed: 0,louvain,greedy_modularity,walktrap
180,0.807573,0.849472,0.816884
230,0.969622,0.950636,0.969622
590,0.911945,0.911945,0.911945
720,0.875599,0.87444,0.87444
940,0.841389,0.820996,0.841389
1500,0.804314,0.804314,0.804314


In [72]:
fig = px.line(cd_lib_metrics_f1_5, 
              title="Метрика F1 score CD Lib моделей для графов с 5 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                               yaxis_title="Значение метрики")   
fig.show()

In [73]:
fig = px.line(cd_lib_metrics_rand_5, 
              title="Метрика Rand index CD Lib моделей для графов с 5 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                                 yaxis_title="Значение метрики")   
fig.show()

## Clusters = 10


In [74]:
cd_lib_metrics_f1_10, cd_lib_metrics_rand_10 = perform_cd_lib_clustering(cd_lib_models, 
                                                                         CLUSTERS_SIZES_TEST_10, 
                                                                         CLUSTERS_PROBS_TEST_10)

In [75]:
cd_lib_metrics_f1_10

Unnamed: 0,louvain,greedy_modularity,walktrap
220,0.07177,0.058862,0.096257
510,0.00304,0.001783,0.003268
770,0.0702,0.07662,0.092764
980,0.219581,0.147426,0.220245
1100,0.13986,0.132231,0.13986
1400,0.171429,0.160514,0.171429


In [76]:
cd_lib_metrics_rand_10

Unnamed: 0,louvain,greedy_modularity,walktrap
220,0.826692,0.700125,0.852636
510,0.858338,0.784853,0.887515
770,0.80815,0.825109,0.865232
980,0.897699,0.745992,0.928134
1100,0.875258,0.837159,0.890148
1400,0.836618,0.816043,0.816195


In [77]:
fig = px.line(cd_lib_metrics_f1_10, 
              title="Метрика F1 score CD Lib моделей для графов с 10 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                                yaxis_title="Значение метрики")   
fig.show()

In [78]:
fig = px.line(cd_lib_metrics_rand_10, 
              title="Метрика Rand index CD Lib моделей для графов с 10 кластерами").update_layout(xaxis_title="Количество узлов",
                                                                                                  yaxis_title="Значение метрики")   
fig.show()