# Import ⏳

In [1]:
!pip install -qq --no-cache-dir node2vec

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -qq catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import random
import tqdm

import networkx as nx
import pandas as pd
import numpy as np
import plotly.express as px

from IPython.core.display import display, HTML
from node2vec import Node2Vec
from itertools import combinations
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from IPython.display import Javascript

In [4]:
pip install -qq transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m120.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
from transformers import get_linear_schedule_with_warmup

In [6]:
random.seed(2023)

# SBM graphs generation 🫐

In [7]:
AMOUNT_GRAPHS = 20

MIN_CLUSTER_SIZE = 30
MAX_CLUSTER_SIZE = 200

In [8]:
PROBS_3_CLUSTERS = [[[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]],
                    [[0.2, 0.001, 0.05], [0.001, 0.35, 0.075], [0.05, 0.075, 0.40]],
                    [[0.25, 0.05, 0.02], [0.05, 0.35, 0.07], [0.02, 0.07, 0.40]],
                    [[0.65, 0.05, 0.001], [0.05, 0.7, 0.15], [0.001, 0.15, 0.85]],
                    [[0.35, 0.15, 0.0002], [0.15, 0.45, 0.0075], [0.0002, 0.0075, 0.50]],
                    [[0.1, 0.60, 0.55], [0.60, 0.25, 0.80], [0.55, 0.80, 0.30]]]

In [9]:
PROBS_5_CLUSTERS = [[[0.75, 0.05, 0.02, 0.01, 0.05], [0.05, 0.85, 0.07, 0.08, 0.01], [0.02, 0.07, 0.90, 0.03, 0.04], [0.01, 0.08, 0.03, 0.65, 0.10], [0.05, 0.01, 0.04, 0.10, 0.95]],
                    [[0.25, 0.15, 0.35, 0.10, 0.55], [0.15, 0.45, 0.25, 0.10, 0.17], [0.35, 0.25, 0.50, 0.25, 0.25], [0.10, 0.10, 0.25, 0.65, 0.15], [0.55, 0.17, 0.25, 0.15, 0.35]],
                    [[0.05, 0.45, 0.35, 0.55, 0.65], [0.45, 0.15, 0.37, 0.88, 0.20], [0.35, 0.37, 0.20, 0.40, 0.50], [0.55, 0.88, 0.40, 0.35, 0.80], [0.65, 0.20, 0.50, 0.80, 0.17]],
                    [[0.95, 0.15, 0.05, 0.03, 0.19], [0.15, 0.45, 0.07, 0.18, 0.40], [0.05, 0.07, 0.80, 0.02, 0.15], [0.03, 0.18, 0.02, 0.75, 0.01], [0.19, 0.40, 0.15, 0.01, 0.57]],
                    [[0.55, 0.33, 0.25, 0.07, 0.39], [0.33, 0.38, 0.17, 0.58, 0.25], [0.25, 0.17, 0.63, 0.35, 0.40], [0.07, 0.58, 0.35, 0.48, 0.28], [0.39, 0.25, 0.40, 0.28, 0.98]],
                    [[1.00, 0.75, 0.55, 0.30, 0.65], [0.75, 0.88, 0.77, 0.98, 0.56], [0.55, 0.77, 0.93, 0.62, 0.45], [0.30, 0.98, 0.62, 0.56, 0.67], [0.65, 0.56, 0.45, 0.67, 0.86]]]

In [10]:
PROBS_10_CLUSTERS = [[[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
                     [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
                     [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
                     [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
                     [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
                     [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
                     [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
                     [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
                     [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
                     [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]],
                     # 2
                     [[0.99, 0.15, 0.25, 0.05, 0.55, 0.375, 0.2, 0.03, 0.125, 0.001], 
                     [0.15, 0.57, 0.47, 0.89, 0.31, 0.11, 0.13, 0.002, 0.14, 0.122], 
                     [0.25, 0.47, 0.33, 0.06, 0.45, 0.023, 0.099, 0.27, 0.033, 0.70],
                     [0.05, 0.89, 0.06, 0.95, 0.40, 0.075, 0.101, 0.25, 0.26, 0.031],
                     [0.55, 0.31, 0.45, 0.40, 0.53, 0.105, 0.47, 0.869, 0.177, 0.099],
                     [0.375, 0.11, 0.023, 0.075, 0.105, 0.748, 0.023, 0.019, 0.0089, 0.25],
                     [0.2, 0.13, 0.099, 0.101, 0.47, 0.023, 0.86, 0.33, 0.49, 0.0502],
                     [0.03, 0.002, 0.27, 0.25, 0.869, 0.019, 0.33, 0.678, 0.501, 0.15],
                     [0.125, 0.14, 0.033, 0.26, 0.177, 0.0089, 0.49, 0.501, 0.745, 0.001],
                     [0.001, 0.122, 0.70, 0.031, 0.099, 0.25, 0.0502, 0.15, 0.001, 0.89995]]]

In [11]:
def generate_sbm_graph(sizes, probs):
  G = nx.stochastic_block_model(sizes, probs, seed=0)
  amount_clusters = len(G.graph["partition"])

  hex_color = lambda: random.randint(0, 255)
  colors = ['#%02X%02X%02X' % (hex_color(), hex_color(), hex_color()) for i in range(amount_clusters)]

  color_attributes = {}

  title_attributes = {}

  for i in range(amount_clusters):
    for node in G.graph["partition"][i]:
      color_attributes[node] = colors[i]
      title_attributes[node] = str(i)

  nx.set_node_attributes(G, color_attributes, name="color")
  nx.set_node_attributes(G, title_attributes, name="title")

  return G

In [12]:
def generate_graphs_with_props(cluster_sizes_list, cluster_probs):
  graphs = []

  for cluster_sizes in cluster_sizes_list:
    G = generate_sbm_graph(cluster_sizes, cluster_probs)
    graphs.append(G)

  return graphs

In [13]:
def get_clusters_probs(amount_clusters):
  if amount_clusters == 3:
    return PROBS_3_CLUSTERS
  elif amount_clusters == 5:
    return PROBS_5_CLUSTERS
  elif amount_clusters == 10:
    return PROBS_10_CLUSTERS
  else:
    return []

In [14]:
def generate_graphs_with_random_props(amount_clusters):
  graphs = []
  clusters_probs = get_clusters_probs(amount_clusters)
  amount_graphs = int(AMOUNT_GRAPHS / 2) if amount_clusters == 10 else AMOUNT_GRAPHS

  for i in range(amount_graphs):
    cluster_sizes = [random.randint(MIN_CLUSTER_SIZE, MAX_CLUSTER_SIZE) for j in range(amount_clusters)]
    probs_index = random.randint(0, len(clusters_probs) - 1)

    G = nx.stochastic_block_model(cluster_sizes, clusters_probs[probs_index])
    graphs.append(G)

  return graphs

In [15]:
def get_graph_true_clusters(G):
  clusters_dict = {}

  for i in range(len(G.graph["partition"])):
    for node in G.graph["partition"][i]:
      clusters_dict[node] = i

  return clusters_dict

# Embeddings 🍪

In [16]:
def generate_node2vec_embeddings(G):
  node2vec = Node2Vec(G, dimensions=8, walk_length=5, num_walks=5, workers=1)

  # Embed nodes
  model = node2vec.fit(window=3, min_count=1, batch_words=4)  

  vector_dict = {}
  for word in model.wv.index_to_key: 
      vector_dict[int(word)] = model.wv.get_vector(word)

  return vector_dict

# Prepare train data 🧘

In [17]:
def get_k_nearest_neighbors(G, node_cluster, node_embedding, graph_embeddings_dict, k = 100):
  distances = {}
  for i in range(len(G.graph["partition"])):
    if i != node_cluster:
      for another_node in G.graph["partition"][i]:
        # считаем расстояние между эмеддингами
        another_node_embedding = graph_embeddings_dict[another_node]
        distance = np.linalg.norm(node_embedding - another_node_embedding)
        distances[another_node] = distance
      # сортируем расстояния в порядке возрастания
      distances = dict(sorted(distances.items(), key=lambda item: item[1]))
      # берем k вершин, с минимальным расстоянием до рассматриваемой
      min_nodes = list(distances.keys())[:k]

  return min_nodes

In [18]:
def create_embeddings_pairs(G):
  searched_nodes_dict = {}

  true_clusters = get_graph_true_clusters(G)
  graph_embeddings_dict = generate_node2vec_embeddings(G)

  # пары вершин для обучения в виде [эмбеддинг 1, эмбеддинг 2, в одном кластере или нет (1/0)]
  accepted_combinations = []
  combination_target = []

  for nodes_pair in combinations(G.nodes(), 2):
    first_node = nodes_pair[0]
    second_node = nodes_pair[1]
    # вершины в одном кластере
    if true_clusters[first_node] == true_clusters[second_node]:
      res = np.concatenate((graph_embeddings_dict[first_node], graph_embeddings_dict[second_node]), axis=0) 
      accepted_combinations.append(res)
      combination_target.append(1)
    # находим для рассмотренных вершин ближайших соседей из других кластеров
    for node in nodes_pair:
      if node not in searched_nodes_dict:
        searched_nodes_dict[node] = True
        node_nearest_neighbors = get_k_nearest_neighbors(G, 
                                                        true_clusters[node], 
                                                        graph_embeddings_dict[node],
                                                        graph_embeddings_dict)
        # получаем эмбеддинги для ближайших соседей
        for nearest_neighbor in node_nearest_neighbors:
          res = np.concatenate((graph_embeddings_dict[node], graph_embeddings_dict[nearest_neighbor]), axis=0) 
          accepted_combinations.append(res)
          combination_target.append(0)

  return accepted_combinations, combination_target

In [19]:
def create_graphs_data(graphs):
  graphs_combinations = []
  graphs_targets = []

  for G in graphs:
    combinations, targets = create_embeddings_pairs(G)

    graphs_combinations += combinations
    graphs_targets += targets

  return graphs_combinations, graphs_targets

# Prepare test data 🔥

In [20]:
def prepare_test_clustering_embeddings(G):
  nodes_combinations = []
  combinations_target = []

  graph_embeddings_dict = generate_node2vec_embeddings(G)
  true_clusters = get_graph_true_clusters(G)

  for nodes_pair in combinations(G.nodes(), 2):
    first_node = nodes_pair[0]
    second_node = nodes_pair[1]

    res = np.concatenate((graph_embeddings_dict[first_node], graph_embeddings_dict[second_node]), axis=0) 
    nodes_combinations.append(res)

    if true_clusters[first_node] == true_clusters[second_node]:
      combinations_target.append(1)
    else:
      combinations_target.append(0)
    

  return nodes_combinations, combinations_target

In [21]:
def create_test_graphs_data(graphs):
  graphs_combinations = []
  graphs_targets = []

  for G in graphs:
    combinations, targets = prepare_test_clustering_embeddings(G)

    graphs_combinations.append(combinations)
    graphs_targets.append(targets)

  return graphs_combinations, graphs_targets

In [22]:
def get_test_models_scores(combinations, targets, model):
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []

  for combination, target in zip(combinations, targets):
    pred = model.predict(combination)
    precision_scores.append(precision_score(target, pred, zero_division=0))
    recall_scores.append(recall_score(target, pred, zero_division=0))
    roc_auc_scores.append(roc_auc_score(target, pred))

  return precision_scores, recall_scores, roc_auc_scores

# Experiments Classical 💥

In [24]:
NUM_EXPERIMENTS = 10

In [23]:
def get_average_scores(model, amount_test_graphs, embeddings, targets):
  precision_scores = [0.0 for i in range(amount_test_graphs)]
  recall_scores = [0.0 for i in range(amount_test_graphs)]
  roc_auc_scores = [0.0 for i in range(amount_test_graphs)]

  for i in range(NUM_EXPERIMENTS):
    precision_scores_, recall_scores_, roc_auc_scores_ = get_test_models_scores(embeddings, targets, model)
    precision_scores = [sum(i) for i in zip(precision_scores, precision_scores_)]
    recall_scores = [sum(i) for i in zip(recall_scores, recall_scores_)]  
    roc_auc_scores = [sum(i) for i in zip(roc_auc_scores, roc_auc_scores_)]

  precision_scores = [score / NUM_EXPERIMENTS for score in precision_scores]
  recall_scores = [score / NUM_EXPERIMENTS for score in recall_scores]
  roc_auc_scores = [score / NUM_EXPERIMENTS for score in roc_auc_scores]

  return precision_scores, recall_scores, roc_auc_scores

In [25]:
def get_train_df(graphs_train):
  graphs_combinations_train, graphs_targets_train = create_graphs_data(graphs_train)

  graphs_combinations_train_df = pd.DataFrame(graphs_combinations_train)
  graphs_targets_train_df = pd.DataFrame(graphs_targets_train, columns=['target'])

  graphs_train_df = pd.concat([graphs_combinations_train_df, graphs_targets_train_df], axis=1)
  graphs_train_df = graphs_train_df.sample(frac=1).reset_index(drop=True)

  graphs_combinations_train = graphs_train_df.drop(['target'], axis=1)
  graphs_targets_train = graphs_train_df['target']

  return graphs_train_df, graphs_combinations_train, graphs_targets_train

In [26]:
def train_models(models, graphs_combinations_train, graphs_targets_train):
  models_trained = []
  for model in models:
    model.fit(np.array(graphs_combinations_train), graphs_targets_train)
    models_trained.append(model)
  return models_trained

In [27]:
def get_test_df(graphs_test):
    # graphs_test = generate_graphs_with_props(test_cluster_sizes, probs)
    graphs_test_embeddings, graphs_test_targets = create_test_graphs_data(graphs_test)

    return graphs_test, graphs_test_embeddings, graphs_test_targets

In [28]:
def get_metrics_df(metrics, graphs_nodes_amount):
  return pd.DataFrame(np.transpose(metrics), 
                      index=graphs_nodes_amount, 
                      columns=['XGB', 'Catboost'])

In [29]:
def test_models(models, graphs_test, graphs_test_embeddings, graphs_test_targets):
  precision_scores = []
  recall_scores = [] 
  roc_auc_scores = []

  graphs_nodes_amount = [len(graph.nodes()) for graph in graphs_test]

  for model in models:
    precision_score, recall_score, roc_auc_score = get_average_scores(model, 
                                                                      len(graphs_test), 
                                                                      graphs_test_embeddings, 
                                                                      graphs_test_targets)
    
    precision_scores.append(precision_score)
    recall_scores.append(recall_score)
    roc_auc_scores.append(roc_auc_score)


  precision_scores_df = get_metrics_df(precision_scores, graphs_nodes_amount)                                
  recall_scores_df = get_metrics_df(recall_scores, graphs_nodes_amount)
  roc_auc_scores_df = get_metrics_df(roc_auc_scores, graphs_nodes_amount)

  return precision_scores_df, recall_scores_df, roc_auc_scores_df

In [30]:
def perform_classical_experiments_training(models, train_graphs):
  graphs_train_df, graphs_combinations_train, graphs_targets_train = get_train_df(train_graphs)
  print("================ Finish creating train data ================")
  models_trained = train_models(models, graphs_combinations_train, graphs_targets_train)
  print("================ Finish training models ================")
  
  return graphs_train_df, models_trained

In [31]:
def perform_classical_experiments_testing(models_trained, test_graphs):
  graphs_test, graphs_test_embeddings, graphs_test_targets = get_test_df(test_graphs)
  print("================ Finish creating test data ================")
  precision_scores_df, recall_scores_df, roc_auc_scores_df = test_models(models_trained, 
                                                                         graphs_test, 
                                                                         graphs_test_embeddings, 
                                                                         graphs_test_targets)
  
  return precision_scores_df, recall_scores_df, roc_auc_scores_df

In [32]:
xgb = XGBClassifier(n_estimators=3, max_depth=2, learning_rate=1, objective='binary:logistic')

catboost_ = CatBoostClassifier(verbose=False)

In [33]:
MODELS = [xgb, catboost_]

## Training

In [34]:
graphs_train_3 = generate_graphs_with_random_props(3)
graphs_train_5 = generate_graphs_with_random_props(5)
graphs_train_10 = generate_graphs_with_random_props(10)

graphs_train_mix = graphs_train_3 + graphs_train_5 + graphs_train_10

In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

train_df, models_trained = perform_classical_experiments_training(MODELS, graphs_train_mix)

In [36]:
train_df.to_csv('/content/drive/MyDrive/data/mix_clusters_train.csv')

## Testing

#### Clusters = 3

In [32]:
CLUSTERS_PROBS_TEST_3 = [[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]]

In [33]:
CLUSTERS_SIZES_TEST_3 = [[10, 10, 10], 
                         [30, 10, 20],
                         [20, 50, 30],
                         [10, 100, 50],
                         [100, 100, 10],
                         [150, 40, 110],
                         [300, 150, 50],
                         [200, 80, 260],
                         [250, 250, 100],
                         [150, 450, 90]]

In [40]:
graphs_test_3 = generate_graphs_with_props(CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [None]:
precision_scores_df_3,\
 recall_scores_df_3,\
  roc_auc_scores_df_3 = perform_classical_experiments_testing(models_trained, 
                                                              graphs_test_3)                                                                                        

In [42]:
fig = px.line(precision_scores_df_3, title="Precision score (3 кластера)").update_layout(xaxis_title="Количество узлов",
                                                                                         yaxis_title="Precision score")   
fig.show()

In [43]:
fig = px.line(recall_scores_df_3, title="Recall score (3 кластера)").update_layout(xaxis_title="Количество узлов",
                                                                                   yaxis_title="Recall score")   
fig.show()

In [44]:
fig = px.line(roc_auc_scores_df_3, title="Roc AUC score (3 кластера)").update_layout(xaxis_title="Количество узлов",
                                                                                     yaxis_title="Recall score")   
fig.show()

#### Clusters = 5

In [34]:
CLUSTERS_PROBS_TEST_5 = [[0.75, 0.05, 0.02, 0.01, 0.05], 
             [0.05, 0.85, 0.07, 0.08, 0.01], 
             [0.02, 0.07, 0.90, 0.03, 0.04],
             [0.01, 0.08, 0.03, 0.65, 0.10],
             [0.05, 0.01, 0.04, 0.10, 0.95]]

In [35]:
CLUSTERS_SIZES_TEST_5 = [[5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5],
                    [20, 50, 30, 40, 10],
                    [10, 100, 15, 40, 15],
                    [100, 70, 10, 30, 20],
                    [140, 20, 100, 40, 50],
                    [200, 100, 150, 50, 90],
                    [220, 50, 260, 120, 70],
                    [200, 250, 100, 300, 90],
                    [100, 440, 90, 210, 140]]

In [47]:
graphs_test_5 = generate_graphs_with_props(CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [None]:
precision_scores_df_5,\
 recall_scores_df_5,\
  roc_auc_scores_df_5 = perform_classical_experiments_testing(models_trained, 
                                                              graphs_test_5)                                                                                        

In [49]:
fig = px.line(precision_scores_df_5, title="Precision score (5 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                         yaxis_title="Precision score")   
fig.show()

In [50]:
fig = px.line(recall_scores_df_5, title="Recall score (5 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                   yaxis_title="Recall score")   
fig.show()

In [51]:
fig = px.line(roc_auc_scores_df_5, title="Roc AUC score (5 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                      yaxis_title="Precision score")   
fig.show()

#### Clusters = 10

In [36]:
CLUSTERS_PROBS_TEST_10 = [[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
            [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
            [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
            [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
            [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
            [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
            [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
            [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
            [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
            [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]]

In [37]:
CLUSTERS_SIZES_TEST_10 = [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20, 10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5, 10, 40, 30, 50, 10],
                    [20, 50, 30, 40, 10, 80, 70, 5, 60, 100],
                    [10, 100, 15, 40, 15, 90, 60, 50, 110, 20],
                    [140, 20, 100, 40, 50, 10, 50, 50, 80, 95],
                    [100, 70, 10, 30, 20, 150, 90, 80, 170, 50],
                    [200, 100, 150, 50, 90, 10, 70, 40, 90, 100],
                    [220, 50, 260, 120, 70, 40, 105, 15, 25, 75],
                    [200, 250, 100, 300, 90, 120, 90, 50, 100, 30],
                    [100, 240, 90, 210, 140, 195, 45, 175, 45, 110]
                    ]

In [54]:
graphs_test_10 = generate_graphs_with_props(CLUSTERS_SIZES_TEST_10, 
                                            CLUSTERS_PROBS_TEST_10)

In [55]:
precision_scores_df_10,\
 recall_scores_df_10,\
  roc_auc_scores_df_10 = perform_classical_experiments_testing(models_trained, 
                                                               graphs_test_10)                                                                                        

Computing transition probabilities:   0%|          | 0/50 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 558.18it/s]


Computing transition probabilities:   0%|          | 0/120 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 215.63it/s]


Computing transition probabilities:   0%|          | 0/220 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 92.23it/s]


Computing transition probabilities:   0%|          | 0/465 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 21.22it/s]


Computing transition probabilities:   0%|          | 0/510 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 27.84it/s]


Computing transition probabilities:   0%|          | 0/635 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 19.88it/s]


Computing transition probabilities:   0%|          | 0/770 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 13.89it/s]


Computing transition probabilities:   0%|          | 0/900 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00, 11.60it/s]


Computing transition probabilities:   0%|          | 0/980 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00,  9.48it/s]


Computing transition probabilities:   0%|          | 0/1330 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00,  5.84it/s]


Computing transition probabilities:   0%|          | 0/1350 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 5/5 [00:00<00:00,  6.30it/s]




In [56]:
fig = px.line(precision_scores_df_10, title="Precision score (10 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                            yaxis_title="Precision score")   
fig.show()

In [57]:
fig = px.line(recall_scores_df_10, title="Recall score (10 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                      yaxis_title="Recall score")   
fig.show()

In [58]:
fig = px.line(roc_auc_scores_df_10, title="Roc AUC score (10 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                        yaxis_title="Recall score")
fig.show()

# Experiments NN 🐒

In [38]:
import torch

from torch import nn
from torch.utils.data import Dataset, DataLoader

import torch.nn.functional as F

from collections import defaultdict

In [39]:
emb_data = pd.read_csv('/content/drive/MyDrive/data/mix_clusters_train.csv', index_col=0)

In [40]:
EMBEDDINGS = 'embeddings'
TARGETS = 'targets'

In [41]:
AMOUNT_CLASSES = 2
EMBEDDING_SHAPE = 16
HID_DIM = 64
DROPOUT = 0.2

In [42]:
BATCH_TRAIN = 256
BATCH_TEST = 128

In [43]:
EPOCHS = 10

In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Data

In [45]:
class GraphEmbeddings(Dataset):
  def __init__(self, embeddings, labels):
    self.embeddings = embeddings
    self.labels = labels

  def __getitem__(self, index):
    embedding = self.embeddings.iloc[[index]]
    label = self.labels.iloc[[index]]

    return {
      EMBEDDINGS: torch.tensor(embedding.values, dtype=torch.float32)[0],
      TARGETS: torch.tensor(label.values, dtype=torch.long)[0]
    }

  def __len__(self):
    return len(self.embeddings)

In [46]:
def get_data_tensor(dataset):  
  X_train, X_rem, y_train, y_rem = train_test_split(dataset.drop(['target'], axis=1), 
                                                      dataset['target'],
                                                      test_size=0.2,
                                                      random_state=2023) 
  X_val, X_test, y_val, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=2023)

  train_data = GraphEmbeddings(X_train, y_train)
  val_data = GraphEmbeddings(X_val, y_val)
  test_data = GraphEmbeddings(X_test, y_test)

  train_loader = DataLoader(train_data, batch_size=BATCH_TRAIN, num_workers=2, shuffle=True)
  val_loader = DataLoader(val_data, batch_size=BATCH_TEST, num_workers=2)
  test_loader = DataLoader(test_data, batch_size=BATCH_TEST, num_workers=2)

  print(f'Train size: {len(train_data)}, Val size: {len(val_data)}, Test size: {len(test_data)}')

  return train_loader, val_loader, test_loader

## Nets

In [47]:
class GraphEmbeddingsNN(nn.Module):
  def __init__(self, num_classes, embedding_shape):
    super(GraphEmbeddingsNN, self).__init__()   
    self.num_classes = num_classes
    self.dropout = nn.Dropout(0.6)

    self.fc1 = nn.Linear(embedding_shape, 32)
    self.fc2 = nn.Linear(32, 64)
    self.fc3 = nn.Linear(64, self.num_classes)


  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = self.dropout(x)
    x = F.relu(self.fc2(x))
    x = self.dropout(x)
    x = F.log_softmax(self.fc3(x), dim=1)

    return x

In [77]:
class GraphEmbeddingsGRUNN(nn.Module):
  def __init__(self, num_classes, embedding_shape):
    super(GraphEmbeddingsGRUNN, self).__init__()   
    self.num_classes = num_classes
    self.linear = nn.Linear(16, embedding_shape)
    self.gru = nn.GRU(embedding_shape, HID_DIM, num_layers=2, \
                       bidirectional= True, batch_first=True, dropout=DROPOUT)
    self.dropout = nn.Dropout(0.5)
    self.output = nn.Linear(HID_DIM * 2, self.num_classes)


  def forward(self, x):
    x, _ = self.gru(x)
    x = self.dropout(x)
    x = torch.sigmoid(self.output(x))

    result_out = x.contiguous().view(x.shape[0], -1)

    return result_out

## Helpers

In [49]:
def plot_accuracy_scores(history, metric):
  accuracy_scores = pd.DataFrame([history[f'train_{metric}'], 
                                  history[f'val_{metric}'],
                                  history[f'test_{metric}']], index=['Train', 'Val', 'Test']).T
  accuracy_scores['index'] = [i + 1 for i in range(accuracy_scores.shape[0])]

  accuracy_scores_melt = accuracy_scores.melt(id_vars='index', value_vars=['Train', 'Val', 'Test'])
      
  fig = px.line(accuracy_scores_melt, 
                x='index', y='value', 
                title='Training history', 
                color='variable', labels={'value': 'Score', 'index': 'Epoch'})
  fig.show()

In [50]:
def get_data_properties(example):
  return example[EMBEDDINGS].to(device), example[TARGETS].to(device)

In [51]:
def evaluate(model, test_loader):
  criterion = nn.CrossEntropyLoss().to(device)

  model.eval()

  losses = []
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []

  with torch.no_grad():
    for d in tqdm.notebook.tqdm(test_loader):
      x, y = get_data_properties(d)
      out_s = model(x)
      # out_s = torch.sigmoid(out)
      _, preds = torch.max(out_s, dim=1)
      preds = preds.cpu()
      loss = criterion(out_s, y.long())
      y = y.cpu()

      precision_scores.append(precision_score(y, preds, zero_division=0))
      recall_scores.append(recall_score(y, preds, zero_division=0))
      roc_auc_scores.append(roc_auc_score(y, preds))
      losses.append(loss.item())

  return np.mean(precision_scores), np.mean(recall_scores), np.mean(roc_auc_scores), np.mean(losses)

In [52]:
def train_epoch(model, train_loader):
  model = model.train()

  optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
  # optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-5)
  # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) 
  total_steps = len(train_loader) * EPOCHS
  lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
  loss_fn = nn.CrossEntropyLoss().to(device)
  # loss_fn = nn.BCELoss()

  losses = []
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []
  
  for d in tqdm.notebook.tqdm(train_loader):
    x, y = get_data_properties(d)  
    outputs = model(x)
    _, preds = torch.max(outputs, dim=1)
    preds = preds.cpu()
    loss = loss_fn(outputs, y.long()).cpu()
    y = y.cpu()

    precision_scores.append(precision_score(y, preds, zero_division=0))
    recall_scores.append(recall_score(y, preds, zero_division=0))
    roc_auc_scores.append(roc_auc_score(y, preds))
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

  return np.mean(precision_scores), np.mean(recall_scores), np.mean(roc_auc_scores), np.mean(losses)

In [53]:
def train(model, train_loader, val_loader, test_loader):
  history = defaultdict(list)
  net = model(AMOUNT_CLASSES, EMBEDDING_SHAPE).to(device)

  for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_precision, train_recall, train_roc_auc, train_loss = train_epoch(net, train_loader)
    print(f'Train loss {train_loss} precision {train_precision} recall {train_recall} roc AUC {train_roc_auc}')

    val_precision, val_recall, val_roc_auc, val_loss = evaluate(net, val_loader)
    print(f'Val loss {val_loss} precision {val_precision} recall {val_recall} roc AUC {val_roc_auc}')

    test_precision, test_recall, test_roc_auc, test_loss = evaluate(net, test_loader)
    print(f'Test loss {test_loss} precision {test_precision} recall {test_recall} roc AUC {test_roc_auc}')
    print()

    history['train_precision'].append(train_precision)
    history['train_recall'].append(train_recall)
    history['train_roc_auc'].append(train_roc_auc)
    history['train_loss'].append(train_loss)

    history['val_precision'].append(val_precision)
    history['val_recall'].append(val_recall)
    history['val_roc_auc'].append(val_roc_auc)
    history['val_loss'].append(val_loss)

    history['test_precision'].append(test_precision)
    history['test_recall'].append(test_recall)
    history['test_roc_auc'].append(test_roc_auc)
    history['test_loss'].append(test_loss)

  return history, net

## Testing


In [58]:
def get_nn_metrics_df(precision, recall, roc_auc, graphs_nodes_amount):
  return pd.DataFrame(np.concatenate((np.transpose([precision]), np.transpose([recall]), np.transpose([roc_auc])), axis=1), 
                      index=graphs_nodes_amount,
                      columns=['Precision', 'Recall', 'Roc AUC'])

In [59]:
def perform_nn_testing(model, sizes, probs):
  precision_scores = []
  recall_scores = []
  roc_auc_scores = []
  graphs_test = []
  for size in sizes:
    graph = generate_sbm_graph(size, probs)
    graphs_test.append(graph)
    _, graphs_test_embeddings, graphs_test_targets = get_test_df([graph])

    graphs_test_embeddings_df = pd.DataFrame(graphs_test_embeddings[0])
    graphs_test_targets_df = pd.Series(graphs_test_targets[0])

    batch_size = int(graphs_test_embeddings_df.shape[0] / 2)

    test_data = GraphEmbeddings(graphs_test_embeddings_df, graphs_test_targets_df)
    test_loader = DataLoader(test_data, batch_size=graphs_test_embeddings_df.shape[0], num_workers=2)

    test_precision, test_recall, test_roc_auc, _ = evaluate(model, test_loader)
    
    precision_scores.append(test_precision)
    recall_scores.append(test_recall)
    roc_auc_scores.append(test_roc_auc)

  graphs_nodes_amount = [len(graph.nodes()) for graph in graphs_test]

  metrics_df = get_nn_metrics_df(precision_scores,
                                 recall_scores,
                                 roc_auc_scores,
                                 graphs_nodes_amount)

  return metrics_df

In [60]:
CLUSTERS_PROBS_TEST_3 = [[0.75, 0.015, 0.0002], [0.015, 0.85, 0.0075], [0.0002, 0.0075, 0.90]]

In [61]:
CLUSTERS_SIZES_TEST_3 = [[10, 10, 10], 
                         [30, 10, 20],
                         [20, 50, 30],
                         [10, 100, 50],
                         [100, 100, 10],
                         [150, 40, 110],
                         [300, 150, 50],
                         [200, 80, 260],
                         [250, 250, 100],
                         [150, 450, 90]]

In [62]:
CLUSTERS_PROBS_TEST_5 = [[0.75, 0.05, 0.02, 0.01, 0.05], 
             [0.05, 0.85, 0.07, 0.08, 0.01], 
             [0.02, 0.07, 0.90, 0.03, 0.04],
             [0.01, 0.08, 0.03, 0.65, 0.10],
             [0.05, 0.01, 0.04, 0.10, 0.95]]

In [63]:
CLUSTERS_SIZES_TEST_5 = [[5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5],
                    [20, 50, 30, 40, 10],
                    [10, 100, 15, 40, 15],
                    [100, 70, 10, 30, 20],
                    [140, 20, 100, 40, 50],
                    [200, 100, 150, 50, 90],
                    [220, 50, 260, 120, 70],
                    [200, 250, 100, 300, 90],
                    [100, 440, 90, 210, 140]]

In [64]:
CLUSTERS_PROBS_TEST_10 = [[0.75, 0.05, 0.02, 0.01, 0.05, 0.007, 0.02, 0.3, 0.25, 0.015], 
            [0.05, 0.85, 0.07, 0.08, 0.01, 0.001, 0.03, 0.0, 0.04, 0.02], 
            [0.02, 0.07, 0.90, 0.03, 0.04, 0.03, 0.09, 0.017, 0.003, 0.01],
            [0.01, 0.08, 0.03, 0.65, 0.10, 0.005, 0.10, 0.025, 0.05, 0.1],
            [0.05, 0.01, 0.04, 0.10, 0.95, 0.085, 0.07, 0.06, 0.077, 0.009],
            [0.007, 0.001, 0.03, 0.005, 0.085, 0.70, 0.2, 0.0001, 0.08, 0.0015],
            [0.02, 0.03, 0.09, 0.10, 0.07, 0.2, 0.5, 0.3, 0.19, 0.0002],
            [0.3, 0.0, 0.017, 0.025, 0.06, 0.0001, 0.3, 0.99, 0.001, 0.15],
            [0.25, 0.04, 0.003, 0.05, 0.077, 0.08, 0.19, 0.001, 0.87, 0.035],
            [0.015, 0.02, 0.01, 0.1, 0.009, 0.0015, 0.0002, 0.15, 0.035, 0.85]]

In [65]:
CLUSTERS_SIZES_TEST_10 = [[5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
                    [10, 10, 10, 10, 20, 10, 10, 10, 10, 20],
                    [30, 10, 20, 15, 5, 10, 40, 30, 50, 10],
                    [20, 50, 30, 40, 10, 80, 70, 5, 60, 100],
                    [10, 100, 15, 40, 15, 90, 60, 50, 110, 20],
                    [140, 20, 100, 40, 50, 10, 50, 50, 80, 95],
                    [100, 70, 10, 30, 20, 150, 90, 80, 170, 50],
                    [200, 100, 150, 50, 90, 10, 70, 40, 90, 100],
                    [220, 50, 260, 120, 70, 40, 105, 15, 25, 75],
                    [200, 250, 100, 300, 90, 120, 90, 50, 100, 30],
                    [100, 240, 90, 210, 140, 195, 45, 175, 45, 110]
                    ]

## Linear NN

### Training

In [47]:
train_loader, val_loader, test_loader = get_data_tensor(emb_data)

Train size: 4224208, Val size: 528026, Test size: 528026


In [48]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history, net_linear = train(GraphEmbeddingsNN, train_loader, val_loader, test_loader)

<IPython.core.display.Javascript object>

Epoch 1/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.6481195767246141 precision 0.5258311776132336 recall 0.17893650695296792 roc AUC 0.5630132027848385


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5585038230234827 precision 0.8504912118158474 recall 0.4588742257362324 roc AUC 0.7011334791105812


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5583951854107742 precision 0.8493792410651049 recall 0.4603937465602484 roc AUC 0.7014527625528733

Epoch 2/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5966471721024869 precision 0.7421500605402906 recall 0.38201918031492405 roc AUC 0.6444801237417138


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5392976306591921 precision 0.9223626957698501 recall 0.4378960269352529 roc AUC 0.7060209452988935


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5392922158025736 precision 0.9230792408782226 recall 0.43723865836263676 roc AUC 0.7057685840211001

Epoch 3/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.589712414381887 precision 0.7573553791420566 recall 0.3838149846450273 roc AUC 0.6487681147575469


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5317506432692172 precision 0.9285351841606457 recall 0.4546180217498187 roc AUC 0.7150687554793147


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5319119707956166 precision 0.929342206011684 recall 0.45383141734113813 roc AUC 0.7147808741076588

Epoch 4/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5870033573341416 precision 0.7681527303431138 recall 0.37953145834204033 roc AUC 0.6495922777792994


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5301461518460896 precision 0.9474906484301063 recall 0.4426893324005291 roc AUC 0.712762807418093


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5302190366903093 precision 0.947238861435044 recall 0.44186071845701064 roc AUC 0.7122754471254609

Epoch 5/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5854815657276377 precision 0.773455781595511 recall 0.37856719159640506 roc AUC 0.6503828013855939


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5294952537111444 precision 0.9467062637591361 recall 0.4526348327339003 roc AUC 0.7174040315576834


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5295171987949733 precision 0.947594644756654 recall 0.4518842604325594 roc AUC 0.7171423818175275

Epoch 6/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5848196855505831 precision 0.7770812479645991 recall 0.3762626448224426 roc AUC 0.65028786279147


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5316427973682766 precision 0.9663079268659113 recall 0.4179712522676533 roc AUC 0.7039061699456194


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5317257498790497 precision 0.9669507064291853 recall 0.41693877808097624 roc AUC 0.7034813864631273

Epoch 7/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5833771395077164 precision 0.7791813092488521 recall 0.3779709855436823 roc AUC 0.6514432946511385


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5226094238514237 precision 0.9548956177086266 recall 0.4550554396395423 roc AUC 0.7199898670704341


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5226719918987212 precision 0.955719332055592 recall 0.4545252235675442 roc AUC 0.7198732945647821

Epoch 8/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5829529356096059 precision 0.780515175658533 recall 0.3777614550211285 roc AUC 0.6516205118772407


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.52679441115172 precision 0.9605973101894039 recall 0.44553011664611925 roc AUC 0.7163675171176495


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5268022399892289 precision 0.9616186197335465 recall 0.44496893511307256 roc AUC 0.7162692765870848

Epoch 9/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5820347490827067 precision 0.7836404816137879 recall 0.3769586212987232 roc AUC 0.6519686036501577


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5266405159099207 precision 0.9743304486418325 recall 0.4235454524093187 roc AUC 0.7078689738751153


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5266519230846867 precision 0.9749746668880618 recall 0.42366624659389307 roc AUC 0.7080019526498613

Epoch 10/10
----------


  0%|          | 0/16501 [00:00<?, ?it/s]

Train loss 0.5816938862620856 precision 0.7843368124043266 recall 0.3771307199968426 roc AUC 0.6521982524615895


  0%|          | 0/4126 [00:00<?, ?it/s]

Val loss 0.5314192896437079 precision 0.9750452709916725 recall 0.4193172731256483 roc AUC 0.7058914027370399


  0%|          | 0/4126 [00:00<?, ?it/s]

Test loss 0.5314346256977831 precision 0.9752547755145654 recall 0.4192722452790785 roc AUC 0.7059052072428043



In [49]:
plot_accuracy_scores(history, 'recall')

In [50]:
plot_accuracy_scores(history, 'precision')

In [51]:
plot_accuracy_scores(history, 'roc_auc')

In [52]:
torch.save(net_linear.state_dict(), '/content/drive/MyDrive/data/linear_emb')

### Testing

#### Clusters = 3

In [None]:
linear_metrics_df_3 = perform_nn_testing(net_linear, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3) 

In [67]:
linear_metrics_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.0,0.0,0.5
60,0.0,0.0,0.5
100,0.0,0.0,0.5
160,0.0,0.0,0.5
210,0.0,0.0,0.5
300,0.316357,0.035989,0.492046
500,0.334557,0.168786,0.442023
540,0.316828,0.396228,0.425203
600,0.30033,0.358854,0.429738
690,0.431913,0.409475,0.447186


In [68]:
fig = px.line(linear_metrics_df_3, title="Метрики качества линейной модели (3 кластера)").update_layout(xaxis_title="Количество узлов",
                                                                                                        yaxis_title="Значение метрики")   
fig.show()

#### Clusters = 5

In [None]:
linear_metrics_df_5 = perform_nn_testing(net_linear, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5) 

In [70]:
linear_metrics_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.0,0.0,0.5
60,0.0,0.0,0.5
80,0.0,0.0,0.5
150,0.0,0.0,0.5
180,0.0,0.0,0.5
230,0.5,0.000747,0.500209
350,0.185124,0.086281,0.470641
590,0.204072,0.394789,0.457685
720,0.229843,0.401109,0.458516
940,0.206347,0.347143,0.465597


In [71]:
fig = px.line(linear_metrics_df_5, title="Метрики качества линейной модели (5 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                                        yaxis_title="Значение метрики")   
fig.show()

#### Clusters = 10

In [None]:
linear_metrics_df_10 = perform_nn_testing(net_linear, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10) 

In [73]:
linear_metrics_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.0,0.0,0.5
120,0.0,0.0,0.5
220,0.0,0.0,0.5
465,0.115168,0.304473,0.463636
510,0.108254,0.208659,0.455403
635,0.119524,0.453306,0.468064
770,0.105541,0.09894,0.480229
900,0.109454,0.192105,0.477079
980,0.126735,0.218381,0.463192
1330,0.131484,0.62072,0.480213


In [74]:
fig = px.line(linear_metrics_df_10, title="Метрики качества линейной модели (10 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                                           yaxis_title="Значение метрики")   
fig.show()

## GRU NN

### Training

In [None]:
train_loader, val_loader, test_loader = get_data_tensor(emb_data)

Train size: 3883120, Val size: 485390, Test size: 485391


In [None]:
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

history_gru, net_gru = train(GraphEmbeddingsGRUNN, train_loader, val_loader, test_loader)

<IPython.core.display.Javascript object>

Epoch 1/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.6691661355121219 precision 0.25920014022623455 recall 0.02979860151146032 roc AUC 0.5100071981689231


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.6557310257679612 precision 0.6911821692060709 recall 0.11641896558102587 roc AUC 0.5408101372347186

Epoch 2/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.6496442428205109 precision 0.6549328293784774 recall 0.13842689664434835 roc AUC 0.5444478695469186


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.6434469590581818 precision 0.6493946346905849 recall 0.1515746998337505 roc AUC 0.5485524820135973


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.6449372456000217 precision 0.6500748770404893 recall 0.15064827527734828 roc AUC 0.5480514895629957

Epoch 3/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.6448300484025382 precision 0.6608612548287505 recall 0.1422049337826834 roc AUC 0.5465576421986827


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.6419810308430774 precision 0.6898370498596167 recall 0.13309069988319597 roc AUC 0.5466330579898936


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.6435075811776579 precision 0.6887838223081416 recall 0.13180891333676964 roc AUC 0.5459349863041485

Epoch 4/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.6430176811719964 precision 0.6677863828645823 recall 0.14311403488189509 roc AUC 0.5475950426879089


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.639339881162671 precision 0.6835237216761476 recall 0.14199412087018642 roc AUC 0.5490449159682375


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.6408858494270832 precision 0.6823443377065704 recall 0.14103462065922723 roc AUC 0.5484150629127331

Epoch 5/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.6301333125386228 precision 0.6864296381657977 recall 0.21834824294078156 roc AUC 0.5761987826609035


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5928699012061017 precision 0.7756320259448231 recall 0.37300431259744227 roc AUC 0.6503733801769281


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.595152978088747 precision 0.7709490552952014 recall 0.36972133282378683 roc AUC 0.6479083855624469

Epoch 6/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.5717597607648697 precision 0.7499916661012181 recall 0.4831304038970295 roc AUC 0.6870965568427624


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5501430804877805 precision 0.8044305107234455 recall 0.5014456528220501 roc AUC 0.7101312794216385


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.5513512641466484 precision 0.8045871394062933 recall 0.500404443949328 roc AUC 0.7095187359768766

Epoch 7/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.5485737984484899 precision 0.7857712559592933 recall 0.5295719961188925 roc AUC 0.7157067027886542


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5352872697621261 precision 0.8269342689839967 recall 0.5295156395577573 roc AUC 0.7278641815710721


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.5362348922820761 precision 0.82721988533119 recall 0.5288728157081471 roc AUC 0.7274893669013522

Epoch 8/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.531608811662714 precision 0.8113435843417949 recall 0.5617551037697113 roc AUC 0.736464224373264


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5226770048087422 precision 0.8742873170089529 recall 0.5288960164588563 roc AUC 0.7391239022680811


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.5239239063122767 precision 0.8758208928749442 recall 0.5266787691792901 roc AUC 0.7383116749883346

Epoch 9/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.5235837709849491 precision 0.8230019640916286 recall 0.5765714648971373 roc AUC 0.7461918034601723


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5150508392683724 precision 0.8638916753565272 recall 0.5626597334705457 roc AUC 0.7518744888222171


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.5166065355199577 precision 0.8630261223362254 recall 0.5601985483543577 roc AUC 0.7503461331995204

Epoch 10/10
----------


  0%|          | 0/15169 [00:00<?, ?it/s]

Train loss 0.5152572540002583 precision 0.8334600510661703 recall 0.5941351252853452 roc AUC 0.7567659387150802


  0%|          | 0/3793 [00:00<?, ?it/s]

Val loss 0.5041865410699148 precision 0.8523528994662463 recall 0.6077822727092888 roc AUC 0.7688963008181615


  0%|          | 0/3793 [00:00<?, ?it/s]

Test loss 0.5054079345983595 precision 0.8530145415600636 recall 0.6052350553071416 roc AUC 0.767630813526218



In [None]:
plot_accuracy_scores(history_gru, 'recall')

In [None]:
plot_accuracy_scores(history_gru, 'precision')

In [None]:
plot_accuracy_scores(history_gru, 'roc_auc')

In [None]:
torch.save(net_gru.state_dict(), '/content/drive/MyDrive/data/gru_emb')

### Testing

#### Clusters = 3

In [None]:
gru_metrics_df_3 = perform_nn_testing(net_gru, CLUSTERS_SIZES_TEST_3, CLUSTERS_PROBS_TEST_3)

In [82]:
gru_metrics_df_3

Unnamed: 0,Precision,Recall,Roc AUC
30,0.431034,0.185185,0.537593
60,0.360294,0.146269,0.494043
100,0.308057,0.140541,0.476077
160,0.532657,0.135048,0.510832
210,0.395472,0.165108,0.477971
300,0.376523,0.201393,0.489432
500,0.383829,0.37579,0.432066
540,0.364163,0.482161,0.472167
600,0.325579,0.459241,0.445501
690,0.45724,0.475702,0.467818


In [83]:
fig = px.line(gru_metrics_df_3, title="Метрики для GRU модели (3 кластера)").update_layout(xaxis_title="Количество узлов",
                                                                                                yaxis_title="Precision score")   
fig.show()

#### Clusters = 5

In [None]:
gru_metrics_df_5 = perform_nn_testing(net_gru, CLUSTERS_SIZES_TEST_5, CLUSTERS_PROBS_TEST_5)

In [85]:
gru_metrics_df_5

Unnamed: 0,Precision,Recall,Roc AUC
25,0.097561,0.08,0.466
60,0.263158,0.175676,0.522838
80,0.368263,0.156688,0.533923
150,0.230329,0.120374,0.496893
180,0.415402,0.123475,0.51038
230,0.317036,0.183199,0.50496
350,0.233401,0.163733,0.479208
590,0.20431,0.488571,0.448067
720,0.254374,0.586942,0.483644
940,0.235483,0.624964,0.496433


In [86]:
fig = px.line(gru_metrics_df_5, title="Метрики для GRU модели (5 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                            yaxis_title="Precision score")   
fig.show()

#### Clusters = 10

In [None]:
gru_metrics_df_10 = perform_nn_testing(net_gru, CLUSTERS_SIZES_TEST_10, CLUSTERS_PROBS_TEST_10)

In [88]:
gru_metrics_df_10

Unnamed: 0,Precision,Recall,Roc AUC
50,0.064039,0.13,0.480556
120,0.115534,0.160811,0.509234
220,0.131533,0.158673,0.49575
465,0.128914,0.366622,0.48358
510,0.137919,0.436255,0.481861
635,0.127055,0.504056,0.483847
770,0.13218,0.486524,0.479494
900,0.125077,0.637699,0.479307
980,0.153748,0.573063,0.48049
1330,0.136592,0.611127,0.494513


In [90]:
fig = px.line(gru_metrics_df_10, title="Метрики для GRU модели (10 кластеров)").update_layout(xaxis_title="Количество узлов",
                                                                                             yaxis_title="Precision score")   
fig.show()