In [1]:
# Install torch_geometric and nni into the runtime; --quiet suppresses most of pip's output.
!pip install torch_geometric --quiet
!pip install nni --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.4/61.4 MB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.8/144.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.5.0 requires fsspec[http]<=2024.12.0,>=2023.1.0, but you have fsspec 2025.3.2 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_sy

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
import os
import json
from torch_geometric.utils import dense_to_sparse
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS, DBSCAN, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader

import torch
import gc
from torch.utils.data import DataLoader
from collections import deque

# Custom imports
import sys
sys.path.insert(1, "/kaggle/input/second-dataset/dependecies")

import GCN
from Graph import Graph
from data_generator import generate_arch_dicts

# Run on the GPU when torch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Module-level flag; nothing in the visible cells reads it.
TEST = False

In [3]:
# Weight files that are loaded into model_accuracy and model_diversity respectively.
model_accuracy_path = "/kaggle/input/second-dataset/weights/model_accuracy_weights.pth"
model_diversity_path = "/kaggle/input/second-dataset/weights/model_diversity_weights.pth"

In [4]:
# Constructor arguments for the diversity model; it produces a 128-dimensional output.
input_dim, output_dim, dropout = 8, 128, 0.1

# Build the network, move it to the target device, then restore the saved weights.
model_diversity = GCN.GAT(input_dim, output_dim, dropout)
model_diversity = model_diversity.to(device)
state_dict = torch.load(model_diversity_path, weights_only=True, map_location=device)
model_diversity.load_state_dict(state_dict)

# Switch to eval mode; as the last expression this also displays the module structure.
model_diversity.eval()

GAT(
  (gat1): GATv2Conv(8, 16, heads=4)
  (gat2): GATv2Conv(64, 64, heads=4)
  (gat3): GATv2Conv(256, 64, heads=4)
  (gat4): GATv2Conv(256, 16, heads=4)
  (res1): Linear(in_features=8, out_features=64, bias=True)
  (res2): Linear(in_features=64, out_features=256, bias=True)
  (res3): Linear(in_features=256, out_features=256, bias=True)
  (res4): Linear(in_features=256, out_features=64, bias=True)
  (norm1): GraphNorm(64)
  (norm2): GraphNorm(256)
  (norm3): GraphNorm(256)
  (norm4): GraphNorm(64)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
)

In [5]:
# Constructor arguments for the accuracy model: one scalar output, 16 attention heads.
input_dim, output_dim, dropout, heads = 8, 1, 0.4, 16

# Build the network, move it to the target device, then restore the saved weights.
model_accuracy = GCN.GAT(input_dim, output_dim, dropout, heads=heads)
model_accuracy = model_accuracy.to(device)
state_dict = torch.load(model_accuracy_path, weights_only=True, map_location=device)
model_accuracy.load_state_dict(state_dict)

# Switch to eval mode; as the last expression this also displays the module structure.
model_accuracy.eval()

GAT(
  (gat1): GATv2Conv(8, 4, heads=16)
  (gat2): GATv2Conv(64, 16, heads=16)
  (gat3): GATv2Conv(256, 16, heads=16)
  (gat4): GATv2Conv(256, 4, heads=16)
  (res1): Linear(in_features=8, out_features=64, bias=True)
  (res2): Linear(in_features=64, out_features=256, bias=True)
  (res3): Linear(in_features=256, out_features=256, bias=True)
  (res4): Linear(in_features=256, out_features=64, bias=True)
  (norm1): GraphNorm(64)
  (norm2): GraphNorm(256)
  (norm3): GraphNorm(256)
  (norm4): GraphNorm(64)
  (dropout): Dropout(p=0.4, inplace=False)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)

Зададим гиперпараметры:

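1. K -- количество моделей в ансамбле.
2. ALPHA -- компромисс (trade-off) между точностью и разнообразием, где разнообразие определяется как расстояние от эмбеддинга модели до эмбеддингов моделей, уже входящих в ансамбль.
3. N -- количество моделей, генерируемых за одну эпоху (итерацию поиска).
4. M -- минимальный размер пула кандидатов, при котором начинается отбор моделей в ансамбль.
5. BATCH_SIZE -- размер батча при вычислении эмбеддингов.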

Мы хотим, чтобы в ансамбль попадали только достаточно точные модели, поэтому введём порог $\gamma$ (`GAMMA`) -- минимальную допустимую точность модели.

In [6]:
# Search hyperparameters.
K = 6                 # number of models in the final ensemble
ALPHA = 0.9           # accuracy/diversity trade-off weight (not referenced by the visible cells)
N = 10**6             # architectures generated per search iteration
M = 2**16             # minimum candidate-pool size before selection starts
GAMMA = 0.87          # minimum predicted accuracy for a candidate to enter the pool
BATCH_SIZE = 2**13    # batch size used when extracting embeddings

In [7]:
def find_dist_to_best(best_embs, emb):
    """
    Return the smallest Euclidean distance from ``emb`` to any row of ``best_embs``.

    best_embs: tensor [k, d] of already selected embeddings.
    emb: tensor [d] or [1, d] holding the candidate embedding.

    Returns a Python float; ``float("inf")`` when ``best_embs`` has no elements.
    """
    if best_embs.numel() != 0:
        # Add a leading axis so cdist sees a batch of one query point.
        query = emb[None]
        pairwise = torch.cdist(query, best_embs, p=2)  # [1, k]
        return pairwise.min().item()

    # Nothing selected yet: every candidate is infinitely far away.
    return float("inf")

In [8]:
def _distances_to_point(pool, point):
    """Return float32 Euclidean distances from every row of ``pool`` (P, d) to ``point`` (d,)."""
    diff = pool - np.asarray(point, dtype=np.float32)
    return np.linalg.norm(diff, axis=1)


def optimize_architecture_search(
    K, M, N, GAMMA, BATCH_SIZE, model_accuracy, model_diversity, device
):
    """
    Greedily build an ensemble of K architectures that pass an accuracy
    threshold and are far apart in the diversity-embedding space.

    Each outer iteration generates N architectures, scores them with both
    models, keeps those whose accuracy-model output is >= GAMMA, and adds
    them to a candidate pool. While the pool holds at least M candidates,
    the candidate whose minimum distance to the already selected
    embeddings is largest is moved from the pool into the ensemble.

    Parameters:
        K: size of the final ensemble.
        M: minimum size of the candidate pool required for selection.
        N: number of architectures generated per iteration.
        GAMMA: accuracy threshold applied to the accuracy-model output.
        BATCH_SIZE: batch size of the DataLoader used for inference.
        model_accuracy, model_diversity: GNN models passed to
            GCN.extract_embeddings.
        device: device passed to GCN.extract_embeddings.

    Returns:
        (best_models, potential_archs, potential_embeddings,
        potential_accuracies): the selected architectures followed by the
        remaining pool (architectures, float16 embeddings, accuracies) as
        parallel lists.

    Raises:
        ValueError: if the number of accuracy predictions differs from the
            number of generated architectures.
    """
    best_models = []
    best_embeddings = []
    potential_archs = []
    potential_embeddings = []
    potential_accuracies = []

    # Selection needs at least one candidate even if M <= 0 is passed.
    min_pool = max(M, 1)

    while len(best_models) < K:
        print(
            f"\nProgress: {len(best_models)}/{K} selected, pool size {len(potential_archs)}/{M}"
        )

        # 1) Generate candidate architectures.
        arch_dicts = generate_arch_dicts(N, use_tqdm=True)  # list of dicts

        # 2) Build graphs and the dataset / loader.
        graphs = [Graph(arch, index=i) for i, arch in enumerate(arch_dicts)]
        dataset = GCN.CustomDataset(graphs, use_tqdm=True)
        loader = DataLoader(
            dataset,
            batch_size=BATCH_SIZE,
            shuffle=False,
            num_workers=4,
            collate_fn=GCN.collate_graphs,
        )

        # 3) Run both models over the same loader (outputs are numpy arrays).
        with torch.no_grad():
            emb_acc_np, _ = GCN.extract_embeddings(
                model_accuracy, loader, device, use_tqdm=True
            )
            emb_div_np, _ = GCN.extract_embeddings(
                model_diversity, loader, device, use_tqdm=True
            )

        # 4) Accuracy filter. Flatten the predictions so that an (N, 1)
        #    output works the same as an (N,) one: a 2-D mask cannot index
        #    the (N, d) embedding matrix row-wise, and array-valued
        #    accuracies cannot be formatted with ":.4f".
        acc_scores = np.asarray(emb_acc_np).reshape(-1)
        if acc_scores.shape[0] != len(arch_dicts):
            raise ValueError(
                f"Expected {len(arch_dicts)} accuracy predictions, got {acc_scores.shape[0]}"
            )
        mask = acc_scores >= GAMMA  # boolean array, shape (N,)

        valid_archs = [arch for arch, ok in zip(arch_dicts, mask) if ok]
        # Embeddings are stored as float16 to keep the pool small in memory.
        valid_div_embs = emb_div_np[mask].astype(np.float16)  # (n_valid, d)
        valid_accs = acc_scores[mask]  # (n_valid,)

        potential_archs.extend(valid_archs)
        potential_embeddings.extend(valid_div_embs)
        potential_accuracies.extend(valid_accs)

        # 5) Drop per-iteration temporaries and collect garbage.
        del arch_dicts, graphs, dataset, loader
        del emb_acc_np, emb_div_np
        torch.cuda.empty_cache()
        gc.collect()

        # 6) Once the pool is large enough, pick the most distant candidates.
        if len(best_models) < K and len(potential_archs) >= min_pool:
            # Distances are computed in float32: np.linalg.norm keeps the
            # float16 dtype of the stored embeddings, which can overflow and
            # has too little precision to rank candidates reliably.
            pool = np.stack(potential_embeddings).astype(np.float32)

            # min_dist[i] = distance from candidate i to its nearest selected
            # model; inf while nothing is selected, so the first pick is
            # the first candidate in the pool (argmax returns the first maximum).
            min_dist = np.full(pool.shape[0], np.inf, dtype=np.float32)
            for chosen in best_embeddings:
                np.minimum(min_dist, _distances_to_point(pool, chosen), out=min_dist)

            while len(best_models) < K and len(potential_archs) >= min_pool:
                farthest = int(np.argmax(min_dist))
                dist = float(min_dist[farthest])
                chosen = pool[farthest].copy()

                # Move the candidate from the pool into the ensemble.
                best_models.append(potential_archs.pop(farthest))
                best_embeddings.append(potential_embeddings.pop(farthest))
                acc = potential_accuracies.pop(farthest)
                print(
                    f"Selected #{len(best_models)}/{K}: acc={acc:.4f}, dist={dist:.4f}"
                )

                # Keep the arrays aligned with the lists, then fold in the
                # distance to the newly selected model only; distances to the
                # earlier selections are already part of the running minimum.
                pool = np.delete(pool, farthest, axis=0)
                min_dist = np.delete(min_dist, farthest)
                np.minimum(min_dist, _distances_to_point(pool, chosen), out=min_dist)

            del pool, min_dist

    return best_models, potential_archs, potential_embeddings, potential_accuracies

In [None]:
# Run the greedy search; keep both the selected ensemble and the leftover candidate pool.
best_models_acc, potential_archs, potential_embeddings, potential_accuracies = optimize_architecture_search(
    K=K,
    M=M,
    N=N,
    GAMMA=GAMMA,
    BATCH_SIZE=BATCH_SIZE,
    model_accuracy=model_accuracy,
    model_diversity=model_diversity,
    device=device,
)


Progress: 0/6 selected, pool size 0/65536


  0%|          | 0/1000000 [00:00<?, ?it/s]

  0%|          | 0/1000000 [00:00<?, ?it/s]



In [None]:

def select_central_models_by_clusters(
    potential_archs,
    potential_embeddings,
    potential_accuracies,
    K,
    random_state=42,
    plot_pca=False
):
    """
    Cluster the candidate embeddings into K KMeans clusters and, from each
    non-empty cluster, pick the candidate closest to the cluster centroid.

    When plot_pca is True, a 2-D PCA projection of all embeddings, the
    centroids and the picked candidates is drawn.

    Returns:
        selected_archs: picked architectures (one per non-empty cluster)
        selected_embs: their embeddings (float32)
        selected_accs: their accuracies (float32)
    """
    emb_matrix = np.array(potential_embeddings, dtype=np.float32)
    acc_vector = np.array(potential_accuracies, dtype=np.float32)

    # 1. Clustering
    clusterer = KMeans(n_clusters=K, random_state=random_state, n_init='auto')
    labels = clusterer.fit_predict(emb_matrix)
    centers = clusterer.cluster_centers_

    # 2. For every cluster, find the member nearest to its centroid.
    selected_indices = []
    for label, center in enumerate(centers):
        members = np.flatnonzero(labels == label)
        if members.size == 0:
            continue

        offsets = emb_matrix[members] - center
        nearest_member = members[np.linalg.norm(offsets, axis=1).argmin()]
        selected_indices.append(nearest_member)

    selected_archs = [potential_archs[idx] for idx in selected_indices]
    selected_embs = [emb_matrix[idx] for idx in selected_indices]
    selected_accs = [acc_vector[idx] for idx in selected_indices]

    if plot_pca:
        # Project everything onto the first two principal components.
        reducer = PCA(n_components=2, random_state=random_state)
        points_2d = reducer.fit_transform(emb_matrix)
        centers_2d = reducer.transform(centers)
        picked_2d = points_2d[selected_indices]

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.scatter(points_2d[:, 0], points_2d[:, 1],
                   c=labels, cmap="tab10", alpha=0.4, label="Все модели")
        ax.scatter(centers_2d[:, 0], centers_2d[:, 1],
                   c="black", marker="X", s=100, label="Центроиды")
        ax.scatter(picked_2d[:, 0], picked_2d[:, 1],
                   c="red", marker="*", s=150, label="Выбранные модели")
        ax.set_title("PCA проекция эмбеддингов")
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        plt.show()

    return selected_archs, selected_embs, selected_accs

In [None]:
# Pick one representative per cluster from the leftover pool and plot the PCA projection.
best_models_cluster, best_embeddings_cluster, best_accuracies_cluster = select_central_models_by_clusters(
    potential_archs=potential_archs,
    potential_embeddings=potential_embeddings,
    potential_accuracies=potential_accuracies,
    K=K,
    random_state=42,
    plot_pca=True,
)

In [None]:
def save_models_to_dir(best_models, dir_name):
    """
    Write every architecture in ``best_models`` to its own JSON file in ``dir_name``.

    The directory is created if it does not exist. Files are named
    ``model_01.json``, ``model_02.json``, ... in iteration order, and a
    confirmation line is printed for each file written.
    """
    os.makedirs(dir_name, exist_ok=True)

    # One file per architecture, numbered from 1.
    position = 0
    for arch in best_models:
        position += 1
        target = os.path.join(dir_name, "model_{:02d}.json".format(position))
        with open(target, "w") as handle:
            json.dump(arch, handle, indent=4)
        print(f"Сохранена модель {position} в {target}")

In [None]:
# Persist both ensembles: the cluster-based picks and the greedy picks.
save_models_to_dir(best_models=best_models_cluster, dir_name="best_models_greed_cluster")
save_models_to_dir(best_models=best_models_acc, dir_name="best_models_greed")

In [None]:
# Recursively add both output directories to the same archive name (best_models).
!zip -r best_models best_models_greed
!zip -r best_models best_models_greed_cluster