In [1]:
# !pip install filelock --quiet --break-system-packages
# !pip install nni --quiet --break-system-packages
# !pip install torch --quiet --break-system-packages
# !pip install torchvision --quiet --break-system-packages
# !pip install numpy --quiet --break-system-packages
# !pip install matplotlib --quiet --break-system-packages
# !pip install tqdm --quiet --break-system-packages
# NOTE: json, os and random are part of the Python standard library and need no pip install.
# !pip install wandb --quiet --break-system-packages
# !pip install pytorch-lightning --quiet --break-system-packages
# !pip install torchmetrics --quiet --break-system-packages

[0m

In [2]:
import nni
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10
from nni.nas.evaluator.pytorch import DataLoader

import numpy as np
from nni.nas.evaluator.pytorch import Classification
from torch.utils.data import SubsetRandomSampler
from nni.nas.hub.pytorch import DARTS as DartsSpace
from nni.nas.space import model_context
from nni.nas.evaluator.pytorch import ClassificationModule
from nni.nas.evaluator.pytorch import Lightning, Trainer

from darts_classification_module import DartsClassificationModule

import matplotlib.pyplot as plt
from IPython.display import clear_output

from tqdm.notebook import tqdm

import json
import os
import random
import wandb
from pytorch_lightning.loggers import WandbLogger
import torch
from torch.nn import DataParallel
from nni.nas.evaluator.pytorch import ClassificationModule
import torchmetrics

In [3]:
# Never commit API keys to a notebook: the key is read from the WANDB_API_KEY
# environment variable. When the variable is unset, key=None is passed and wandb
# is left to resolve credentials itself (e.g. a previous `wandb login`).
# NOTE(review): the key that used to be hardcoded here should be revoked/rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdemoren[0m ([33mdemoren_mipt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Per-channel statistics used to normalize CIFAR-10 images.
CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

# Tunable settings for the train/validation split and the loaders.
SPLIT_SEED = 42        # fixes the split so runs are comparable after a kernel restart
TRAIN_FRACTION = 0.8   # share of the CIFAR-10 training set used for training
BATCH_SIZE = 96
NUM_WORKERS = 4

# Training transform: random augmentation followed by normalization.
transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])

# Validation transform: no random augmentation, so measured accuracy is not
# perturbed by random crops/flips and is comparable between models.
valid_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])

train_data = nni.trace(CIFAR10)(root='./data', train=True, download=True, transform=transform)
# Same underlying images (train=True) viewed through the deterministic transform;
# the files were already fetched by the call above, hence download=False.
valid_data = nni.trace(CIFAR10)(root='./data', train=True, download=False, transform=valid_transform)

num_samples = len(train_data)
# Seeded permutation -> the same train/validation partition on every run.
indices = np.random.default_rng(SPLIT_SEED).permutation(num_samples)
split = int(num_samples * TRAIN_FRACTION)

search_train_loader = DataLoader(
    train_data, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
    sampler=SubsetRandomSampler(indices[:split]),
)

# Held-out part of the permutation; disjoint from the training indices.
search_valid_loader = DataLoader(
    valid_data, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
    sampler=SubsetRandomSampler(indices[split:]),
)

Files already downloaded and verified


In [5]:
def load_json_from_directory(directory_path):
    """Recursively load every ``*.json`` file found under ``directory_path``.

    Directories and files are visited in sorted order, so the returned list has
    the same order on every machine and run (plain ``os.walk`` order depends on
    the filesystem). Files that are not valid JSON are reported with ``print``
    and skipped.

    Args:
        directory_path: Root directory to scan. A missing directory yields an
            empty list, because ``os.walk`` produces nothing for it.

    Returns:
        list: Parsed contents of each readable JSON file, in traversal order.
    """
    json_data = []
    for root, dirs, files in os.walk(directory_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from file {file_path}: {e}")
                else:
                    json_data.append(data)
    return json_data
    
# Load every architecture description found under this directory; the training
# loop further down reads the "architecture" key of each loaded entry.
arch_dicts = load_json_from_directory('../home/best_models_greed_cluster')

In [10]:
def train_model(
    architecture, 
    train_loader, 
    valid_loader, 
    max_epochs=600, 
    learning_rate=0.025, 
    project_name="neural_ensemble_search_6",
    run_name=None,
    fast_dev_run=False
):
    """Build a DARTS-space model for ``architecture`` and train it, logging to W&B.

    Args:
        architecture: Architecture sample passed to ``model_context`` to freeze
            the DARTS search space into a concrete network.
        train_loader: Dataloader given to the evaluator as ``train_dataloaders``.
        valid_loader: Dataloader given to the evaluator as ``val_dataloaders``.
        max_epochs: Number of epochs, forwarded to both the training module and
            the trainer.
        learning_rate: Learning rate forwarded to ``DartsClassificationModule``.
        project_name: W&B project the run is logged to.
        run_name: W&B run name; defaults to ``str(architecture)``.
        fast_dev_run: Forwarded to the trainer's ``fast_dev_run`` option.

    Returns:
        The model that was fitted (wrapped in ``torch.nn.DataParallel`` when
        more than one CUDA device is visible).
    """
    # Single source for the value that is both logged and used for training.
    weight_decay = 3e-4

    wandb.init(
        project=project_name,
        name=run_name or str(architecture),
        config={
            "architecture": str(architecture),
            "max_epochs": max_epochs,
            "learning_rate": learning_rate,
            "weight_decay": weight_decay,
            "fast_dev_run": fast_dev_run
        },
        reinit=True
    )

    # try/finally: the W&B run is closed even when building or fitting the model
    # raises (or the cell is interrupted), so the next call starts from a clean run.
    try:
        with model_context(architecture):
            # NOTE(review): the space is built without an auxiliary-loss option while
            # auxiliary_loss_weight=0.4 is passed below - confirm this is intended.
            model = DartsSpace(width=16, num_cells=10, dataset='cifar')

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
        model.to(device)

        evaluator = Lightning(
            DartsClassificationModule(
                learning_rate=learning_rate,
                weight_decay=weight_decay,
                auxiliary_loss_weight=0.4,
                max_epochs=max_epochs
            ),
            trainer=Trainer(
                gradient_clip_val=5.0,
                max_epochs=max_epochs,
                fast_dev_run=fast_dev_run,
                logger=WandbLogger(experiment=wandb.run)
            ),
            train_dataloaders=train_loader,
            val_dataloaders=valid_loader
        )

        evaluator.fit(model)
    finally:
        wandb.finish()
    return model

In [None]:
def evaluate_ensemble(models, valid_loader):
    """Measure the accuracy of each model and of their output-averaged ensemble.

    The ensemble prediction for a batch is the argmax of the element-wise mean
    of the member outputs. Every model is moved to the main device and switched
    to eval mode as a side effect.

    Args:
        models: Non-empty list of ``torch.nn.Module`` objects that map a batch
            of images to per-class scores.
        valid_loader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        tuple: ``(ensemble_accuracy, model_accuracies)`` - the ensemble accuracy
        in percent and a list with the accuracy in percent of each model.

    Raises:
        ValueError: If ``models`` is empty or ``valid_loader`` yields no samples.
    """
    if not models:
        raise ValueError("evaluate_ensemble requires at least one model")

    # Move every model to the main device.
    main_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    models = [model.to(main_device) for model in models]

    # Use DataParallel only when several GPUs are available; models that are
    # already wrapped are left alone so repeated calls do not nest wrappers.
    if torch.cuda.device_count() > 1:
        models = [
            model if isinstance(model, torch.nn.DataParallel) else torch.nn.DataParallel(model)
            for model in models
        ]

    for model in models:
        model.eval()

    total = 0
    correct_ensemble = 0
    correct_models = [0] * len(models)  # running count of correct answers per model

    with torch.no_grad():
        for images, labels in tqdm(valid_loader):
            images = images.to(main_device)
            labels = labels.to(main_device)
            total += labels.size(0)

            # Outputs of every ensemble member for this batch.
            member_outputs = [model(images) for model in models]

            # Individual accuracy of each member.
            for i, outputs in enumerate(member_outputs):
                predicted = outputs.argmax(dim=1)
                correct_models[i] += (predicted == labels).sum().item()

            # Ensemble accuracy: argmax of the averaged member outputs.
            avg_outputs = torch.stack(member_outputs).mean(dim=0)
            predicted = avg_outputs.argmax(dim=1)
            correct_ensemble += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("valid_loader yielded no samples; accuracy is undefined")

    # Final accuracies in percent.
    ensemble_accuracy = 100 * correct_ensemble / total
    model_accuracies = [100 * correct / total for correct in correct_models]

    print(f'Ensemble Accuracy: {ensemble_accuracy:.2f}%')
    for i, acc in enumerate(model_accuracies):
        print(f'Model {i + 1} Accuracy: {acc:.2f}%')

    return ensemble_accuracy, model_accuracies

: 

In [None]:
# Train one model per loaded architecture and re-evaluate the growing ensemble
# after each new member.
models = []
architectures = []
# One (ensemble size, ensemble accuracy, per-model accuracies) entry per
# evaluation, kept because the printed output is cleared on every iteration.
ensemble_history = []
for i, architecture in enumerate(arch_dicts):
    model = train_model(architecture["architecture"],
                        search_train_loader,
                        search_valid_loader,
                        max_epochs=100,
                        learning_rate = 0.025,
                        run_name = f"model_{i}", 
                        fast_dev_run=False)
    
    # Append both together: if the evaluation below fails or is interrupted,
    # `models` and `architectures` still have the same length and order.
    models.append(model)
    architectures.append(architecture)
    if len(models) > 1:
        print(f"Ensemble size: {len(models)}")
        ensemble_accuracy, model_accuracies = evaluate_ensemble(models, search_valid_loader)
        ensemble_history.append((len(models), ensemble_accuracy, model_accuracies))
    clear_output(wait=True)

Using 16bit Automatic Mixed Precision (AMP)
Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params | Mode 
-------------------------------------------------------
0 | criterion | CrossEntropyLoss | 0      | train
1 | metrics   | ModuleDict       | 0      | train
2 | _model    | DARTS            | 390 K  | train
-------------------------------------------------------
390 K     Trainable params
0         Non-trainable params
390 K     Total params
1.562     Total estimated model params size (MB)
755       Modules in train mode
0         Modules in eval mode


Epoch 88:  45%|▍| 189/417 [00:19<00:23,  9.77it/s, v_num=vzdn, train_loss=0.0029, train_ac

In [None]:
# Evaluate the ensemble of all trained models on the held-out validation loader.
evaluate_ensemble(models, search_valid_loader)

: 

In [None]:
def evaluate_and_save_results(
    models, architectures, valid_loader, folder_name="results"
):
    """Evaluate each model on ``valid_loader`` and save its results as JSON.

    One file named ``model_<k>_results.json`` (``k`` starting at 1) is written
    to ``folder_name`` per model. Each file contains:

    - ``architecture``: the architecture entry paired with the model.
    - ``valid_predictions``: predicted class indices, in loader iteration order.
    - ``valid_labels``: true labels in the same order as the predictions.
    - ``valid_accuracy``: fraction of correct predictions (0..1).

    Args:
        models (list): Trained models.
        architectures (list): JSON-serializable architecture descriptions, one
            per model and in the same order.
        valid_loader: Iterable yielding ``(images, labels)`` batches.
        folder_name (str, optional): Output folder, created if missing.
            Defaults to "results".

    Raises:
        ValueError: If the number of models and architectures differs, or if
            ``valid_loader`` yields no samples.
    """
    if len(models) != len(architectures):
        raise ValueError("Количество моделей и архитектур должно совпадать")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(folder_name, exist_ok=True)

    for i, (model, architecture) in enumerate(zip(models, architectures)):
        model.to(device)
        model.eval()

        valid_correct = 0
        valid_total = 0
        valid_preds = []
        # Labels are stored next to the predictions: when the loader samples in
        # random order, predictions alone cannot be matched to their targets.
        valid_labels = []

        with torch.no_grad():
            for images, labels in valid_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                predicted = outputs.argmax(dim=1)
                valid_preds.extend(predicted.cpu().tolist())
                valid_labels.extend(labels.cpu().tolist())
                valid_correct += (predicted == labels).sum().item()
                valid_total += labels.size(0)

        if valid_total == 0:
            raise ValueError("valid_loader yielded no samples; accuracy is undefined")

        valid_accuracy = valid_correct / valid_total

        result = {
            "architecture": architecture,
            "valid_predictions": valid_preds,
            "valid_labels": valid_labels,
            "valid_accuracy": valid_accuracy,
        }

        file_name = f"model_{i+1}_results.json"
        file_path = os.path.join(folder_name, file_name)

        # Explicit encoding keeps the output independent of the platform default.
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)

        print(f"Results for model_{i + 1} saved to {file_path}")

: 

In [None]:
# Write one JSON result file per trained model into the default "results" folder.
evaluate_and_save_results(models, architectures, search_valid_loader)

: 

In [None]:
!zip -r results.zip /kaggle/working/results

: 