# Загрузка библиотек

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# The project directory added to sys.path in the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import sys

# Make the project root importable so that the `src` and `configs` packages resolve.
# The membership check keeps the cell idempotent: re-running it does not append
# the same directory to sys.path again.
_project_root = '/content/drive/MyDrive/ColabNotebooks/DeepL_Geoproject'
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [3]:
from src.data.preprocessing import create_patches_to_disc
from src.data.dataset import InriaDataset
from src.data.transforms import *
from src.models.unet import UNet
from src.models.losses import DiceBCELoss
from src.train import *
from src.utils.visualize_history import plot_history
from src.utils.visualize_img_gt import visualize_images_and_masks
from src.utils.metrics import SegmentationMetrics

from configs.base import *

from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import torch
import json
import os
from glob import glob

from importlib import reload

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [4]:
!pip install torchmetrics

Collecting torchmetrics
  Downloading torchmetrics-1.7.3-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->torchmetrics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->torchmetrics)
  D

# Загрузка данных

## Комментарий по созданию InriaDataset

Рассматривалось два варианта создания InriaDataset.
1. Без предобработки. Нарезка исходных картинок 5000х5000 "на лету" во время обучения потребует много ресурсов и соответственно много времени на одну эпоху. Но занимает меньше места на диске.
2. С предобработкой. Патчи 512х512 заранее сохраняются на диск, и во время обучения датасет читает их напрямую.

Принято решение выбрать второй вариант.

В связи с тем, что объем памяти на диске ограничен, обучать модель будем поэтапно.
1. Загрузили часть обучающих данных.
2. Обучили на них модель.
3. Сохранили веса модели.
4. Удалили обучающие данные.
5. Повторяем пп. 1-4, пока не закончатся данные.

## Предпроцессинг, создание датасета

In [None]:
# Locations of the source images / ground-truth masks and of the patches cut from them.
# All four directories belong to the same dataset subset inside the project.
_subset_dir = 'data/train_small'

images_path = os.path.join(PROJECT_PATH, f'{_subset_dir}/images')
gt_path = os.path.join(PROJECT_PATH, f'{_subset_dir}/gt')

images_patches_path = os.path.join(PROJECT_PATH, f'{_subset_dir}/images_patches')
gt_patches_path = os.path.join(PROJECT_PATH, f'{_subset_dir}/gt_patches')

In [None]:
# Collect the 512x512 patches that have already been written to disk.
image_paths = sorted(glob(os.path.join(images_patches_path, "*.tif")))
gt_paths = sorted(glob(os.path.join(gt_patches_path, "*.tif")))

# No cached patches yet (e.g. a new portion of the data has just been uploaded):
# cut the large 5000x5000 source images into patches and use the paths returned
# by the helper. When patches already exist this expensive step is skipped.
if not image_paths and not gt_paths:
    image_paths, gt_paths = create_patches_to_disc(
        images_path, gt_path, images_patches_path, gt_patches_path
    )

# Fail here with a clear message instead of later inside train_test_split / DataLoader.
if not image_paths or not gt_paths:
    raise FileNotFoundError(
        f'No patches available: {len(image_paths)} image patch(es) in {images_patches_path}, '
        f'{len(gt_paths)} mask patch(es) in {gt_patches_path}'
    )

In [None]:
# Training is resumed across sessions (weights and best_val_loss are reloaded from disk),
# so the split has to be reproducible: with a random split, patches that were used for
# training before a reconnect could end up in the validation set afterwards, and the
# persisted best_val_loss would be compared against losses measured on a different set.
train_img, val_img, train_gt, val_gt = train_test_split(
    image_paths, gt_paths, test_size=0.2, random_state=42
)

In [None]:
# Wrap the path lists into datasets; each split gets its own transform.
train_dataset = InriaDataset(train_img, train_gt, transform=train_transform)
val_dataset = InriaDataset(val_img, val_gt, transform=val_transform)

# Options shared by both loaders; only the training loader shuffles its samples.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, **_loader_kwargs)

## Создание модели, загрузка весов

In [None]:
# Build the network and move it to the selected device.
model = UNet()
model = model.to(device)

# AdamW over all model parameters with the learning rate taken from the config.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LR)

# Training loss and the metric collection evaluated during validation.
criterion = DiceBCELoss()
metrics = SegmentationMetrics(device=device)

In [None]:
# Resume from the weights saved at the end of the previous training stage.
weights_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/unet_resnet34_buildings_final.pth')

if os.path.exists(weights_path):
    # map_location remaps the stored tensors to the current device, so a checkpoint
    # written on a GPU runtime also loads on a CPU-only one.
    # weights_only=True limits unpickling to tensors and plain containers, which is
    # all a state_dict contains.
    state_dict = torch.load(weights_path, map_location=device, weights_only=True)
    # load_state_dict reports missing / unexpected keys; print the report.
    print(model.load_state_dict(state_dict))
else:
    # First stage: nothing has been saved yet, keep the freshly created model.
    print(f'No checkpoint found at {weights_path}; starting from the freshly initialised model.')

<All keys matched successfully>

## Обучение

In [None]:
best_val_loss_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/best_val_loss.json')
history_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/loss_history.json')


def _new_history():
    """Return an empty history record with the layout used by the training loop."""
    return {
        'train': {'loss': []},
        'val': {
            'loss': [],
            'metrics': {
                'IoU': [],
                'Dice': [],
                'Precision': [],
                'Recall': []
            }
        },
        'best_epoch': -1,
        'timestamp': datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    }


# Training is done in stages and the training loop rewrites loss_history.json after
# every epoch. Continue from the stored history when it exists, so that the epochs of
# earlier stages are kept instead of being overwritten by a fresh, empty record.
history = None
if os.path.exists(history_path):
    try:
        with open(history_path, 'r') as f:
            history = json.load(f)
    except json.JSONDecodeError:
        # E.g. the file was truncated by an interrupted write; start a new record.
        print(f'Could not parse {history_path}; starting a new history.')

if history is None:
    history = _new_history()

In [None]:
# Load the best validation loss reached in the previous training stage.
# On the very first stage the file does not exist yet: start from +inf so that
# the first epoch is always accepted as the best one so far.
if os.path.exists(best_val_loss_path):
    with open(best_val_loss_path, "r") as f:
        best_val_loss = json.load(f)
else:
    best_val_loss = float('inf')

In [None]:
# Output locations do not change between epochs, so they are built once up front.
model_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/unet_resnet34_buildings_best.pth')
final_model_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/unet_resnet34_buildings_final.pth')
history_path = os.path.join(PROJECT_PATH, 'experiments/checkpoints/loss_history.json')

# NOTE(review): the last recorded run of this cell ended with a CUDA out-of-memory error.
# Check BATCH_SIZE and that validate() does not keep autograd graphs alive — confirm in src.train.
for epoch in range(EPOCHS):

    # --- One pass over the training data ---
    train_loss = train(model, train_loader, optimizer, criterion, device)
    history['train']['loss'].append(train_loss)

    # --- Validation: loss plus the metrics tracked in the history ---
    val_loss, val_metrics = validate(model, val_loader, criterion, metrics, device)
    history['val']['loss'].append(val_loss)
    for metric_name in history['val']['metrics'].keys():
        # .item() turns the metric value into a plain number that json.dump can write
        history['val']['metrics'][metric_name].append(val_metrics[metric_name].item())

    print(f'\nEpoch {epoch+1}/{EPOCHS}:')
    print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
    print('Val Metrics:')
    for name, value in val_metrics.items():
        print(f'- {name}: {value:.4f}')

    # --- Save the best model ---
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # 0-based position of this epoch inside the recorded history lists
        history['best_epoch'] = len(history['val']['loss']) - 1

        torch.save(model.state_dict(), model_path)
        print(f'Model saved to {model_path}')

        # Persist best_val_loss so it can be reloaded after reconnecting to the runtime
        with open(best_val_loss_path, 'w') as f:
            json.dump(best_val_loss, f)

    # The latest weights are written after every epoch; the next stage resumes from them
    torch.save(model.state_dict(), final_model_path)

    # Save the history as JSON
    with open(history_path, 'w') as f:
        json.dump(history, f)


OutOfMemoryError: CUDA out of memory. Tried to allocate 768.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 202.12 MiB is free. Process 405958 has 14.54 GiB memory in use. Of the allocated memory 13.22 GiB is allocated by PyTorch, and 1.20 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Обучение происходит поэтапно.
В каждом новом этапе подгружаем веса из unet_resnet34_buildings_final.pth

Всю историю сохраняем в loss_history.json, указывая, какие это были эпохи.

## Визуализация результатов

In [None]:
# Visualise the history dict filled by the training loop.
# Which curves are drawn is decided by plot_history in src.utils.visualize_history.
plot_history(history)

In [None]:
# Switch to inference mode so that any dropout / batch-norm layers in the model
# stop behaving as in training; this cell may be run right after loading weights,
# when the model is still in its default training mode.
model.eval()

# Run the model on the first validation batch without tracking gradients.
with torch.no_grad():
  im, mas = next(iter(val_loader))
  im = im.to(device)
  # unsqueeze(1) inserts a new axis at position 1 of the mask tensor
  mas = mas.unsqueeze(1).to(device)
  out = model(im)

In [None]:
# Show the input batch, the model output for it and the ground-truth masks from the previous cell.
# NOTE(review): `out` is passed exactly as returned by the model — confirm whether
# visualize_images_and_masks applies an activation / threshold itself.
visualize_images_and_masks(im, out, mas)