# Лабораторная работа 7, студент Устинов Денис Александрович М8О-406Б-21

## 1. Выбор начальных условий

### a. Набор данных для задачи сегментации
В качестве датасета был выбран CamVid (https://github.com/alexgkendall/SegNet-Tutorial).

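CamVid — это небольшой, но хорошо структурированный датасет для семантической сегментации дорожных сцен. Он содержит видеозаписи и соответствующие им изображения с детальной разметкой, где каждый пиксель отнесен к одному из 32 классов, включая дороги, здания, автомобили, пешеходов и другие элементы городской среды. Данные были собраны в реальных условиях движения по городу, что обеспечивает разнообразие сцен с различным освещением, погодными условиями и углами обзора. Примечание: в варианте датасета из репозитория SegNet-Tutorial разметка, по-видимому, сведена к сокращённому набору классов (11 классов + «неразмеченный»); это стоит проверить и согласовать с параметрами `classes=32` и `num_classes=32` в коде ниже.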

Обоснование выбора:

С практической точки зрения, задачи, которые можно решать с помощью CamVid, имеют прямое отношение к разработке систем автономного вождения и интеллектуального анализа городской инфраструктуры. Например, точная сегментация дорожного полотна, тротуаров и препятствий критически важна для навигации беспилотных автомобилей. 

### b. Выбор метрик качества

1) IoU (Intersection over Union) — отношение площади пересечения предсказанной и истинной маски к их объединению (основная метрика в сегментации).
2) Pixel Accuracy — доля правильно классифицированных пикселей (простая, но чувствительна к дисбалансу классов).

## 2. Создание бейзлайна и оценка качества

### a. Обучить сверточную модель (resnet34) из segmentation_models_pytorch для выбранного набора данных и оценить качество моделей по выбранным метрикам на выбранном наборе данных

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import wget
import zipfile
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the resnet34 baseline.
IMAGE_SIZE = (128, 128)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 4            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer
device = torch.device("cpu")  # all tensors and the model are placed on the CPU

# Fetch and unpack CamVid unless the extracted data folder is already present.
# Testing for "./CamVid/data" (rather than just "./CamVid") lets an interrupted
# earlier run be repeated instead of being mistaken for a finished download.
if not os.path.isdir("./CamVid/data"):
    os.makedirs("./CamVid", exist_ok=True)
    url = "https://github.com/alexgkendall/SegNet-Tutorial/archive/master.zip"
    archive_path = "./CamVid/master.zip"

    # Drop a stale (possibly truncated) archive left behind by a failed run so
    # that it is never the file that gets extracted below.
    if os.path.exists(archive_path):
        os.remove(archive_path)
    wget.download(url, out=archive_path)

    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall("./CamVid")

    # Keep only the CamVid sub-folder under a short, stable name.
    os.rename("./CamVid/SegNet-Tutorial-master/CamVid", "./CamVid/data")
    os.remove(archive_path)

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(128, 128)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained resnet34 encoder ---
# NOTE(review): 32 follows the class count quoted for CamVid in the text above;
# confirm that the downloaded masks really use that many ids.
model = smp.Unet(encoder_name="resnet34", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

  from .autonotebook import tqdm as notebook_tqdm


IoU (Jaccard Index): 0.3292
Pixel Accuracy: 0.7772


### b. Обучить трансформерную модель (mit_b0) из segmentation_models_pytorch для выбранного набора данных и оценить качество моделей по выбранным метрикам на выбранном наборе данных

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the mit_b0 baseline.
IMAGE_SIZE = (224, 224)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 2            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer
device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained mit_b0 encoder ---
# NOTE(review): 32 follows the class count quoted for CamVid in the text above;
# confirm that the downloaded masks really use that many ids.
model = smp.Unet(encoder_name="mit_b0", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

  from .autonotebook import tqdm as notebook_tqdm


IoU (Jaccard Index): 0.3003
Pixel Accuracy: 0.7670


## 3. Улучшение бейзлайна

### a. Сформулировать гипотезы (аугментации данных, подбор моделей, подбор гиперпараметров и т.д)

1. **Аугментация данных**. Добавим случайные отражения (по горизонтали и вертикали), небольшие повороты и изменения яркости/контраста.
2. **Оптимизация гиперпараметров + AdamW**. Подбор learning rate, batch size и использование AdamW с косинусным расписанием ускоряют сходимость.
3. **Комбинированная функция потерь (Dice + CE)**. Сочетание кросс-энтропии и Dice Loss улучшает качество сегментации за счет баланса между точностью классификации и геометрией областей.

### Обучение моделей, оценка качества обучения моделей по метрикам

#### Аугментация данных

##### Сверточная модель resnet34

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torchvision.transforms.functional as TF
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
import random
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the augmented resnet34 run.
IMAGE_SIZE = (128, 128)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 4            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer
device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid image/mask pairs with optional random training augmentation.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.
        is_train: When true, random flips, a small rotation and
            brightness/contrast jitter are applied to every loaded sample.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(128, 128), is_train=True):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size
        self.is_train = is_train

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def _augment(self, img, label):
        """Apply identical random geometry to image and mask, plus photometric jitter to the image."""
        if random.random() > 0.5:
            img, label = TF.hflip(img), TF.hflip(label)

        # NOTE(review): upside-down road scenes are unlikely at test time;
        # consider dropping the vertical flip.
        if random.random() > 0.5:
            img, label = TF.vflip(img), TF.vflip(label)

        angle = random.uniform(-10, 10)
        img = TF.rotate(img, angle)
        # Nearest-neighbour keeps mask values valid class ids.
        # NOTE(review): the corners exposed by the rotation are filled with 0
        # (torchvision's default fill), i.e. they become class 0 in the mask;
        # a dedicated ignore label would be cleaner.
        label = TF.rotate(label, angle, interpolation=TF.InterpolationMode.NEAREST)

        # Photometric changes affect the image only.
        brightness = random.uniform(0.8, 1.2)
        contrast = random.uniform(0.8, 1.2)
        img = TF.adjust_brightness(img, brightness)
        img = TF.adjust_contrast(img, contrast)
        return img, label

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the normalised image tensor and the
            mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = TF.resize(img, self.image_size)
        # The enum replaces the deprecated integer interpolation constant.
        label = TF.resize(label, self.image_size, interpolation=TF.InterpolationMode.NEAREST)

        if self.is_train:
            img, label = self._augment(img, label)

        img = TF.to_tensor(img)
        label = torch.from_numpy(np.array(label)).long()

        img = self.normalize(img)

        return img, label

# --- Data: augmentation is enabled for the training split only ---
train_data = CamVidDataset(
    "./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE, is_train=True
)
test_data = CamVidDataset(
    "./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE, is_train=False
)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained resnet34 encoder ---
model = smp.Unet(encoder_name="resnet34", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.2267
Pixel Accuracy: 0.6731


##### Трансформерная модель mit_b0

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import torchvision.transforms.functional as TF
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
import random
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the augmented mit_b0 run.
IMAGE_SIZE = (224, 224)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 2            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer
device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid image/mask pairs with optional random training augmentation.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.
        is_train: When true, random flips, a small rotation and
            brightness/contrast jitter are applied to every loaded sample.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(128, 128), is_train=True):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size
        self.is_train = is_train

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def _augment(self, img, label):
        """Apply identical random geometry to image and mask, plus photometric jitter to the image."""
        if random.random() > 0.5:
            img, label = TF.hflip(img), TF.hflip(label)

        # NOTE(review): upside-down road scenes are unlikely at test time;
        # consider dropping the vertical flip.
        if random.random() > 0.5:
            img, label = TF.vflip(img), TF.vflip(label)

        angle = random.uniform(-10, 10)
        img = TF.rotate(img, angle)
        # Nearest-neighbour keeps mask values valid class ids.
        # NOTE(review): the corners exposed by the rotation are filled with 0
        # (torchvision's default fill), i.e. they become class 0 in the mask;
        # a dedicated ignore label would be cleaner.
        label = TF.rotate(label, angle, interpolation=TF.InterpolationMode.NEAREST)

        # Photometric changes affect the image only.
        brightness = random.uniform(0.8, 1.2)
        contrast = random.uniform(0.8, 1.2)
        img = TF.adjust_brightness(img, brightness)
        img = TF.adjust_contrast(img, contrast)
        return img, label

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the normalised image tensor and the
            mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = TF.resize(img, self.image_size)
        # The enum replaces the deprecated integer interpolation constant.
        label = TF.resize(label, self.image_size, interpolation=TF.InterpolationMode.NEAREST)

        if self.is_train:
            img, label = self._augment(img, label)

        img = TF.to_tensor(img)
        label = torch.from_numpy(np.array(label)).long()

        img = self.normalize(img)

        return img, label

# --- Data: augmentation is enabled for the training split only ---
train_data = CamVidDataset(
    "./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE, is_train=True
)
test_data = CamVidDataset(
    "./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE, is_train=False
)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained mit_b0 encoder ---
model = smp.Unet(encoder_name="mit_b0", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.1834
Pixel Accuracy: 0.6261


#### Оптимизация гиперпараметров + AdamW

##### Сверточная модель resnet34

Уменьшаем LEARNING_RATE и увеличиваем количество эпох и размер батча. Используем AdamW и CosineAnnealingLR

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Experiment settings for the tuned resnet34 run (AdamW + cosine schedule).
IMAGE_SIZE = (128, 128)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 8            # samples per DataLoader batch
EPOCHS = 5                # number of passes over the training loader (also the scheduler's T_max)
LEARNING_RATE = 5e-4      # initial learning rate given to the optimizer

device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained resnet34 encoder ---
model = smp.Unet(encoder_name="resnet34", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

# --- Optimisation: AdamW with weight decay, cosine-annealed once per epoch ---
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

    # The learning rate is advanced after each completed epoch.
    scheduler.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3560
Pixel Accuracy: 0.8121


##### Трансформерная модель mit_b0

Уменьшаем LEARNING_RATE и увеличиваем количество эпох и размер батча. Используем AdamW и CosineAnnealingLR

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Experiment settings for the tuned mit_b0 run (AdamW + cosine schedule).
IMAGE_SIZE = (224, 224)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 4            # samples per DataLoader batch
EPOCHS = 5                # number of passes over the training loader (also the scheduler's T_max)
LEARNING_RATE = 5e-4      # initial learning rate given to the optimizer

device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained mit_b0 encoder ---
model = smp.Unet(encoder_name="mit_b0", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

# --- Optimisation: AdamW with weight decay, cosine-annealed once per epoch ---
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

    # The learning rate is advanced after each completed epoch.
    scheduler.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3920
Pixel Accuracy: 0.8366


#### Комбинированная функция потерь (Dice + CE)

##### Сверточная модель resnet34

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the resnet34 run with the Dice + CE loss.
IMAGE_SIZE = (128, 128)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 4            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer

device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained resnet34 encoder ---
model = smp.Unet(encoder_name="resnet34", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

class DiceCELoss(nn.Module):
    """Equal-weight sum of cross-entropy and multiclass Dice loss."""

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * CE + 0.5 * Dice`` for the given outputs and targets."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# --- Optimisation: combined Dice + CE loss with plain Adam ---
criterion = DiceCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3633
Pixel Accuracy: 0.8088


##### Трансформерная модель mit_b0

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy

# Experiment settings for the mit_b0 run with the Dice + CE loss.
IMAGE_SIZE = (224, 224)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 2            # samples per DataLoader batch
EPOCHS = 3                # number of passes over the training loader
LEARNING_RATE = 1e-3      # learning rate given to the optimizer

device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """CamVid images paired with their per-pixel label masks.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are each sorted by
    name and then paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the label masks.
        image_size: Target size passed to the resize of both image and mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, since positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Pairing is purely positional, so unequal counts would either
        # misalign images and masks or fail with IndexError mid-epoch.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"{len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} label masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Masks hold class ids, so they are resized with nearest-neighbour
        # sampling. The enum replaces the deprecated integer constant and the
        # transform is built once rather than on every item access.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            A ``(img, label)`` tuple: the transformed image tensor and the
            resized mask converted to a ``torch.long`` tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# --- Data: train/test splits and their loaders ---
train_data = CamVidDataset("./CamVid/data/train", "./CamVid/data/trainannot", image_size=IMAGE_SIZE)
test_data = CamVidDataset("./CamVid/data/test", "./CamVid/data/testannot", image_size=IMAGE_SIZE)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# --- Model: U-Net with an ImageNet-pretrained mit_b0 encoder ---
model = smp.Unet(encoder_name="mit_b0", encoder_weights="imagenet", in_channels=3, classes=32)
model = model.to(device)

class DiceCELoss(nn.Module):
    """Equal-weight sum of cross-entropy and multiclass Dice loss."""

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * CE + 0.5 * Dice`` for the given outputs and targets."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# --- Optimisation: combined Dice + CE loss with plain Adam ---
criterion = DiceCELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Metrics accumulated over the whole test split ---
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# --- Training ---
for _ in range(EPOCHS):
    model.train()
    for batch_images, batch_masks in train_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_masks)
        batch_loss.backward()
        optimizer.step()

# --- Evaluation (no gradients needed) ---
model.eval()
with torch.no_grad():
    for batch_images, batch_masks in test_loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)
        # The highest-scoring channel is the predicted class of each pixel.
        predicted = model(batch_images).argmax(dim=1)

        iou.update(predicted, batch_masks)
        pixel_acc.update(predicted, batch_masks)

print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3341
Pixel Accuracy: 0.7872


### Окончательный улучшенный бейзлайн

#### Сверточная модель resnet34

Используем улучшения из 2-й и 3-й гипотез. Аугментацию (1-я гипотеза) не используем, так как она не дала улучшений.

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Experiment settings for the final improved resnet34 baseline.
IMAGE_SIZE = (128, 128)   # size handed to the dataset for resizing images and masks
BATCH_SIZE = 8            # samples per DataLoader batch
EPOCHS = 5                # epoch count for this run
LEARNING_RATE = 5e-4      # learning rate for this run

device = torch.device("cpu")  # all tensors and the model are placed on the CPU

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# U-Net from segmentation_models_pytorch with a resnet34 encoder initialised
# from the "imagenet" weights; 3 input channels, 32 output channels.
model = smp.Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    in_channels=3,
    classes=32,
).to(device)

class DiceCELoss(nn.Module):
    """Equal-weight blend of cross-entropy and multiclass Dice loss.

    Both terms receive the same ``(outputs, targets)`` pair and each
    contributes half of the returned value.
    """

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * cross_entropy + 0.5 * dice`` for the batch."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# Loss, optimiser and learning-rate schedule for the improved baseline.
criterion = DiceCELoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
# Cosine annealing with a period of EPOCHS scheduler steps (stepped per epoch).
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# Training: one optimiser step per mini-batch, one scheduler step per epoch.
for epoch in range(EPOCHS):
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    scheduler.step()

# Evaluation: a single pass over test_loader with gradient tracking disabled.
model.eval()
with torch.no_grad():
    for images, targets in test_loader:
        images, targets = images.to(device), targets.to(device)
        outputs = model(images)
        # Highest-scoring index along dim 1 (presumably the class axis).
        preds = torch.argmax(outputs, dim=1)

        iou.update(preds, targets)
        pixel_acc.update(preds, targets)

# Metrics aggregated over every test batch seen above.
print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3640
Pixel Accuracy: 0.8116


#### Трансформерная модель mit_b0

Используем улучшения из 2 и 3 гипотезы. Не используем аугментацию (1 гипотеза) - так как она не дала улучшений

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import segmentation_models_pytorch as smp
from torchmetrics import JaccardIndex, Accuracy
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Run configuration for this experiment cell.
device = torch.device("cpu")  # model, metrics and batches are moved to this device

BATCH_SIZE = 4  # batch size for both DataLoaders
EPOCHS = 5  # training passes; also used as T_max of the cosine schedule
LEARNING_RATE = 0.0005  # initial learning rate given to the optimiser
IMAGE_SIZE = (224, 224)  # resize target for images and masks

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)
# U-Net from segmentation_models_pytorch with a mit_b0 encoder initialised
# from the "imagenet" weights; 3 input channels, 32 output channels.
model = smp.Unet(
    encoder_name="mit_b0",
    encoder_weights="imagenet",
    in_channels=3,
    classes=32,
).to(device)

class DiceCELoss(nn.Module):
    """Equal-weight blend of cross-entropy and multiclass Dice loss.

    Both terms receive the same ``(outputs, targets)`` pair and each
    contributes half of the returned value.
    """

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * cross_entropy + 0.5 * dice`` for the batch."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# Loss, optimiser and learning-rate schedule for the improved baseline.
criterion = DiceCELoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
# Cosine annealing with a period of EPOCHS scheduler steps (stepped per epoch).
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

# Training: one optimiser step per mini-batch, one scheduler step per epoch.
for epoch in range(EPOCHS):
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    scheduler.step()

# Evaluation: a single pass over test_loader with gradient tracking disabled.
model.eval()
with torch.no_grad():
    for images, targets in test_loader:
        images, targets = images.to(device), targets.to(device)
        outputs = model(images)
        # Highest-scoring index along dim 1 (presumably the class axis).
        preds = torch.argmax(outputs, dim=1)

        iou.update(preds, targets)
        pixel_acc.update(preds, targets)

# Metrics aggregated over every test batch seen above.
print(f"IoU (Jaccard Index): {iou.compute().item():.4f}")
print(f"Pixel Accuracy: {pixel_acc.compute().item():.4f}")

IoU (Jaccard Index): 0.3942
Pixel Accuracy: 0.8342


### Выводы

В ходе экспериментального исследования были последовательно проверены три гипотезы по улучшению базовых моделей семантической сегментации на датасете CamVid. Исходные показатели для сверточной модели resnet34 демонстрировали уровень IoU 0.329 и точность пикселей 0.777, в то время как трансформерная модель mit_b0 показывала несколько более скромные результаты - IoU 0.300 при точности 0.767.

Применение аугментации данных, вопреки ожиданиям, привело к ухудшению метрик обеих моделей, что может объясняться недостаточно точной настройкой параметров преобразований или особенностями самого датасета. Наиболее значимый положительный эффект был достигнут при оптимизации гиперпараметров и использовании усовершенствованного оптимизатора AdamW. Для resnet34 это позволило повысить IoU до 0.356 при точности 0.812, а mit_b0 показала еще более впечатляющий рост - IoU 0.392 при точности 0.837.

Введение комбинированной функции потерь Dice + CrossEntropy также дало положительный результат, хотя и менее выраженный. Наилучшие итоговые показатели были получены при совместном применении техник из второй и третьей гипотез. Финальные метрики улучшенной resnet34 составили IoU 0.364 и точность 0.812, а mit_b0 достигла IoU 0.394 при точности 0.834. 

Считаю важным отметить, что трансформерная архитектура в конечном итоге превзошла сверточную по основному показателю IoU, демонстрируя потенциал подобных моделей для задач сегментации.

## 4. Имплементация алгоритма машинного обучения 

### Имплементация сверточной модели

In [1]:
import torch
import torch.nn as nn

class ConvBlock(nn.Module):
    """Two consecutive 3x3 convolution -> batch-norm -> ReLU stages.

    ``padding=1`` keeps the spatial size unchanged; only the channel count
    goes from ``in_channels`` to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        """Apply both convolution stages to ``x``."""
        return self.conv(x)


class UpBlock(nn.Module):
    """Decoder stage: 2x transposed-conv upsampling, skip concat, ConvBlock.

    The concatenated tensor is fed to ``ConvBlock(in_channels, ...)``, so the
    skip connection must carry ``in_channels - out_channels`` channels.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, out_channels, 2, stride=2)
        self.conv = ConvBlock(in_channels, out_channels)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with the skip tensor ``x2`` and fuse them.

        Args:
            x1: Feature map coming from the deeper level.
            x2: Skip-connection feature map from the encoder.
        """
        x1 = self.up(x1)

        # MaxPool2d(2) floors odd sizes, so after 2x upsampling x1 can be
        # smaller than the skip tensor. Zero-pad it so torch.cat does not fail
        # on inputs whose height/width are not multiples of 16.
        diff_h = x2.shape[-2] - x1.shape[-2]
        diff_w = x2.shape[-1] - x1.shape[-1]
        if diff_h or diff_w:
            x1 = nn.functional.pad(
                x1,
                [diff_w // 2, diff_w - diff_w // 2, diff_h // 2, diff_h - diff_h // 2],
            )

        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class CustomUNet(nn.Module):
    """U-Net with four encoder levels, a bottleneck and four decoder levels.

    Encoder widths are 64, 128, 256 and 512 with 2x max-pooling between
    levels, the bottleneck has 1024 channels, and a final 1x1 convolution
    maps the 64 decoder channels to ``n_classes`` output channels.

    Args:
        n_classes: Number of output channels of the final convolution.
    """

    def __init__(self, n_classes=32):
        super().__init__()
        self.down1 = ConvBlock(3, 64)
        self.down2 = ConvBlock(64, 128)
        self.down3 = ConvBlock(128, 256)
        self.down4 = ConvBlock(256, 512)
        self.pool = nn.MaxPool2d(2)

        self.bottleneck = ConvBlock(512, 1024)

        self.up1 = UpBlock(1024, 512)
        self.up2 = UpBlock(512, 256)
        self.up3 = UpBlock(256, 128)
        self.up4 = UpBlock(128, 64)

        self.out = nn.Conv2d(64, n_classes, 1)

    def forward(self, x):
        """Return per-pixel outputs with the same height and width as ``x``."""
        # Encoder: keep every level's output for the skip connections.
        x1 = self.down1(x)
        x2 = self.pool(x1)
        x2 = self.down2(x2)
        x3 = self.pool(x2)
        x3 = self.down3(x3)
        x4 = self.pool(x3)
        x4 = self.down4(x4)
        x5 = self.pool(x4)

        x5 = self.bottleneck(x5)

        # Decoder: each stage upsamples and fuses the matching encoder output.
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)

        return self.out(x)

### Имплементация трансформерной модели

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from torch import einsum

class PatchEmbedding(nn.Module):
    """Turn an image batch into a sequence of patch tokens.

    A convolution whose kernel and stride both equal ``patch_size`` projects
    each non-overlapping patch to ``embed_dim`` channels; the resulting grid
    is flattened row by row into ``(batch, patches, embed_dim)``.
    """

    def __init__(self, patch_size=16, in_chans=3, embed_dim=384):
        super().__init__()
        self.patch_size = patch_size
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        """Project ``x`` and return the tokens as ``(b, h * w, e)``."""
        grid = self.proj(x)  # (b, e, h, w)
        # Merge the two spatial axes, then move the embedding axis last.
        return grid.flatten(2).transpose(1, 2)

class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence.

    Args:
        dim: Token embedding size; must be a multiple of ``num_heads``.
        num_heads: Number of attention heads.

    Raises:
        ValueError: If ``dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, dim, num_heads=6):
        super().__init__()
        # Fail at construction time: otherwise the floor division below would
        # silently yield a wrong scale and the head split would only break
        # later, inside forward().
        if dim % num_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        self.to_qkv = nn.Linear(dim, dim * 3)
        self.to_out = nn.Linear(dim, dim)

    def forward(self, x):
        """Attend over the tokens of ``x`` with shape ``(batch, tokens, dim)``.

        Returns:
            A tensor of the same shape as ``x``.
        """
        batch, tokens, dim = x.shape
        head_dim = dim // self.num_heads

        # One projection produces q, k and v; each is split into heads:
        # (b, n, h * d) -> (b, h, n, d).
        q, k, v = (
            t.reshape(batch, tokens, self.num_heads, head_dim).transpose(1, 2)
            for t in self.to_qkv(x).chunk(3, dim=-1)
        )

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
        attn = dots.softmax(dim=-1)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        # Merge the heads back: (b, h, n, d) -> (b, n, h * d).
        out = out.transpose(1, 2).reshape(batch, tokens, dim)
        return self.to_out(out)

class TransformerBlock(nn.Module):
    """Transformer layer with layer norm applied before each sub-layer.

    The input passes through self-attention and then a two-layer GELU MLP;
    each sub-layer sees a layer-normalised input and its output is added
    back to the running tensor.
    """

    def __init__(self, dim, num_heads=6, mlp_dim=1536):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(dim, num_heads)
        self.norm2 = nn.LayerNorm(dim)
        feed_forward = [
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, dim),
        ]
        self.mlp = nn.Sequential(*feed_forward)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(self.norm1(x))
        x = x + attended
        expanded = self.mlp(self.norm2(x))
        return x + expanded

class SimpleSegmenter(nn.Module):
    """Patch-based transformer encoder with a per-pixel linear decoder.

    The image is split into ``patch_size`` x ``patch_size`` patches, embedded
    to 384 dimensions, summed with learned positional embeddings and passed
    through four ``TransformerBlock`` layers. The token grid is upsampled
    bilinearly to the input resolution and a LayerNorm + Linear head produces
    ``n_classes`` values per pixel.

    Args:
        n_classes: Number of output channels.
        img_size: Side length that the positional embeddings are created for.
        patch_size: Side length of one square patch.
    """

    def __init__(self, n_classes=32, img_size=224, patch_size=16):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = 384
        self.num_patches = (img_size // patch_size) ** 2

        self.patch_embed = PatchEmbedding(patch_size, 3, self.embed_dim)
        self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches, self.embed_dim))

        self.blocks = nn.Sequential(
            *[TransformerBlock(self.embed_dim) for _ in range(4)]
        )

        self.decoder = nn.Sequential(
            nn.LayerNorm(self.embed_dim),
            nn.Linear(self.embed_dim, n_classes)
        )

    def _positional_embedding(self, grid_h, grid_w):
        """Return positional embeddings for a ``grid_h`` x ``grid_w`` token grid."""
        base = self.img_size // self.patch_size
        if (grid_h, grid_w) == (base, base):
            return self.pos_embed

        # The learned table covers a base x base grid only. For any other
        # input resolution, resample it bilinearly instead of failing on a
        # shape mismatch in the addition.
        pos = self.pos_embed.reshape(1, base, base, self.embed_dim).permute(0, 3, 1, 2)
        pos = F.interpolate(pos, size=(grid_h, grid_w), mode='bilinear', align_corners=False)
        return pos.permute(0, 2, 3, 1).reshape(1, grid_h * grid_w, self.embed_dim)

    def forward(self, x):
        """Return outputs of shape ``(B, n_classes, H, W)`` for input ``(B, C, H, W)``."""
        B, C, H, W = x.shape
        grid_h, grid_w = H // self.patch_size, W // self.patch_size

        x = self.patch_embed(x)
        x = x + self._positional_embedding(grid_h, grid_w)

        x = self.blocks(x)

        # Tokens back to a 2-D feature map, then up to the input resolution.
        x = rearrange(x, 'b (h w) c -> b c h w', h=grid_h, w=grid_w)
        x = F.interpolate(x, size=(H, W), mode='bilinear', align_corners=False)

        # The decoder works on the last axis, so move channels there and back.
        x = x.permute(0, 2, 3, 1)
        x = self.decoder(x)
        return x.permute(0, 3, 1, 2)

### Обучение моделей на выбранных датасетах и вывод метрик

#### Сверточная модель

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchmetrics import JaccardIndex, Accuracy
from torchvision import transforms
from PIL import Image
import os
import numpy as np

# Run configuration for this experiment cell.
device = torch.device("cpu")  # model, metrics and batches are moved to this device
BATCH_SIZE = 4  # batch size for both DataLoaders
EPOCHS = 3  # number of training passes over train_loader
LEARNING_RATE = 0.001  # learning rate given to the optimiser
IMAGE_SIZE = (128, 128)  # resize target for images and masks

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Hand-written U-Net trained from scratch with plain cross-entropy and Adam.
model = CustomUNet(n_classes=32).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

for epoch in range(EPOCHS):
    # Training pass: one optimiser step per mini-batch.
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Evaluation pass over test_loader after every epoch, gradients disabled.
    model.eval()
    with torch.no_grad():
        for images, targets in test_loader:
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            # Highest-scoring index along dim 1 (presumably the class axis).
            preds = torch.argmax(outputs, dim=1)
            iou.update(preds, targets)
            pixel_acc.update(preds, targets)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"IoU: {iou.compute().item():.4f} | Pixel Acc: {pixel_acc.compute().item():.4f}")
    # Clear accumulated state so the next epoch's numbers cover that epoch only.
    iou.reset()
    pixel_acc.reset()

Epoch 1/3
IoU: 0.1848 | Pixel Acc: 0.6246
Epoch 2/3
IoU: 0.2243 | Pixel Acc: 0.6359
Epoch 3/3
IoU: 0.2291 | Pixel Acc: 0.6649


#### Трансформерная модель

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchmetrics import JaccardIndex, Accuracy
from torchvision import transforms
from PIL import Image
import os
import numpy as np

# Run configuration for this experiment cell.
device = torch.device("cpu")  # model, metrics and batches are moved to this device
BATCH_SIZE = 2  # batch size for both DataLoaders
EPOCHS = 3  # number of training passes over train_loader
LEARNING_RATE = 0.001  # learning rate given to the optimiser
IMAGE_SIZE = (224, 224)  # resize target for images and masks

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Hand-written transformer segmenter trained from scratch with cross-entropy
# and AdamW.
model = SimpleSegmenter(n_classes=32).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

for epoch in range(EPOCHS):
    # Training pass: one optimiser step per mini-batch.
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Evaluation pass over test_loader after every epoch, gradients disabled.
    model.eval()
    with torch.no_grad():
        for images, targets in test_loader:
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            # Highest-scoring index along dim 1 (presumably the class axis).
            preds = torch.argmax(outputs, dim=1)
            iou.update(preds, targets)
            pixel_acc.update(preds, targets)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"IoU: {iou.compute().item():.4f} | Pixel Acc: {pixel_acc.compute().item():.4f}")
    # Clear accumulated state so the next epoch's numbers cover that epoch only.
    iou.reset()
    pixel_acc.reset()

Epoch 1/3
IoU: 0.1460 | Pixel Acc: 0.5720
Epoch 2/3
IoU: 0.1558 | Pixel Acc: 0.5685
Epoch 3/3
IoU: 0.1713 | Pixel Acc: 0.6463


### Сравнение результатов с п.2. Выводы

При сравнении результатов самостоятельно реализованных моделей с базовыми версиями из библиотеки segmentation_models.pytorch наблюдаются заметные различия в качестве сегментации. Сверточная модель на основе кастомной U-Net архитектуры показала значение IoU 0.229, что примерно на 30% ниже аналогичного показателя у стандартной ResNet34 (0.329), при этом точность пиксельной классификации также оказалась существенно ниже - 0.665 против 0.777. Еще более значительный разрыв наблюдается у трансформерной модели: самостоятельно реализованный сегментер достиг IoU всего 0.171, что почти в два раза хуже показателя MIT-B0 из библиотеки (0.300), с аналогичным отставанием по pixel accuracy (0.646 против 0.767).

Такое существенное расхождение в метриках объясняется несколькими ключевыми факторами. Во-первых, библиотечные модели используют предобученные на ImageNet энкодеры, что дает им значительное преимущество в качестве извлечения признаков. Во-вторых, в реализациях segmentation_models.pytorch применяются дополнительные оптимизации и тонкая настройка архитектуры, наработанные за годы исследований. В-третьих, самостоятельно созданные модели имеют упрощенную структуру.

Считаю важным заметить, что обе самостоятельно реализованные модели демонстрируют схожее относительное отставание от своих библиотечных аналогов, что показывает важность использования отработанных архитектурных решений и предобученных компонентов.

### Улучшение бейзлайна. Добавление техник из пункта 3c для каждой из моделей

#### Сверточная модель

Используем улучшения из 2 и 3 гипотезы. Не используем аугментацию (1 гипотеза) - так как она не дала улучшений

In [10]:
import torch
import torch.nn as nn
import segmentation_models_pytorch as smp
from torch.utils.data import DataLoader, Dataset
from torchmetrics import JaccardIndex, Accuracy
from torchvision import transforms
from PIL import Image
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import os
import numpy as np

# Run configuration for this experiment cell.
device = torch.device("cpu")  # model, metrics and batches are moved to this device
BATCH_SIZE = 8  # batch size for both DataLoaders
EPOCHS = 5  # training passes; also used as T_max of the cosine schedule
LEARNING_RATE = 0.0005  # initial learning rate given to the optimiser
IMAGE_SIZE = (128, 128)  # resize target for images and masks

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Hand-written U-Net (CustomUNet must already be defined in the session).
model = CustomUNet(n_classes=32).to(device)

class DiceCELoss(nn.Module):
    """Equal-weight blend of cross-entropy and multiclass Dice loss.

    Both terms receive the same ``(outputs, targets)`` pair and each
    contributes half of the returned value.
    """

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * cross_entropy + 0.5 * dice`` for the batch."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# Loss, optimiser and learning-rate schedule for the improved custom model.
criterion = DiceCELoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
# Cosine annealing with a period of EPOCHS scheduler steps (stepped per epoch).
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

for epoch in range(EPOCHS):
    # Training pass: one optimiser step per mini-batch, then one scheduler step.
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    scheduler.step()

    # Evaluation pass over test_loader after every epoch, gradients disabled.
    model.eval()
    with torch.no_grad():
        for images, targets in test_loader:
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            # Highest-scoring index along dim 1 (presumably the class axis).
            preds = torch.argmax(outputs, dim=1)
            iou.update(preds, targets)
            pixel_acc.update(preds, targets)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"IoU: {iou.compute().item():.4f} | Pixel Acc: {pixel_acc.compute().item():.4f}")
    # Clear accumulated state so the next epoch's numbers cover that epoch only.
    iou.reset()
    pixel_acc.reset()

Epoch 1/5
IoU: 0.1393 | Pixel Acc: 0.6040
Epoch 2/5
IoU: 0.2434 | Pixel Acc: 0.6776
Epoch 3/5
IoU: 0.2425 | Pixel Acc: 0.6892
Epoch 4/5
IoU: 0.3018 | Pixel Acc: 0.7618
Epoch 5/5
IoU: 0.3093 | Pixel Acc: 0.7620


#### Трансформерная модель

Используем улучшения из 2 и 3 гипотезы. Не используем аугментацию (1 гипотеза) - так как она не дала улучшений

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchmetrics import JaccardIndex, Accuracy
from torchvision import transforms
from PIL import Image
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
import segmentation_models_pytorch as smp
import os
import numpy as np

# Run configuration for this experiment cell.
device = torch.device("cpu")  # model, metrics and batches are moved to this device
BATCH_SIZE = 4  # batch size for both DataLoaders
EPOCHS = 5  # training passes; also used as T_max of the cosine schedule
LEARNING_RATE = 0.0005  # initial learning rate given to the optimiser
IMAGE_SIZE = (224, 224)  # resize target for images and masks

class CamVidDataset(Dataset):
    """Paired CamVid images and annotation masks resized to one resolution.

    The ``.png`` files of ``image_dir`` and ``label_dir`` are sorted by name
    and paired by position.

    Args:
        image_dir: Directory with the input images.
        label_dir: Directory with the annotation masks.
        image_size: Target size handed to ``transforms.Resize`` for both the
            image and its mask.

    Raises:
        ValueError: If the two directories contain a different number of
            ``.png`` files, because positional pairing would then be wrong.
    """

    def __init__(self, image_dir, label_dir, image_size=(224, 224)):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.image_size = image_size

        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
        self.label_files = sorted(f for f in os.listdir(label_dir) if f.endswith('.png'))

        # Samples are matched by index, so unequal listings could pair images
        # with the wrong masks or fail late with an IndexError.
        if len(self.image_files) != len(self.label_files):
            raise ValueError(
                f"Found {len(self.image_files)} images in {image_dir!r} but "
                f"{len(self.label_files)} masks in {label_dir!r}"
            )

        self.transform = transforms.Compose([
            transforms.Resize(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        # Built once here rather than on every __getitem__ call. Nearest
        # neighbour copies existing mask values instead of blending them.
        self.label_transform = transforms.Resize(
            image_size, interpolation=transforms.InterpolationMode.NEAREST
        )

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx``.

        Returns:
            ``(img, label)``: the transformed RGB image tensor and the resized
            mask converted to an int64 tensor.
        """
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        label_path = os.path.join(self.label_dir, self.label_files[idx])

        img = Image.open(img_path).convert('RGB')
        label = Image.open(label_path)

        img = self.transform(img)
        label = self.label_transform(label)
        label = torch.from_numpy(np.array(label)).long()

        return img, label

# Train and test splits read from the local CamVid directory tree.
train_data = CamVidDataset(
    image_dir="./CamVid/data/train",
    label_dir="./CamVid/data/trainannot",
    image_size=IMAGE_SIZE
)
test_data = CamVidDataset(
    image_dir="./CamVid/data/test",
    label_dir="./CamVid/data/testannot",
    image_size=IMAGE_SIZE
)

# Only the training loader shuffles; the test loader keeps the sorted order.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Hand-written transformer segmenter (SimpleSegmenter must already be defined
# in the session).
model = SimpleSegmenter(n_classes=32).to(device)

class DiceCELoss(nn.Module):
    """Equal-weight blend of cross-entropy and multiclass Dice loss.

    Both terms receive the same ``(outputs, targets)`` pair and each
    contributes half of the returned value.
    """

    def __init__(self):
        super().__init__()
        self.ce = nn.CrossEntropyLoss()
        self.dice = smp.losses.DiceLoss(mode='multiclass')

    def forward(self, outputs, targets):
        """Return ``0.5 * cross_entropy + 0.5 * dice`` for the batch."""
        ce_term = self.ce(outputs, targets)
        dice_term = self.dice(outputs, targets)
        return 0.5 * ce_term + 0.5 * dice_term

# Loss, optimiser and learning-rate schedule for the improved custom model.
criterion = DiceCELoss()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
# Cosine annealing with a period of EPOCHS scheduler steps (stepped per epoch).
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

# Metric accumulators; state is collected with update() and read with compute().
# NOTE(review): num_classes=32 is hard-coded - confirm it matches the label ids
# actually present in the annotation masks.
iou = JaccardIndex(task="multiclass", num_classes=32).to(device)
pixel_acc = Accuracy(task="multiclass", num_classes=32).to(device)

for epoch in range(EPOCHS):
    # Training pass: one optimiser step per mini-batch, then one scheduler step.
    model.train()
    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    scheduler.step()

    # Evaluation pass over test_loader after every epoch, gradients disabled.
    model.eval()
    with torch.no_grad():
        for images, targets in test_loader:
            images, targets = images.to(device), targets.to(device)
            outputs = model(images)
            # Highest-scoring index along dim 1 (presumably the class axis).
            preds = torch.argmax(outputs, dim=1)
            iou.update(preds, targets)
            pixel_acc.update(preds, targets)

    print(f"Epoch {epoch+1}/{EPOCHS}")
    print(f"IoU: {iou.compute().item():.4f} | Pixel Acc: {pixel_acc.compute().item():.4f}")
    # Clear accumulated state so the next epoch's numbers cover that epoch only.
    iou.reset()
    pixel_acc.reset()

Epoch 1/5
IoU: 0.1966 | Pixel Acc: 0.6321
Epoch 2/5
IoU: 0.2016 | Pixel Acc: 0.6304
Epoch 3/5
IoU: 0.2585 | Pixel Acc: 0.6794
Epoch 4/5
IoU: 0.2606 | Pixel Acc: 0.6947
Epoch 5/5
IoU: 0.2858 | Pixel Acc: 0.7148


### Выводы

После внесения оптимизаций в самостоятельно реализованные модели удалось добиться значительного улучшения их показателей, хотя они по-прежнему несколько уступают решениям из библиотеки segmentation_models.pytorch. Улучшенная версия кастомной сверточной модели достигла IoU 0.309, что примерно на 15% ниже аналогичного показателя оптимизированной resnet34 (0.364), при этом разрыв в точности пиксельной классификации сократился до 5% (0.762 против 0.812). Еще более показательные изменения произошли с трансформерной архитектурой: после применения тех же методов оптимизации ее IoU вырос до 0.286, что хотя и меньше показателя улучшенного mit_b0 (0.394), но демонстрирует принципиальную возможность эффективного обучения самописных трансформерных моделей для задач сегментации.

Сравнительный анализ показывает, что даже после всех улучшений сохраняется устойчивый разрыв по IoU около 15–27% между библиотечными и самописными реализациями (примерно 15% для сверточной модели и 27% для трансформерной), что особенно заметно на трансформерной архитектуре. Это объясняется несколькими фундаментальными факторами: во-первых, в библиотечных моделях используются более сложные и тонко настроенные механизмы внимания и нормализации; во-вторых, их энкодеры предварительно обучались на огромных датасетах; в-третьих, они содержат дополнительные архитектурные оптимизации, которые сложно полностью воспроизвести в кастомных реализациях.

При этом важно отметить, что относительный прирост IoU после оптимизации у самописных моделей оказался не меньше, чем у библиотечных: примерно +35% у сверточной модели (0.229 → 0.309) и +67% у трансформерной (0.171 → 0.286) против +11% у resnet34 (0.329 → 0.364) и +31% у mit_b0 (0.300 → 0.394), что подтверждает правильность выбранных направлений улучшения. Особенно показательно, что после всех доработок кастомная сверточная модель практически догнала по точности пиксельной классификации неоптимизированную версию resnet34 из библиотеки (0.762 против 0.777), что свидетельствует о перспективности дальнейшей работы по улучшению самостоятельно созданных архитектур.