<a href="https://colab.research.google.com/github/iwatsuki-yuuki/DL-matsuo/blob/main/DL%E5%9F%BA%E7%A4%8E%E8%AC%9B%E5%BA%A7%E7%AC%AC%EF%BC%96%E5%9B%9E%E8%AA%B2%E9%A1%8C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第6回講義 宿題

## 課題
今Lessonで学んだことに工夫を加えて，FCNでより高性能なVOC2011データセットのセグメンテーションモデルを実装しましょう．

## 目標値
mean-IoU 0.4

## ルール
- 訓練データは`x_train`，`t_train`，テストデータは`x_test`で与えられます．
- 予測結果は，(21, 224, 224)のセグメンテーションマスクとしてください．
- **下のセルで指定されている`x_train`，`t_train`以外の学習データは使わないでください．**
- 事前学習モデルの利用は，backboneでの使用のみ可とします．
    - torchvision.models.segmentation で提供されているような，ライブラリで実装されている FCN の利用は禁止とします．

## 提出方法

- 2つのファイルを提出していただきます．
    1. テストデータ (`x_test`) に対する予測マスクを`submission_pred.npy`として保存し，**Omnicampusの宿題タブから「第6回 深層学習と画像認識」を選択して**提出してください．
    2. それに対応するpythonのコードを`submission_code.py`として保存し，**Omnicampusの宿題タブから「第6回 深層学習と画像認識 (code)」を選択して**提出してください．pythonファイル自体の提出ではなく，「提出内容」の部分にコードをコピー&ペーストしてください．
      
- なお，採点は1で行い，2はコードの確認用として利用します（成績優秀者はコード内容を公開させていただくかもしれません）．コードの内容を変更した場合は，**1と2の両方を提出し直してください**．

### 評価方法

- 予測ラベルの`t_test`に対するmean-IoUで評価します．
- 即時採点しLeader Boardを更新します（採点スケジュールは別アナウンス）．
- 締切時の点数を最終的な評価とします．

In [26]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### データの読み込み（この部分は修正しないでください）

In [27]:
import random

import numpy as np
import pandas as pd
import torch
from torchvision import transforms
from tqdm import tqdm_notebook as tqdm
from PIL import Image
from sklearn.model_selection import train_test_split

# Training data: x_train holds the input images, t_train the matching targets.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only
# use it on files from a trusted source.
x_train = np.load('drive/MyDrive/Colab Notebooks/DLBasics2025_colab/Lecture06/data/x_train.npy', allow_pickle=True)
t_train = np.load('drive/MyDrive/Colab Notebooks/DLBasics2025_colab/Lecture06/data/t_train.npy', allow_pickle=True)

# Test data (inputs only; no targets are loaded for it)
x_test = np.load('drive/MyDrive/Colab Notebooks/DLBasics2025_colab/Lecture06/data/x_test.npy', allow_pickle=True)

# For implementation convenience, the images are resized inside the constructor
class train_dataset(torch.utils.data.Dataset):
    """Dataset of (image, target) pairs, resized to 224x224 at construction.

    Every sample is cast to uint8, wrapped in a PIL image and resized once in
    ``__init__``; the resized PIL images are kept in two parallel lists.
    ``transform`` and ``target_transform`` are plain attributes applied in
    ``__getitem__``, so they can be reassigned after construction.
    """
    def __init__(self, x_train, t_train):
        """Resize and store all samples.

        Args:
            x_train: Indexable collection of input images with a ``shape``
                attribute; ``shape[0]`` is taken as the number of samples.
            t_train: Targets, indexed in step with ``x_train``.
        """
        self.x_train = []
        self.t_train = []
        for i in range(x_train.shape[0]):
            # NOTE(review): Resize is called without an interpolation argument,
            # so the library default is also applied to the target mask. If that
            # default blends neighbouring pixels, it can create label values
            # that were not in the original mask -- confirm this is intended.
            self.x_train.append(transforms.Resize((224, 224))(Image.fromarray(np.uint8(x_train[i]))))
            self.t_train.append(transforms.Resize((224, 224))(Image.fromarray(np.uint8(t_train[i]))))
        # Default per-item conversions applied in __getitem__
        self.transform = transforms.ToTensor()
        self.target_transform = transforms.ToTensor()

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.x_train)

    def __getitem__(self, idx):
        """Return the transformed image and transformed target at ``idx``."""
        return self.transform(self.x_train[idx]), self.target_transform(self.t_train[idx])

class test_dataset(torch.utils.data.Dataset):
    """Dataset of input images only, resized to 224x224 at construction.

    Every sample is cast to uint8, wrapped in a PIL image and resized once in
    ``__init__``. ``transform`` is a plain attribute applied in
    ``__getitem__``, so it can be reassigned after construction.
    """
    def __init__(self, x_test):
        """Resize and store all samples.

        Args:
            x_test: Indexable collection of input images with a ``shape``
                attribute; ``shape[0]`` is taken as the number of samples.
        """
        self.x_test = []
        for i in range(x_test.shape[0]):
            self.x_test.append(transforms.Resize((224, 224))(Image.fromarray(np.uint8(x_test[i]))))
        # Default per-item conversion applied in __getitem__
        self.transform = transforms.ToTensor()

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.x_test)

    def __getitem__(self, idx):
        """Return the transformed image at ``idx``."""
        return self.transform(self.x_test[idx])

# Build the datasets once; all resizing happens here, inside the constructors
trainval_data = train_dataset(x_train, t_train)
test_data = test_dataset(x_test)

### FCN の実装

In [28]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
import torchvision

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [29]:
def fix_seed(seed=1234):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Seed value passed unchanged to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


fix_seed(seed=42)

# Hold out a fixed number of samples for validation; the rest is used for training
val_size = 100
train_data, val_data = torch.utils.data.random_split(
    dataset=trainval_data,
    lengths=[len(trainval_data) - val_size, val_size],
)

# Number of segmentation classes (label ids 0-20)
num_classes = 21

# Preprocessing definitions
def TargetToTensor(target, n_classes=None):
    """Convert an integer label map into a one-hot float tensor.

    Args:
        target: 2-D array-like of integer class ids (for example a PIL image);
            it is converted with ``np.array``.
        n_classes: Number of classes. When omitted, the module-level
            ``num_classes`` is used.

    Returns:
        Float tensor of shape ``(n_classes, H, W)`` holding a one-hot encoding
        of the label map.
    """
    if n_classes is None:
        n_classes = num_classes
    # np.array copies, so the caller's data is never modified in place
    labels = np.array(target)
    # Restrict labels to 0..n_classes-1: larger ids (e.g. object-edge pixels)
    # are folded into class 0. The bound follows the class count so it can
    # never disagree with the one-hot width below.
    labels[labels >= n_classes] = 0
    labels = torch.from_numpy(labels).type(torch.long)
    # one_hot yields (H, W, C); move the class axis to the front
    return F.one_hot(labels, num_classes=n_classes).permute(2, 0, 1).type(torch.float)

# Images are fed to an ImageNet-pretrained backbone, so they are normalised
# with the ImageNet channel mean/std that the pretrained weights expect.
image_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
# Targets are converted to one-hot class maps
target_transform = transforms.Compose([
    transforms.Lambda(lambda target: TargetToTensor(target))
])

# The datasets expose their transforms as attributes, so replacing them here
# also affects the train/validation subsets created from trainval_data.
trainval_data.transform = image_transform
trainval_data.target_transform = target_transform
test_data.transform = image_transform

batch_size=16

# DataLoader definitions: only the training loader shuffles its samples
dataloader_train = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
dataloader_valid = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=False)
dataloader_test = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [52]:
# FCN definition
class FCN(nn.Module):
    """Fully convolutional segmentation network: a backbone plus a conv head.

    The backbone produces a feature map, the head turns it into per-class
    logits, and the logits are bilinearly upsampled back to the spatial size
    of the input.

    Args:
        backbone: Module mapping an image batch to a feature map with
            ``in_channels`` channels.
        num_classes: Number of output channels (one logit map per class).
        in_channels: Channel count of the backbone's output. The default of
            2048 is the width the head was originally hard-coded to.
    """

    def __init__(self, backbone, num_classes=21, in_channels=2048):
        super().__init__()
        # backbone
        self.backbone = backbone
        # Convolutional head: two 3x3 conv + BN + ReLU stages, dropout, then a
        # 1x1 conv that produces the class logits
        self.FCNhead = nn.Sequential(
            nn.Conv2d(in_channels, 512, 3, padding=1, bias=False),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 128, 3, padding=1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Conv2d(128, num_classes, 1)
        )
        self.initialize_weights()

    def forward(self, x):
        """Return logits with the same height and width as ``x``."""
        input_shape = x.shape[-2:]  # spatial size (H, W) of the input
        x = self.backbone(x)
        x = self.FCNhead(x)
        # Upsample the coarse logits back to the input resolution
        x = F.interpolate(x, size=input_shape, mode='bilinear', align_corners=False)
        return x

    def initialize_weights(self):
        """Initialise the head only; the backbone's weights are left untouched.

        Conv layers get He (Kaiming) normal weights and zero biases;
        BatchNorm layers get weight 1 and bias 0.
        """
        for m in self.FCNhead.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
# Load an ImageNet-pretrained ResNet-50. The weights enum must be passed via
# `weights=`: the deprecated `pretrained=` keyword is treated as a boolean
# flag, so an enum member passed to it does not select which weights load.
backbone = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)
backbone = nn.Sequential(*list(backbone.children())[:-2])  # drop the global-average-pooling and FC layers

In [51]:
# Uses the mIoU implementation from the link below
# https://github.com/wkentaro/pytorch-fcn/blob/master/torchfcn/utils.py
class mIoUScore:
    """Accumulate a confusion matrix and report the mean IoU over classes."""

    def __init__(self, n_classes):
        self.n_classes = n_classes
        self.reset()

    def _fast_hist(self, label_true, label_pred, n_class):
        """Return the (n_class, n_class) confusion counts for one flat sample.

        Rows index the true class and columns the predicted class. Positions
        whose true label lies outside ``[0, n_class)`` are ignored.
        """
        valid = (label_true >= 0) & (label_true < n_class)
        # Encode each (true, pred) pair as a single index into the flat matrix
        pair_index = n_class * label_true[valid].astype(int) + label_pred[valid]
        counts = np.bincount(pair_index, minlength=n_class ** 2)
        return counts.reshape(n_class, n_class)

    def update(self, label_trues, label_preds):
        """Add every (true, predicted) label map pair to the running matrix."""
        for true_map, pred_map in zip(label_trues, label_preds):
            sample_hist = self._fast_hist(true_map.flatten(), pred_map.flatten(), self.n_classes)
            self.confusion_matrix += sample_hist

    def get_scores(self):
        """Return the mean IoU, skipping classes whose IoU is undefined (NaN)."""
        cm = self.confusion_matrix
        intersection = np.diag(cm)
        # Classes that never occur give 0/0; silence the warning and let
        # nanmean drop them
        with np.errstate(divide='ignore', invalid='ignore'):
            per_class_iou = intersection / (cm.sum(axis=1) + cm.sum(axis=0) - intersection)
        return np.nanmean(per_class_iou)

    def reset(self):
        """Clear the accumulated confusion matrix."""
        self.confusion_matrix = np.zeros((self.n_classes, self.n_classes))

In [50]:
# Build the model and move it to the selected device
model = FCN(backbone=backbone, num_classes=num_classes).to(device)

# Loss function, metric and optimizer
loss_fn = nn.BCEWithLogitsLoss()
metrics = mIoUScore(n_classes=num_classes)
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, weight_decay=1e-3)

n_epochs = 50

In [53]:
# Train the model
for epoch in range(n_epochs):
    train_losses = []
    valid_losses = []
    # The metric accumulates over one validation pass, so clear it every epoch
    metrics.reset()

    # Training pass
    model.train()
    with tqdm(total=len(dataloader_train), unit="batch") as pbar:
        pbar.set_description(f"[train] Epoch {epoch+1}/{n_epochs}")
        for image, target in dataloader_train:
            optimizer.zero_grad()
            image, target = image.to(device), target.to(device)
            output = model(image)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
            pbar.set_postfix(loss=np.mean(train_losses))
            pbar.update(1)

    # Validation pass: no gradients are needed, so autograd is disabled to
    # avoid building graphs and holding activations in memory
    model.eval()
    with torch.no_grad(), tqdm(total=len(dataloader_valid), unit="batch") as pbar:
        pbar.set_description(f"[valid] Epoch {epoch+1}/{n_epochs}")
        for image, target in dataloader_valid:
            image, target = image.to(device), target.to(device)
            output = model(image)
            loss = loss_fn(output, target)
            valid_losses.append(loss.item())
            # Compare class-id maps: argmax over the class axis of both the
            # one-hot target and the predicted logits
            metrics.update(target.argmax(1).cpu().numpy(), output.argmax(1).cpu().numpy())
            pbar.set_postfix(loss=np.mean(valid_losses), mIoU=metrics.get_scores())
            pbar.update(1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=len(dataloader_train), unit="batch") as pbar:


  0%|          | 0/114 [00:00<?, ?batch/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=len(dataloader_valid), unit="batch") as pbar:


  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

  0%|          | 0/114 [00:00<?, ?batch/s]

  0%|          | 0/7 [00:00<?, ?batch/s]

In [49]:
# Create the submission file
model.eval()
t_pred = []

# Inference only: disable autograd so no graph is built for the test batches
with torch.no_grad():
    for x in dataloader_test:
        x = x.to(device)

        # Forward pass
        pred = model(x)
        # Reduce logits to class ids per batch, so the full float logit
        # volume for the whole test set is never held in memory
        t_pred.append(pred.cpu().detach().numpy().argmax(1).astype(np.uint8))

t_pred = np.concatenate(t_pred, axis=0)
np.save('drive/MyDrive/Colab Notebooks/DLBasics2025_colab/Lecture06/submission_pred.npy', t_pred)