### 環境構築とデータセットの準備

In [1]:
!pip install -q pytorch_lightning

[K     |████████████████████████████████| 585 kB 6.8 MB/s 
[K     |████████████████████████████████| 596 kB 49.3 MB/s 
[K     |████████████████████████████████| 140 kB 56.6 MB/s 
[K     |████████████████████████████████| 419 kB 66.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 49.6 MB/s 
[K     |████████████████████████████████| 144 kB 66.1 MB/s 
[K     |████████████████████████████████| 271 kB 73.0 MB/s 
[K     |████████████████████████████████| 94 kB 3.7 MB/s 
[?25h

In [2]:
!if [ ! -d BCCD_Dataset ]; then git clone https://github.com/Shenggan/BCCD_Dataset.git; fi

Cloning into 'BCCD_Dataset'...
remote: Enumerating objects: 800, done.[K
remote: Total 800 (delta 0), reused 0 (delta 0), pack-reused 800[K
Receiving objects: 100% (800/800), 7.39 MiB | 14.60 MiB/s, done.
Resolving deltas: 100% (378/378), done.


In [3]:
import torch
import torchvision
import pytorch_lightning as pl

In [4]:
pl.seed_everything(0)

Global seed set to 0


0

In [5]:
from PIL import Image
from torchvision import transforms
from xml.etree.ElementTree import parse

In [6]:
class BCCDDataset(torch.utils.data.Dataset):

    def __init__(self, root, mode='train'):
        self.root = root
        self.bccd_labels = ['BG', 'RBC', 'WBC', 'Platelets']
        self.transform = transforms.Compose([ transforms.ToTensor() ])
        with open(f'{root}/ImageSets/Main/{mode}.txt', 'r') as f:
            self.data_list = f.read().split('\n')[:-1]


    def __getitem__(self, idx):
        data = self.data_list[idx]

        # 入力値
        image_path = f'{self.root}/JPEGImages/{data}.jpg'
        image = Image.open(image_path)
        image = self.transform(image)

        # 目標値
        anno_path = f'{self.root}/Annotations/{data}.xml'
        parser = parse(anno_path)
        labels, boxes = [], []
        for obj in parser.findall('object'):
            box = [int(tag.text) for tag in obj.find('bndbox')]
            label = obj.find('name').text
            label = self.bccd_labels.index(label)
            boxes.append(box)
            labels.append(label)
        boxes = torch.tensor(boxes, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels}

        return image, target


    def __len__(self):
        return len(self.data_list)

In [7]:
# データセットの取得
root = 'BCCD_Dataset/BCCD'
train = BCCDDataset(root, mode='train')
val = BCCDDataset(root, mode='val')
test = BCCDDataset(root, mode='test')

In [8]:
len(train), len(val), len(test)

(205, 87, 72)

### Faster R-CNN

In [9]:
from torchvision.models.detection import fasterrcnn_resnet50_fpn

In [10]:
# 乱数のシードを固定して再現性を確保
pl.seed_everything(0)

# Faster R-CNN（backborn は事前に訓練済み）
model = fasterrcnn_resnet50_fpn(pretrained=True)

Global seed set to 0
  f"The parameter '{pretrained_param}' is deprecated since 0.13 and will be removed in 0.15, "
Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

In [11]:
# 初期設定は out_features が 91
model.roi_heads.box_predictor

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=91, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
)

In [12]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

In [13]:
# クラス数を 4 に設定
model.roi_heads.box_predictor = FastRCNNPredictor(1024, 4)

In [14]:
model.roi_heads.box_predictor

FastRCNNPredictor(
  (cls_score): Linear(in_features=1024, out_features=4, bias=True)
  (bbox_pred): Linear(in_features=1024, out_features=16, bias=True)
)

In [15]:
# 初期設定を確認
print(model.training)

True


In [16]:
# 検証モードへ
model.eval()
print(model.training)

False


In [17]:
# 訓練データの 1 サンプル目を取得
x, t = train[0]

x.shape

torch.Size([3, 480, 640])

In [18]:
y = model(x.unsqueeze(0))
y

[{'boxes': tensor([[1.6617e+02, 7.6333e+01, 1.8187e+02, 9.4745e+01],
          [1.6943e+02, 7.7929e+01, 1.8953e+02, 9.7557e+01],
          [1.3614e+02, 4.0376e+01, 1.8645e+02, 1.3914e+02],
          [1.9248e+02, 2.6976e+02, 4.4520e+02, 4.5035e+02],
          [2.4711e+02, 3.6765e+02, 2.8137e+02, 4.1299e+02],
          [4.5601e+02, 3.1827e+02, 4.9165e+02, 3.5192e+02],
          [6.4512e+01, 1.8314e+02, 1.3398e+02, 2.4808e+02],
          [1.9737e+02, 3.0527e+02, 3.6639e+02, 4.7982e+02],
          [5.9433e+00, 1.6169e+02, 1.5701e+02, 3.0676e+02],
          [1.3039e+02, 4.0274e+01, 1.5769e+02, 9.1510e+01],
          [4.6208e+02, 4.0834e+02, 5.2133e+02, 4.7182e+02],
          [1.2270e+01, 3.9097e+02, 4.7502e+02, 4.8000e+02],
          [2.9094e+02, 4.3376e+02, 3.3725e+02, 4.7575e+02],
          [3.5776e+00, 2.2685e+01, 1.6581e+02, 2.1842e+02],
          [7.2452e+00, 3.3855e+02, 1.0378e+02, 4.7072e+02],
          [7.7182e+01, 4.1103e+02, 1.1358e+02, 4.5467e+02],
          [6.5774e+01, 3.2728e+

### 損失関数の算出

In [19]:
model.train()
losses = model(x.unsqueeze(0), [t])

In [20]:
losses

{'loss_box_reg': tensor(0.8816, grad_fn=<DivBackward0>),
 'loss_classifier': tensor(1.4724, grad_fn=<NllLossBackward0>),
 'loss_objectness': tensor(0.7087, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>),
 'loss_rpn_box_reg': tensor(0.0931, grad_fn=<DivBackward0>)}

In [21]:
# 辞書の値のみ抽出
losses.values()

dict_values([tensor(1.4724, grad_fn=<NllLossBackward0>), tensor(0.8816, grad_fn=<DivBackward0>), tensor(0.7087, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), tensor(0.0931, grad_fn=<DivBackward0>)])

In [22]:
# 全体の総和
loss = sum(losses.values())
loss

tensor(3.1559, grad_fn=<AddBackward0>)

### Faster R-CNN の構造を確認

In [23]:
# 評価モードへ
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

#### 前処理

In [24]:
model.transform

GeneralizedRCNNTransform(
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    Resize(min_size=(800,), max_size=1333, mode='bilinear')
)

In [25]:
images, targets = model.transform(x.unsqueeze(0))

images

<torchvision.models.detection.image_list.ImageList at 0x7f205052f990>

In [26]:
# __dict__ を用いて内部の構造を確認
images.__dict__

{'image_sizes': [(800, 1066)],
 'tensors': tensor([[[[0.8447, 0.8310, 0.8104,  ..., 0.0000, 0.0000, 0.0000],
           [0.8515, 0.8378, 0.8172,  ..., 0.0000, 0.0000, 0.0000],
           [0.8618, 0.8481, 0.8275,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [1.2386, 1.2523, 1.2728,  ..., 0.0000, 0.0000, 0.0000],
           [1.2899, 1.3036, 1.3242,  ..., 0.0000, 0.0000, 0.0000],
           [1.3242, 1.3379, 1.3585,  ..., 0.0000, 0.0000, 0.0000]],
 
          [[0.6954, 0.6814, 0.6603,  ..., 0.0000, 0.0000, 0.0000],
           [0.7024, 0.6884, 0.6673,  ..., 0.0000, 0.0000, 0.0000],
           [0.7129, 0.6989, 0.6778,  ..., 0.0000, 0.0000, 0.0000],
           ...,
           [1.4307, 1.4447, 1.4657,  ..., 0.0000, 0.0000, 0.0000],
           [1.4937, 1.5077, 1.5288,  ..., 0.0000, 0.0000, 0.0000],
           [1.5357, 1.5497, 1.5708,  ..., 0.0000, 0.0000, 0.0000]],
 
          [[0.4265, 0.4265, 0.4265,  ..., 0.0000, 0.0000, 0.0000],
           [0.4335, 0.4335, 0.4334,  ..., 0.0000,

In [27]:
images.image_sizes

[(800, 1066)]

In [28]:
images.tensors.shape

torch.Size([1, 3, 800, 1088])

In [29]:
targets

#### 特徴マップへの変換

In [30]:
model.backbone

BackboneWithFPN(
  (body): IntermediateLayerGetter(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): FrozenBatchNorm2d(64, eps=0.0)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): FrozenBatchNorm2d(64, eps=0.0)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): FrozenBatchNorm2d(64, eps=0.0)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): FrozenBatchNorm2d(256, eps=0.0)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): FrozenBatchNorm2d(256, eps=0.0)
        )
      )
      (1): Bottleneck(
        (conv1): C

In [31]:
features = model.backbone(images.tensors)

In [32]:
features.keys()

odict_keys(['0', '1', '2', '3', 'pool'])

In [33]:
features['0'].shape

torch.Size([1, 256, 200, 272])

In [34]:
features['1'].shape

torch.Size([1, 256, 100, 136])

In [35]:
features['2'].shape

torch.Size([1, 256, 50, 68])

In [36]:
features['3'].shape

torch.Size([1, 256, 25, 34])

In [37]:
features['3'].shape

torch.Size([1, 256, 25, 34])

In [38]:
features['pool'].shape

torch.Size([1, 256, 13, 17])

#### 候補領域の提案

In [39]:
model.rpn

RegionProposalNetwork(
  (anchor_generator): AnchorGenerator()
  (head): RPNHead(
    (conv): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU(inplace=True)
      )
    )
    (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
    (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
  )
)

In [40]:
proposals, proposal_losses = model.rpn(images, features)

In [41]:
proposals

[tensor([[  30.0523,   27.0487,  831.6184,  633.7497],
         [ 229.1682,   79.1424, 1066.0000,  800.0000],
         [ 698.7228,  487.3722,  887.7523,  637.6987],
         ...,
         [ 182.1284,  316.7171,  215.2278,  348.5263],
         [ 655.2875,  286.1694,  670.0105,  303.5753],
         [ 851.1652,  502.5859,  887.0001,  536.1241]])]

In [42]:
proposals[0].shape

torch.Size([1000, 4])

In [43]:
proposal_losses

{}

#### クラスと位置の予測

In [44]:
model.roi_heads

RoIHeads(
  (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
  (box_head): TwoMLPHead(
    (fc6): Linear(in_features=12544, out_features=1024, bias=True)
    (fc7): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (box_predictor): FastRCNNPredictor(
    (cls_score): Linear(in_features=1024, out_features=4, bias=True)
    (bbox_pred): Linear(in_features=1024, out_features=16, bias=True)
  )
)

In [45]:
detections, detection_losses = model.roi_heads(features, proposals, images.image_sizes)

In [46]:
detections

[{'boxes': tensor([[2.7677e+02, 1.2722e+02, 3.0293e+02, 1.5791e+02],
          [2.8220e+02, 1.2988e+02, 3.1569e+02, 1.6259e+02],
          [2.2676e+02, 6.7294e+01, 3.1055e+02, 2.3190e+02],
          [3.2059e+02, 4.4961e+02, 7.4154e+02, 7.5059e+02],
          [4.1159e+02, 6.1274e+02, 4.6866e+02, 6.8832e+02],
          [7.5955e+02, 5.3046e+02, 8.1891e+02, 5.8654e+02],
          [1.0745e+02, 3.0523e+02, 2.2316e+02, 4.1347e+02],
          [3.2875e+02, 5.0878e+02, 6.1028e+02, 7.9969e+02],
          [9.8994e+00, 2.6948e+02, 2.6152e+02, 5.1126e+02],
          [2.1718e+02, 6.7123e+01, 2.6265e+02, 1.5252e+02],
          [7.6965e+02, 6.8057e+02, 8.6834e+02, 7.8637e+02],
          [2.0437e+01, 6.5162e+02, 7.9120e+02, 8.0000e+02],
          [4.8460e+02, 7.2293e+02, 5.6173e+02, 7.9292e+02],
          [5.9590e+00, 3.7808e+01, 2.7618e+02, 3.6404e+02],
          [1.2068e+01, 5.6425e+02, 1.7286e+02, 7.8454e+02],
          [1.2856e+02, 6.8506e+02, 1.8917e+02, 7.5778e+02],
          [1.0955e+02, 5.4546e+

In [47]:
detection_losses

{}

In [48]:
detections = model(x.unsqueeze(0))
detections

[{'boxes': tensor([[1.6617e+02, 7.6333e+01, 1.8187e+02, 9.4745e+01],
          [1.6943e+02, 7.7929e+01, 1.8953e+02, 9.7557e+01],
          [1.3614e+02, 4.0376e+01, 1.8645e+02, 1.3914e+02],
          [1.9248e+02, 2.6976e+02, 4.4520e+02, 4.5035e+02],
          [2.4711e+02, 3.6765e+02, 2.8137e+02, 4.1299e+02],
          [4.5601e+02, 3.1827e+02, 4.9165e+02, 3.5192e+02],
          [6.4512e+01, 1.8314e+02, 1.3398e+02, 2.4808e+02],
          [1.9737e+02, 3.0527e+02, 3.6639e+02, 4.7982e+02],
          [5.9433e+00, 1.6169e+02, 1.5701e+02, 3.0676e+02],
          [1.3039e+02, 4.0274e+01, 1.5769e+02, 9.1510e+01],
          [4.6208e+02, 4.0834e+02, 5.2133e+02, 4.7182e+02],
          [1.2270e+01, 3.9097e+02, 4.7502e+02, 4.8000e+02],
          [2.9094e+02, 4.3376e+02, 3.3725e+02, 4.7575e+02],
          [3.5776e+00, 2.2685e+01, 1.6581e+02, 2.1842e+02],
          [7.2452e+00, 3.3855e+02, 1.0378e+02, 4.7072e+02],
          [7.7182e+01, 4.1103e+02, 1.1358e+02, 4.5467e+02],
          [6.5774e+01, 3.2728e+