#  Imports and Hyperparameters

In [1]:
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from shutil import copyfile
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

#customize iPython writefile so we can write variables
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
    with open(line, 'w') as f:
        f.write(cell.format(**globals()))
import torch
print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

Setup complete. Using torch 1.9.0+cu102 (TITAN Xp)


In [2]:
TRAIN_PATH = 'input/siim-covid19-resized-to-512px-png/train/'
IMG_SIZE = 640
BATCH_SIZE = 48
EPOCHS = 35
VAL_SPLIT=.15
HOME= '~/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection'
CLASS_NUMBER=1
RANDOM_SEED=42

IMAGES_TRAIN_DIR='tmp/covid/images/train'
IMAGES_VAL_DIR='tmp/covid/images/valid'
LABELS_TRAIN_DIR='tmp/covid/labels/train'
LABELS_VAL_DIR='tmp/covid/labels/valid'

models={"yolov5s.pt":{"BS":48,"EPOCHS":EPOCHS}, 
        "yolov5m.pt":{"BS":32,"EPOCHS":EPOCHS}, 
        "yolov5l.pt":{"BS":16,"EPOCHS":EPOCHS}}

# ☀️ Setup Yolov5 and wandb

According to the official [Train Custom Data](https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data) guide, YOLOv5 requires a certain directory structure. 

```
/parent_folder
    /dataset
         /images
         /labels
    /yolov5
```

* Create a `/tmp` directory. <br>
* Download YOLOv5 repository and pip install the required dependencies. <br>
* Install the latest version of W&B and login with your wandb account. You can create your free W&B account [here](https://wandb.ai/site).

In [3]:
#make a directory for yolov5
!mkdir tmp
%cd tmp

# Download YOLOv5 if not already there
if os.path.isdir('yolov5'):
    print("Yolo probably installed already")
else:
    !git clone https://github.com/ultralytics/yolov5  # clone repo
    %cd yolov5

    # Install dependenciesimages
    %pip install -qr requirements.txt  # install dependencies
    %cd ..
%cd ../

mkdir: cannot create directory ‘tmp’: File exists
/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/tmp
Yolo probably installed already
/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection


In [4]:
# Install W&B 
!pip install -q --upgrade wandb

# Login 
key = os.getenv('WANDB_API_KEY')

# print(key)
import wandb
wandb.login(key=key)



Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)


True

# 🔨 Prepare Dataset

This is the most important section when it comes to training an object detector with YOLOv5. The directory structure, bounding box format, etc must be in the correct order. This section builds every piece needed to train a YOLOv5 model.

I am using [xhlulu's](https://www.kaggle.com/xhlulu) resized dataset. The uploaded 256x256 Kaggle dataset is [here](https://www.kaggle.com/xhlulu/siim-covid19-resized-to-256px-jpg). Find other image resolutions [here](https://www.kaggle.com/c/siim-covid19-detection/discussion/239918).

* Create train-validation split. <br>
* Create required `/dataset` folder structure and more the images to that folder. <br>
* Create `data.yaml` file needed to train the model. <br>
* Create bounding box coordinates in the required YOLO format. 

In [5]:
# Load image level csv file
df = pd.read_csv('input/siim-covid19-detection/train_image_level.csv')

# Modify values in the id column
df['id'] = df.apply(lambda row: row.id.split('_')[0], axis=1)
# Add absolute path
df['path'] = df.apply(lambda row: TRAIN_PATH+row.id+'.jpg', axis=1)
# Get image level labels
df['image_level'] = df.apply(lambda row: row.label.split(' ')[0], axis=1)

df.head(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,input/siim-covid19-resized-to-512px-png/train/...,opacity
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,input/siim-covid19-resized-to-512px-png/train/...,none
2,0012ff7358bc,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,9d514ce429a7,input/siim-covid19-resized-to-512px-png/train/...,opacity
3,001398f4ff4f,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,28dddc8559b2,input/siim-covid19-resized-to-512px-png/train/...,opacity
4,001bd15d1891,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,dfd9fdd85a3e,input/siim-covid19-resized-to-512px-png/train/...,opacity


In [6]:
df['image_level'].value_counts()

opacity    4294
none       2040
Name: image_level, dtype: int64

In [7]:
# Load meta.csv file
# Original dimensions are required to scale the bounding box coordinates appropriately.
meta_df = pd.read_csv('input/siim-covid19-resized-to-512px-png/meta.csv')
train_meta_df = meta_df.loc[meta_df.split == 'train']
train_meta_df = train_meta_df.drop('split', axis=1)
train_meta_df.columns = ['id', 'dim0', 'dim1']

train_meta_df.head(2)

Unnamed: 0,id,dim0,dim1
0,d8ba599611e5,2336,2836
1,29b23a11d1e4,3488,4256


In [8]:
# Merge both the dataframes
df = df.merge(train_meta_df, on='id',how="left")
df.head(2)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1
0,000a312787f2,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,5776db0cec75,input/siim-covid19-resized-to-512px-png/train/...,opacity,3488,4256
1,000c3a3f293f,,none 1 0 0 1 1,ff0879eb20ed,input/siim-covid19-resized-to-512px-png/train/...,none,2320,2832


## 🍘 Train-validation split

In [9]:
# Create train and validation split.
train_df, valid_df = train_test_split(df, test_size=VAL_SPLIT, random_state=RANDOM_SEED, stratify=df.image_level.values)

train_df.loc[:, 'split'] = 'train'
valid_df.loc[:, 'split'] = 'valid'

df = pd.concat([train_df, valid_df]).reset_index(drop=True)
print(f'Size of dataset: {len(df)}, training images: {len(train_df)}. validation images: {len(valid_df)}')

Size of dataset: 6334, training images: 5383. validation images: 951


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


## 🍚 Prepare Required Folder Structure

The required folder structure for the dataset directory is: 

```
/parent_folder
    /dataset
         /images
             /train
             /val
         /labels
             /train
             /val
    /yolov5
```

Note that I have named the directory `covid`.

In [10]:
os.makedirs(IMAGES_TRAIN_DIR, exist_ok=True)
os.makedirs(IMAGES_VAL_DIR, exist_ok=True)
os.makedirs(LABELS_TRAIN_DIR, exist_ok=True)
os.makedirs(LABELS_VAL_DIR, exist_ok=True)

# Move the images to relevant split folder.
for i in tqdm(range(len(df))):
    row = df.loc[i]
    if row.split == 'train':
        copyfile(row.path, f'{IMAGES_TRAIN_DIR}/{row.id}.png')
    else:
        copyfile(row.path, f'{IMAGES_VAL_DIR}/{row.id}.png')

100%|██████████| 6334/6334 [00:01<00:00, 5652.16it/s]


## 🍜 Create `.YAML` file

The `data.yaml`, is the dataset configuration file that defines 

1. an "optional" download command/URL for auto-downloading, 
2. a path to a directory of training images (or path to a *.txt file with a list of training images), 
3. a path to a directory of validation images (or path to a *.txt file with a list of validation images), 
4. the number of classes, 
5. a list of class names.

> 📍 Important: In this competition, each image can either belong to `opacity` or `none` image-level labels. That's why I have  used the number of classes, `nc` to be 2. YOLOv5 automatically handles the images without any bounding box coordinates. 

> 📍 Note: The `data.yaml` is created in the `yolov5/data` directory as required. 

In [11]:
# Create .yaml file 
%cd {HOME}
import yaml

data_yaml = dict(
    train = '../covid/images/train',
    val = '../covid/images/valid',
    nc = 1,
    names = ['opacity']
)

# Note that I am creating the file in the yolov5/data/ directory.
with open('tmp/yolov5/data/data.yaml', 'w') as outfile:
    yaml.dump(data_yaml, outfile, default_flow_style=True)
    
%cat tmp/yolov5/data/data.yaml

/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection
{names: [opacity], nc: 1, train: ../covid/images/train, val: ../covid/images/valid}


## 🍮 Prepare Bounding Box Coordinated for YOLOv5

For every image with **bounding box(es)** a `.txt` file with the same name as the image will be created in the format shown below:

* One row per object. <br>
* Each row is class `x_center y_center width height format`. <br>
* Box coordinates must be in normalized xywh format (from 0 - 1). We can normalize by the boxes in pixels by dividing `x_center` and `width` by image width, and `y_center` and `height` by image height. <br>
* Class numbers are zero-indexed (start from 0). <br>

> 📍 Note: We don't have to remove the images without bounding boxes from the training or validation sets. 

In [12]:
# Get the raw bounding box by parsing the row value of the label column.
# Ref: https://www.kaggle.com/yujiariyasu/plot-3positive-classes
def get_bbox(row):
    bboxes = []
    bbox = []
    for i, l in enumerate(row.label.split(' ')):
        if (i % 6 == 0) | (i % 6 == 1):
            continue
        bbox.append(float(l))
        if i % 6 == 5:
            bboxes.append(bbox)
            bbox = []  
            
    return bboxes

# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes):
    # Get scaling factor
    scale_x = IMG_SIZE/row.dim1
    scale_y = IMG_SIZE/row.dim0
    
    scaled_bboxes = []
    for bbox in bboxes:
        x = int(np.round(bbox[0]*scale_x, 4))
        y = int(np.round(bbox[1]*scale_y, 4))
        x1 = int(np.round(bbox[2]*(scale_x), 4))
        y1= int(np.round(bbox[3]*scale_y, 4))

        scaled_bboxes.append([x, y, x1, y1]) # xmin, ymin, xmax, ymax
        
    return scaled_bboxes

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    yolo_boxes = []
    for bbox in bboxes:
        w = bbox[2] - bbox[0] # xmax - xmin
        h = bbox[3] - bbox[1] # ymax - ymin
        xc = bbox[0] + int(np.round(w/2)) # xmin + width/2
        yc = bbox[1] + int(np.round(h/2)) # ymin + height/2
        
        yolo_boxes.append([xc/img_w, yc/img_h, w/img_w, h/img_h]) # x_center y_center width height
    
    return yolo_boxes

In [13]:
df

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1,split
0,cc54237ef4db,"[{'x': 1657.98161, 'y': 616.51372, 'width': 65...",opacity 1 1657.98161 616.51372 2309.72477 995....,047b450939fd,input/siim-covid19-resized-to-512px-png/train/...,opacity,2400,2880,train
1,c957ee30525d,"[{'x': 670.92926, 'y': 1034.5932, 'width': 679...",opacity 1 670.92926 1034.5932 1350.47753999999...,b94e0c41bcee,input/siim-covid19-resized-to-512px-png/train/...,opacity,2806,3056,train
2,5df652c28a0a,,none 1 0 0 1 1,1e88ad314aa9,input/siim-covid19-resized-to-512px-png/train/...,none,3056,2544,train
3,8fa2ed829e2b,"[{'x': 1711.57423, 'y': 852.64, 'width': 778.6...",opacity 1 1711.57423 852.64 2490.2409 1748.106...,671caa8e6113,input/siim-covid19-resized-to-512px-png/train/...,opacity,2336,2836,train
4,c0159b677bda,,none 1 0 0 1 1,366fc85a2ac8,input/siim-covid19-resized-to-512px-png/train/...,none,3488,4256,train
...,...,...,...,...,...,...,...,...,...
6329,d8ef6a6d4981,"[{'x': 2592.0002, 'y': 569.60007, 'width': 154...",opacity 1 2592.0002 569.60007 4140.8 2310.3999...,f40f145b6d3a,input/siim-covid19-resized-to-512px-png/train/...,opacity,3488,4256,valid
6330,8fa48096e899,"[{'x': 490.0611, 'y': 1009.83553, 'width': 321...",opacity 1 490.0611 1009.83553 811.71244 2179.2...,03461d2a218f,input/siim-covid19-resized-to-512px-png/train/...,opacity,3032,3032,valid
6331,c5989c8376dd,,none 1 0 0 1 1,d53147ecb090,input/siim-covid19-resized-to-512px-png/train/...,none,3032,3032,valid
6332,c77674444141,"[{'x': 1039.43829, 'y': 936.51318, 'width': 89...",opacity 1 1039.43829 936.51318 1930.26068 2438...,3fd1aead516b,input/siim-covid19-resized-to-512px-png/train/...,opacity,3480,4248,valid


In [14]:
# Prepare the txt files for bounding boxs
# for i in tqdm(range(10)):
for i in tqdm(range(len(df))):
    row = df.loc[i]
    # Get image id
    img_id = row.id
    # Get split
    split = row.split
    # Get image-level label
    label = row.image_level
    
    if row.split=='train':
        file_name = f'tmp/covid/labels/train/{row.id}.txt'
    else:
        file_name = f'tmp/covid/labels/valid/{row.id}.txt'
        
    
    if label=='opacity':
        # Get bboxes
        bboxes = get_bbox(row)
        # Scale bounding boxes
        scale_bboxes = scale_bbox(row, bboxes)
        # Format for YOLOv5
        yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)
        
        with open(file_name, 'w') as f:
            for bbox in yolo_bboxes:
                bbox = [CLASS_NUMBER]+bbox
                bbox = [str(i) for i in bbox]
                bbox = ' '.join(bbox)
#                 print(bbox)
                f.write(bbox)
                f.write('\n')

100%|██████████| 6334/6334 [00:01<00:00, 4671.66it/s]


# 🚅 Train with W&B



```
--img {IMG_SIZE} \ # Input image size.
--batch {BATCH_SIZE} \ # Batch size
--epochs {EPOCHS} \ # Number of epochs
--data data.yaml \ # Configuration file
--weights yolov5l.pt \ # Model name
--save_period 1\ # Save model after interval
--project kaggle-siim-covid # W&B project name
```

In [None]:
%cd tmp/yolov5/
for key, val in models.items():
    !python train.py --img {IMG_SIZE} \
                     --batch {val["BS"]} \
                     --epochs {val["EPOCHS"]} \
                     --data data.yaml \
                     --weights {key} \
                     --save_period 2\
                     --project kaggle-siim-covid\
                     --single-cls
%cd tmp/yolov5/

/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/tmp/yolov5
[34m[1mtrain: [0mweights=yolov5s.pt, cfg=, data=data.yaml, hyp=data/hyps/hyp.scratch.yaml, epochs=35, batch_size=48, img_size=[640], rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, evolve=None, bucket=, cache_images=False, image_weights=False, device=, multi_scale=False, single_cls=True, adam=False, sync_bn=False, workers=8, project=kaggle-siim-covid, entity=None, name=exp, exist_ok=False, quad=False, linear_lr=False, label_smoothing=0.0, upload_dataset=False, bbox_interval=-1, save_period=2, artifact_alias=latest, local_rank=-1
[34m[1mgithub: [0mCommand 'git fetch && git config --get remote.origin.url' timed out after 5 seconds, for updates see https://github.com/ultralytics/yolov5
YOLOv5 🚀 v5.0-294-gdd62e2d torch 1.9.0+cu102 CUDA:0 (TITAN Xp, 12194.0625MB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.2, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup

# Submission

In this section, I will show how you can use YOLOv5 as object detector and prepare `submission.csv` file.

In [None]:
# The submisison requires xmin, ymin, xmax, ymax format. 
# YOLOv5 returns x_center, y_center, width, height
def correct_bbox_format(bboxes):
    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[0]*IMG_SIZE)), int(np.round(b[1]*IMG_SIZE))
        w, h = int(np.round(b[2]*IMG_SIZE)), int(np.round(b[3]*IMG_SIZE))

        xmin = xc - int(np.round(w/2))
        xmax = xc + int(np.round(w/2))
        ymin = yc - int(np.round(h/2))
        ymax = yc + int(np.round(h/2))
        
        correct_bboxes.append([xmin, xmax, ymin, ymax])
        
    return correct_bboxes

# Read the txt file generated by YOLOv5 during inference and extract 
# confidence and bounding box coordinates.
def get_conf_bboxes(file_path):
    confidence = []
    bboxes = []
    with open(file_path, 'r') as file:
        for line in file:
            preds = line.strip('\n').split(' ')
            preds = list(map(float, preds))
            confidence.append(preds[-1])
            bboxes.append(preds[1:-1])
    return confidence, bboxes

In [None]:
# Read the submisison file
# sub_df = pd.read_csv('../input/siim-covid19-detection/sample_submission.csv')
# sub_df.tail()
%pwd

In [None]:
# Prediction loop for submission
predictions = []

for i in tqdm(range(len(sub_df))):
    row = sub_df.loc[i]
    id_name = row.id.split('_')[0]
    id_level = row.id.split('_')[-1]
    
    if id_level == 'study':
        # do study-level classification
        predictions.append("Negative 1 0 0 1 1") # dummy prediction
        
    elif id_level == 'image':
        # we can do image-level classification here.
        # also we can rely on the object detector's classification head.
        # for this example submisison we will use YOLO's classification head. 
        # since we already ran the inference we know which test images belong to opacity.
        if f'{id_name}.txt' in prediction_files:
            # opacity label
            confidence, bboxes = get_conf_bboxes(f'{PRED_PATH}/{id_name}.txt')
            bboxes = correct_bbox_format(bboxes)
            pred_string = ''
            for j, conf in enumerate(confidence):
                pred_string += f'opacity {conf} ' + ' '.join(map(str, bboxes[j])) + ' '
            predictions.append(pred_string[:-1]) 
        else:
            predictions.append("None 1 0 0 1 1")

In [None]:
sub_df['PredictionString'] = predictions
sub_df.to_csv('submission.csv', index=False)
sub_df.tail()