In [0]:
import math
import os
import shutil
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import cv2
from sklearn.model_selection import train_test_split


In [0]:
# Single seed value: applied to NumPy's global RNG here and reused as the
# random_state of the train/validation split further down.
random_stat = 123
np.random.seed(seed=random_stat)

In [0]:
# Line magic that selects the TensorFlow 2.x runtime before the import below
# (presumably Colab-specific, since this notebook also imports google.colab).
%tensorflow_version 2.x
import tensorflow
# Show which TensorFlow version was actually loaded.
print(tensorflow.__version__)

In [0]:
# Mount Google Drive under /content/gdrive/ so the project paths used below resolve.
# force_remount=True remounts even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive/',force_remount=True)

## 2. Data Migration for YOLOv3

### 2.0. Make subdirectories

In [0]:
# Root of the project on the mounted Google Drive. Every other location is derived
# from it so the absolute path is written only once (the resulting strings are
# identical to the previous hard-coded values).
project_path = "/content/gdrive/My Drive/Colab Notebooks/Capstone Project/"
data_path = project_path + "data/"
images_path = data_path + "training_images/"


img_dir = data_path + "training_images"  # .jpg
label_dir = data_path + "training_labels"  # .txt
metadata_dir = data_path + "cfg"  # .txt

cfg_dir = data_path + "cfg"
# To save the model checkpoints
backup_dir = data_path + "backup"


# The image dir was generated in the EDA, so the data directory must already exist.
# Fail with a clear message instead of a bare mkdir error when it does not
# (e.g. Drive not mounted or the EDA notebook not run yet).
if not os.path.isdir(data_path):
    raise FileNotFoundError(
        "Data directory not found: {}. Mount Google Drive and run the EDA first.".format(data_path)
    )

# Create the remaining directories; exist_ok makes the cell safe to re-run.
for directory in [label_dir, metadata_dir, cfg_dir, backup_dir]:
    os.makedirs(directory, exist_ok=True)

### 2.1. Load patients_data.csv

In [0]:
# This CSV was generated as part of the EDA; it carries the bounding-box details
# where applicable.
patients_csv_path = os.path.join(data_path, "patients_data.csv")
patient_data = pd.read_csv(patients_csv_path)
patient_data.head()

### 2.2. Generate image details for training YOLO
* YOLO needs .txt file for each image, which contains bounding boxes in the image that looks like:
```
<object-class_1> <x_1> <y_1> <width_1> <height_1>
<object-class_2> <x_2> <y_2> <width_2> <height_2>
```
* <object-class\>: The zero-based class index. The RSNA task has a single class (pneumonia) and we train only on images with pneumonia, so <object-class\> is always 0.
* <x\>, <y\>: Float coordinates of the bbox center, divided by the image width and height respectively.
* <width\>, <height\>: Width and height of the bbox, divided by the image width and height respectively.


In [0]:

def save_yolo_input_txt():
    """Write one YOLO label file per pneumonia patient into ``label_dir``.

    Reads the module-level ``patient_data`` frame, keeps the rows with
    ``Target == 1`` and writes ``<label_dir>/<patientId>.txt`` containing one
    line per bounding box in the form::

        <class_id> <center_x> <center_y> <width> <height>

    All four box values are divided by the image size (1024), and the class id
    is always 0 because there is a single class.

    Each file is rewritten from scratch ("w" mode), so running the function
    again does not append duplicate boxes to files from a previous run.
    """
    # rsna default image size
    img_size = 1024
    class_id = 0
    pneumonia_patients = patient_data[patient_data.Target == 1]
    # Group the boxes by patient so each label file is opened exactly once.
    for patient_id, boxes in pneumonia_patients.groupby('patientId'):
      label_fp = os.path.join(label_dir, "{}.txt".format(patient_id))
      with open(label_fp, "w") as f:
        for _, row in boxes.iterrows():
          # Relative width/height of the box.
          rw = row['width'] / img_size
          rh = row['height'] / img_size
          # The CSV stores the top-left corner; YOLO expects the box center.
          rcx = row['x'] / img_size + rw / 2
          rcy = row['y'] / img_size + rh / 2
          f.write("{} {} {} {} {}\n".format(class_id, rcx, rcy, rw, rh))


In [0]:
# Generate the YOLO label .txt files in label_dir for every patient with Target == 1.
save_yolo_input_txt()

### 2.3. Plot a sample train image and label

In [0]:
# Pick one patient id whose target is 1 so it can be shown with its bounding boxes.
# random_state pins the choice, so re-running this cell shows the same patient.
ex_patient_id = (
    patient_data.loc[patient_data.Target == 1, 'patientId']
    .sample(1, random_state=random_stat)
    .iloc[0]
)
print(ex_patient_id)
# Build the image/label paths from the directories defined in section 2.0
# rather than repeating the absolute Drive path.
ex_img_path = os.path.join(img_dir, ex_patient_id + ".jpg")
ex_label_path = os.path.join(label_dir, ex_patient_id + ".txt")



In [0]:
# Must match the image size used when the labels were written (1024 in
# save_yolo_input_txt); the previous value 1014 drew every box slightly off.
img_size = 1024
print(ex_img_path)
ex_img = cv2.imread(ex_img_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if ex_img is None:
    raise FileNotFoundError("Could not read image: {}".format(ex_img_path))
plt.imshow(ex_img)

with open(ex_label_path, "r") as f:
    for line in f:
        print(line)
        class_id, rcx, rcy, rw, rh = list(map(float, line.strip().split()))
        # Convert the relative center/size back to a pixel top-left corner and size.
        x = (rcx-rw/2)*img_size
        y = (rcy-rh/2)*img_size
        w = rw*img_size
        h = rh*img_size
        # Draw the box outline as a closed polyline.
        plt.plot([x, x, x+w, x+w, x], [y, y+h, y+h, y, y])

In [0]:
#For the following command to be run, you must have cloned darknet from git repo and run a make. 

darknet_dir = '/content/gdrive/My Drive/Colab Notebooks/Capstone Project/darknet'
os.chdir("/content/gdrive/My Drive/Colab Notebooks/Capstone Project/")
if os.path.exists(darknet_dir) == False :
  !git clone https://github.com/pjreddie/darknet
  os.chdir(darknet_dir)
else:
  os.chdir(darknet_dir)
  !git pull



In [0]:
# Build darknet inside its checkout; -B tells make to rebuild all targets unconditionally.
os.chdir(darknet_dir)
!make -B


### 2.4. Generate train/val file path list (.txt)
* YOLO needs the list of image paths, given as two separate text files: one for the training images and one for the validation images.

In [0]:
def write_train_list(metadata_dir, img_dir, name, series):
    """Write an image path list file, one path per line.

    Args:
        metadata_dir: Directory in which the list file is created.
        img_dir: Directory that is prefixed to every image file name.
        name: File name of the list file inside ``metadata_dir``.
        series: Iterable of image file names.

    The file is overwritten if it already exists.
    """
    list_fp = os.path.join(metadata_dir, name)
    image_paths = (os.path.join(img_dir, image_name) for image_name in series)
    with open(list_fp, "w") as list_file:
        list_file.writelines("{}\n".format(image_path) for image_path in image_paths)

In [0]:
# One entry per pneumonia patient (Target == 1), turned into an image file name.
pneumonia_rows = patient_data["Target"] == 1
patient_data_unique = patient_data.loc[pneumonia_rows, "patientId"].drop_duplicates()

patient_xray_images = patient_data_unique + ".jpg"

# Hold out 10% of the images for validation; the seed keeps the split fixed.
tr_series, val_series = train_test_split(
    patient_xray_images,
    test_size=0.1,
    random_state=random_stat,
)
print("The # of train set: {}, The # of validation set: {}".format(len(tr_series), len(val_series)))

# Write the train and validation image path lists.
for list_name, image_names in (("tr_list.txt", tr_series), ("val_list.txt", val_series)):
    write_train_list(metadata_dir, img_dir, list_name, image_names)

## 3. Prepare Configuration Files for Using YOLOv3
We need to prepare the config files and fetch the pre-trained weights required for training. This involves the following four files.
```
 cfg/rsna.data
 cfg/rsna.names
 darknet53.conv.74
 cfg/rsna_yolov3.cfg_train
```

### - cfg/rsna.data
This file points to the RSNA data paths:
  * train: Path to training image list textfile
  * valid: Path to validation image list textfile
  * names: The file containing the possible RSNA names - This in our case is a file named rsna.names in the same directory, containing just one entry 'pneumonia' as that is the only class. 
  * backup: A directory where trained weights(checkpoints) will be stored as training progresses.
  * results: Path to the directory where the prediction results are to be stored.

Sample content of rsna.data
```
classes= 1
train  = tr_list.txt
valid  = val_list.txt
names  = rsna.names
eval  = rsna.names
backup = model_backup
results = pred_results
```

In [0]:
# Switch to the cfg directory: the wget downloads and darknet commands below use
# paths relative to the current working directory.
os.chdir(cfg_dir)

### - darknet53.conv.74  (Download Pre-trained Model)
For training, we download the pre-trained model weights (darknet53.conv.74) with the wget command below. Alternatively, download the file from the URL used in that command and upload it to the relevant directory in your path.

In [0]:
if os.path.exists(cfg_dir+"darknet53.conv.74") == False:
  !wget -q https://pjreddie.com/media/files/darknet53.conv.74

In [0]:
if os.path.exists(cfg_dir+"rsna_yolov3_batch64.cfg_train") == False:
  !wget --no-check-certificate -q "https://docs.google.com/uc?export=download&id=18ptTK4Vbeokqpux8Onr0OmwUP9ipmcYO" -O rsna_yolov3_batch64.cfg_train

## 4. Training YOLOv3



### 4.0. Command for training with Pre-trained CNN Weights (darknet53.conv.74)


In [0]:
# Start training from the pre-trained darknet53.conv.74 weights. `rsna.data` and the
# weights file are given relative to the current working directory, and the console
# output is also saved to train_log.txt through tee.
# NOTE(review): the model cfg is read from darknet/cfg/, but the download cell in this
# notebook saves rsna_yolov3_batch64.cfg_train into the working directory — confirm the
# file really exists at the path used here.
# NOTE(review): `-i 0` presumably selects GPU 0 — confirm against darknet's options.
!/content/gdrive/My\ Drive/Colab\ Notebooks/Capstone\ Project/darknet/darknet detector train  rsna.data /content/gdrive/My\ Drive/Colab\ Notebooks/Capstone\ Project/darknet/cfg/rsna_yolov3_batch64.cfg_train darknet53.conv.74 -i 0 | tee train_log.txt

## 5. How to use trained YOLOv3 for test images (command line)

### 5.0. Copy sample test image

In [0]:
# Use the third pneumonia row's patient as the sample test image.
ex_patient_id = patient_data[patient_data.Target == 1].patientId.values[2]
# Rebuild the image path for this patient; previously the path left over from the
# plotting section was copied, so the id chosen above was silently ignored.
ex_img_path = os.path.join(img_dir, ex_patient_id + ".jpg")
# Copy into the current working directory as test.jpg.
shutil.copy(ex_img_path, "test.jpg")
print(ex_img_path)
# Show where test.jpg was written (replaces the `!pwd` shell call).
print(os.getcwd())

### 5.1. Load trained model (at 6300 iteration)
Load the trained model at iteration 6300 stored in the model_backup folder; training up to that point took about 5 hours.
In the cfg file, `batch` and `subdivisions` are both set to 64, and the image width and height are set to 416.


### 5.2. cfg file for test (not for training)

In [0]:
# Run darknet's `detector valid` with the test cfg and the iteration-6300 weights.
# `rsna.data` and the weights path are relative to the current working directory.
# NOTE(review): the weights are read from ./model_backup/, while backup_dir in this
# notebook is data/backup — confirm which folder actually holds the checkpoints.
!/content/gdrive/My\ Drive/Colab\ Notebooks/Capstone\ Project/darknet/darknet detector valid rsna.data /content/gdrive/My\ Drive/Colab\ Notebooks/Capstone\ Project/data/cfg/yolov3-tiny-c1_test.cfg ./model_backup/rsna_yolov3_batch64_6300.weights

In [0]:
ex_model_path = "./pred_results/comp4_det_test_pneumonia.txt"

# Each line of the detection file is read as
#   <image id> <confidence> <xmin> <ymin> <xmax> <ymax>
# with the box corners in pixels of the original image.
# NOTE(review): column meaning follows darknet's `detector valid` output format —
# confirm against the darknet build used here.
df = pd.read_table(
    ex_model_path,
    sep=" ",
    names=['imgName', 'confidence', 'xmin', 'ymin', 'xmax', 'ymax'],
)

# Draw one randomly chosen detection on top of its image. The box now comes from the
# sampled row itself; the earlier code had an indentation error and reused
# rcx/rcy/rw/rh left over from the label-plotting cell.
for index,row in df.sample(1).iterrows():
  imgName = os.path.join(img_dir, row['imgName'] + ".jpg")
  plt.imshow(cv2.imread(imgName))
  xmin, ymin, xmax, ymax = row['xmin'], row['ymin'], row['xmax'], row['ymax']
  # Closed polyline around the predicted box.
  plt.plot([xmin, xmin, xmax, xmax, xmin], [ymin, ymax, ymax, ymin, ymin])


## 6. Generate Submission Files with YOLOv3 Python Wrapper

### 6.1. Load darknet python wrapper module

In [0]:
# Import only the wrapper functions this notebook calls, instead of a wildcard
# import that hides where names come from and can shadow existing ones.
# NOTE(review): darknet.py must be importable from the current working directory
# or sys.path — confirm where the wrapper module lives.
from darknet import detect, load_meta, load_net

### 6.2. Generate submission files
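* When making submission files, be aware that the submission label format differs from YOLO's: the submission uses the top-left corner of each box plus its width and height, whereas YOLO reports the box center.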

In [0]:
# Confidence threshold passed to detect() as `thresh` in the prediction loop.
threshold = 0.2

In [0]:
# Output CSV; a relative path, so it is written to the current working directory.
submit_file_path = "submission.csv"
# Network definition and trained weights loaded for inference.
cfg_path = os.path.join(cfg_dir, "yolov3-tiny-c1_test.cfg")
weight_path = os.path.join(backup_dir, "rsna_yolov3_batch64_6300.weights")

# Text file that the prediction loop reads line by line, one test image path per line.
# NOTE(review): nothing in this notebook writes te_list.txt — confirm it is created elsewhere.
test_img_list_path = os.path.join(metadata_dir, "te_list.txt")

In [0]:
gpu_index = 0
# Path of the darknet .data file handed to load_meta. The variable was never defined
# in this notebook (NameError), so point it at cfg/rsna.data, the same file name the
# training and validation commands use.
# NOTE(review): rsna.data presumably lists relative paths (e.g. rsna.names), so this
# should run with the cfg directory as the working directory — confirm.
data_extention_file_path = os.path.join(cfg_dir, "rsna.data")
net = load_net(cfg_path.encode(),
               weight_path.encode(),
               gpu_index)
meta = load_meta(data_extention_file_path.encode())

In [0]:
# tqdm was used below without ever being imported (NameError). Import it here and
# fall back to plain iteration when the package is not installed.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        return iterable

submit_dict = {"patientId": [], "PredictionString": []}

with open(test_img_list_path, "r") as test_img_list_f:
    # One iteration per test image path listed in the file.
    for line in tqdm(test_img_list_f):
        img_path = line.strip()
        if not img_path:
            # Skip blank lines (e.g. a trailing newline) instead of running
            # detection on an empty path and emitting a row with an empty id.
            continue
        # File name without directory and extension.
        patient_id = img_path.split('/')[-1].strip().split('.')[0]

        infer_result = detect(net, meta, img_path.encode(), thresh=threshold)

        # Each detection is indexed as e[1] = confidence and
        # e[2] = (center_x, center_y, width, height); the submission expects the
        # top-left corner, so shift the center by half the box size.
        submit_line = ""
        for e in infer_result:
            confi = e[1]
            w = e[2][2]
            h = e[2][3]
            x = e[2][0]-w/2
            y = e[2][1]-h/2
            submit_line += "{} {} {} {} {} ".format(confi, x, y, w, h)

        submit_dict["patientId"].append(patient_id)
        submit_dict["PredictionString"].append(submit_line)

pd.DataFrame(submit_dict).to_csv(submit_file_path, index=False)