## Kaggle RSNA Pneumonia Detection Challenge:
   ### Build an algorithm that automatically detects potential pneumonia cases (from images).
   ### Evaluated at different thresholds for intersection over union of bounding boxes (or objects).
\begin{align}
\ \mbox{IoU(A, B) intersection over union} \\
\ \mbox{A predicted bounding boxes} \\
\ \mbox{B ground truth bounding boxes} \\
\ IoU(A, B) = \frac{|A \cap B|}{|A \cup B|} \\
\ \\
\ \textit{At each threshold value t} \\
\ Precision(t) = \frac{TP(t)}{TP(t)+FP(t)+FN(t)} \\
\ \\
\ \textit{True Positives TP} \\
\ \textit{False Positives FP} \\
\ \textit{False Negatives FN} \\
\ \textit{for thresholds from 0.4 to 0.75 in steps of 0.05} \\
\ \\
\ \textit{else the precision of a single image is:} \\
\ \frac{1}{|thresholds|} \sum_t \frac{TP(t)}{TP(t)+FP(t)+FN(t)} \\
\end{align}

    * Note that if ground truth is None, any False Positives give the image a score of 0
    * In nearly all cases confidence will have no effect on scoring
    
****

# links
[Kaggle Challenge - Getting Started](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge#Getting%20Started) <br>
[md ai GitHub](https://github.com/mdai/ml-lessons) <br>
[md ai My projects](https://public.md.ai/hub/projects/user) <br>
[Google Colab kaggle](https://colab.research.google.com/github/mdai/ml-lessons/blob/master/lesson3-rsna-pneumonia-detection-kaggle.ipynb) <br>
[darknet](https://github.com/pjreddie/darknet) <br>
[darknet train classifier from scratch](https://pjreddie.com/darknet/train-cifar/) <br>

## DarkNet, You Only Look Once YOLO & python wrappers
****
[YOLO](https://pjreddie.com/darknet/yolo/) <br>
[YOLO - python](https://github.com/madhawav/YOLO3-4-Py) <br>
[YOLO - py - docker](https://github.com/madhawav/YOLO3-4-Py/tree/master/docker) <br>
[darknetpy pypi](https://pypi.org/project/darknetpy/) <br>
[darknetpy GitHub](https://github.com/danielgatis/darknetpy) <br>
[lightnet GitHub](https://github.com/explosion/lightnet) <br>

****

In [1]:
%matplotlib inline
import os
import sys

import glob
import pylab
import pandas as pd
import pydicom
import numpy as np

from PIL import Image

# Put '../../src/' on the import path so the project-local
# kaggle_wrangler module (../../src/kaggle_wrangler.py) can be imported.
sys.path.insert(1, '../../src/')
import kaggle_wrangler as kgwr

# Data locations, given as relative paths.
# NOTE(review): presumably the unpacked Kaggle download with one
# sub-directory per image set -- confirm against the local layout.
kaggle_data_dir = '../../data/all'
train_data_dir = os.path.join(kaggle_data_dir, 'stage_1_train_images')
test_data_dir = os.path.join(kaggle_data_dir, 'stage_1_test_images')

# os.listdir(kaggle_data_dir)

## following directions:
[exploratory data analysis](https://www.kaggle.com/peterchang77/exploratory-data-analysis) <br>

## extracting the boxed parts or lungs into thumbnails
    * 1) Get the list of training labels dataframe and the detailed class info dataframe.
    * 2) For each patientId construct a thumbnail file name for each box with these parts:
        * patientId
        * box number, if there are one or more boxes
        * class code
    * 3) Save a thumbnail for each box or lung if no boxes
        * arbitrarily select the left and right lung.
        * exactly select each box
    
### thumbnail size = 64 x 64

In [2]:
# Caution: these CSV files have duplicate patient ids
# (the same patientId can appear on more than one row).

# Training labels and the detailed class information, read from the data dir.
labels_df = pd.read_csv(os.path.join(kaggle_data_dir, 'stage_1_train_labels.csv'))
class_df = pd.read_csv(os.path.join(kaggle_data_dir, 'stage_1_detailed_class_info.csv'))
# dfinal = labels_df.merge(class_df, on="patientId", how='inner')

# Collapse the label rows into one dictionary entry per patientId
# (dicom path, label, list of boxes) -- see kaggle_wrangler.parse_data.
parsed = kgwr.parse_data(labels_df, train_data_dir)

In [33]:
# Tally how many thumbnails could be written from the parsed labels:
# one per box for label-1 patients, one per patient for label-0 patients.
pt_Id_list = list(parsed.keys())
positive_files = 0
negative_files = 0
for pt_Id in pt_Id_list:
    pt_Id_data = parsed[pt_Id]
    if not os.path.isfile(pt_Id_data['dicom']):
        # No DICOM file on disk for this patient, so nothing to count.
        continue

    if pt_Id_data['label'] == 1 and 'boxes' in pt_Id_data:
        # TODO: pass the boxes and filename to write_box_thumbs
        positive_files += len(pt_Id_data['boxes'])
    elif pt_Id_data['label'] == 0:
        # TODO: pass the filename to write_lung_thumbs
        negative_files += 1

print(positive_files, negative_files)

5560 12710


## Train for no false positives or negatives
```python
# Crop the first bounding box out of the image and save a 1/8-scale thumbnail.
# Boxes are stored as [y, x, height, width] (see parse_data), so the slice
# below selects rows first, then columns.
# NOTE(review): assumes `im` is the 2-D pixel array of the patient's DICOM and
# `boxes_list` is that patient's 'boxes' list -- neither is defined here.
box = boxes_list[0]
sm_im = im[int(box[0]): int(box[0]+box[2]),int(box[1]): int(box[1]+box[3])]
pylab.imshow(sm_im, cmap=pylab.cm.gist_gray)

from PIL import Image

sm_im_tmb = Image.fromarray(sm_im)
scalef = 1/8
# PIL sizes are (width, height) while numpy shapes are (rows, columns),
# so swap the two axes before scaling.
size = (int(sm_im.shape[1] * scalef), int(sm_im.shape[0] * scalef))
outfile = 'tstim.png'
# Image.LANCZOS is the filter the old Image.ANTIALIAS alias pointed to;
# the alias itself was removed in Pillow 10.
sm_im_tmb.thumbnail(size, Image.LANCZOS)
sm_im_tmb.save(outfile, "png")

```

In [None]:
def write_box_thumbs():
    """Placeholder, not implemented yet.

    Intended to receive the boxes and file name of a positive patient
    and write one thumbnail per box.
    """
    return None

def write_lung_thumbs():
    """Placeholder, not implemented yet.

    Intended to receive the file name of a patient without boxes and
    write the lung thumbnails for it.
    """
    return None

In [None]:
%%writefile ../../src/kaggle_wrangler.py

import os
import glob
import pylab
import pandas as pd
import pydicom
import numpy as np

from PIL import Image

# Extension appended to every thumbnail file name.
file_ext = '.png'

# Short codes used in thumbnail file names, keyed by the value of the
# 'class' column of the detailed class info table.
class_codes_dict = {
    'No Lung Opacity / Not Normal': 'NoLuOpNotNorm',
    'Lung Opacity': 'LuOp',
    'Normal': 'Normal',
}

def get_thumbnail_name(patiendID, class_code, number_of_boxes):
    """Build a thumbnail file name.

    The name has the form '<patiendID>_<number_of_boxes>_<class_code>'
    followed by the module-level file extension.

    Args:
        patiendID: patient id string used as the name prefix.
        class_code: short class code string.
        number_of_boxes: number formatted as an integer in the name.

    Returns:
        The assembled file name string.
    """
    name_parts = [patiendID, '_%i_' % number_of_boxes, class_code, file_ext]
    return ''.join(name_parts)

def get_class_code(patientID, class_info_df):
    """Return the short class code for a patient.

    Only the first row of class_info_df whose 'patientId' equals
    patientID is used; its 'class' value is translated through
    class_codes_dict.

    Args:
        patientID: patient id to look up.
        class_info_df: dataframe with 'patientId' and 'class' columns.

    Returns:
        The short class code string.
    """
    is_patient = class_info_df['patientId'] == patientID
    matching_labels = class_info_df.index[is_patient].tolist()
    first_match = class_info_df.loc[matching_labels[0]]
    return class_codes_dict[first_match['class']]

def get_boxes(patientID, parsed_data):
    """Return the pixel bounds of every box recorded for a patient.

    Each box in parsed_data[patientID]['boxes'] is a sequence
    [y, x, height, width] (the layout produced by parse_data).

    Args:
        patientID: key of the patient in parsed_data.
        parsed_data: nested dictionary as built by parse_data.

    Returns:
        A dictionary with the key 'number_of_boxes' and, for each box,
        an integer key (0, 1, ...) mapping to a dictionary with the keys
        'top_row', 'bottom_row', 'left_column' and 'right_column'.
    """
    boxes_list = parsed_data[patientID]['boxes']
    boxes_dict = {'number_of_boxes': len(boxes_list)}

    # Store every box under its own index; previously the store and the
    # counter increment sat outside the loop, so only the last box was kept.
    for box_number, box in enumerate(boxes_list):
        boxes_dict[box_number] = {
            'top_row': int(box[0]),
            'bottom_row': int(box[0] + box[2]),
            'left_column': int(box[1]),
            'right_column': int(box[1] + box[3]),
        }

    return boxes_dict

# https://www.kaggle.com/peterchang77/exploratory-data-analysis

def parse_data(df, images_dir):
    """
    Group the rows of a labels dataframe into one entry per patient.

    The result is a nested dictionary keyed by patientId:
      parsed = {
        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es), each [y, x, height, width]
        }, ...
      }

    The label comes from the 'Target' value of the first row seen for a
    patient. A box is appended for every row of a patient whose stored
    label is 1.
    """
    parsed = {}
    for _, row in df.iterrows():
        pid = row['patientId']

        # --- First time this patient is seen: create its entry
        entry = parsed.get(pid)
        if entry is None:
            entry = {
                'dicom': os.path.join(images_dir, '%s.dcm'% pid),
                'label': row['Target'],
                'boxes': []}
            parsed[pid] = entry

        # --- Record the box only when opacity is present
        if entry['label'] == 1:
            entry['boxes'].append(
                [row['y'], row['x'], row['height'], row['width']])

    return parsed

def draw(data):
    """Display a single patient's image with its bounding box(es), if any.

    Args:
        data: one patient entry as built by parse_data; the 'dicom' path
            and the 'boxes' list are read.
    """
    # --- Open DICOM file
    # dcmread replaces the read_file alias, which was removed in pydicom 3.0
    d = pydicom.dcmread(data['dicom'])
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    # --- Add boxes with random color if present
    for box in data['boxes']:
        rgb = np.floor(np.random.rand(3) * 256).astype('int')
        im = overlay_box(im=im, box=box, rgb=rgb, stroke=6)

    pylab.imshow(im, cmap=pylab.cm.gist_gray)
    pylab.axis('off')

def overlay_box(im, box, rgb, stroke=1):
    """Paint the outline of one box onto an image array.

    Args:
        im: image array indexed as [row, column, ...]; modified in place.
        box: four values [y, x, height, width]; each is truncated to int.
        rgb: value assigned to every outline pixel.
        stroke: thickness of each edge in pixels.

    Returns:
        The same array object that was passed in as im.
    """
    top, left, box_height, box_width = (int(value) for value in box)
    bottom = top + box_height
    right = left + box_width

    # --- Horizontal edges: `stroke` rows starting at the top and bottom rows
    for edge_row in (top, bottom):
        im[edge_row:edge_row + stroke, left:right] = rgb

    # --- Vertical edges: `stroke` columns starting at the left and right columns
    for edge_col in (left, right):
        im[top:bottom, edge_col:edge_col + stroke] = rgb

    return im