## Kaggle RSNA Pneumonia Detection Challenge:
   ### Build an algorithm that automatically detects potential pneumonia cases (from images).
   ### Evaluated at different thresholds for intersection over union of bounding boxes (or objects).
\begin{align}
\ \mbox{IoU(A, B) intersection over union} \\
\ \mbox{A predicted bounding boxes} \\
\ \mbox{B ground truth bounding boxes} \\
\ IoU(A, B) = \frac{|A \cap B|}{|A \cup B|} \\
\ \\
\ \textit{At each threshold value t} \\
\ Precision(t) = \frac{TP(t)}{TP(t)+FP(t)+FN(t)} \\
\ \\
\ \textit{True Positives TP} \\
\ \textit{False Positives FP} \\
\ \textit{False Negatives FN} \\
\ \textit{for threshold steps of 0.05 over (0.4, 0.75)} \\
\ \\
\ \textit{the average precision of a single image is:} \\
\ \frac{1}{|thresholds|} \sum_t \frac{TP(t)}{TP(t)+FP(t)+FN(t)} \\
\end{align}

    * Note that if ground truth is None, any False Positives give the image a score of 0
    * In nearly all cases confidence will have no effect on scoring
    
****

# links
[Kaggle Challenge - Getting Started](https://www.kaggle.com/c/rsna-pneumonia-detection-challenge#Getting%20Started) <br>
[md ai GitHub](https://github.com/mdai/ml-lessons) <br>
[md ai My projects](https://public.md.ai/hub/projects/user) <br>
[Google Colab kaggle](https://colab.research.google.com/github/mdai/ml-lessons/blob/master/lesson3-rsna-pneumonia-detection-kaggle.ipynb) <br>
[darknet](https://github.com/pjreddie/darknet) <br>
[darknet train classifier from scratch](https://pjreddie.com/darknet/train-cifar/) <br>

## DarkNet, You Only Look Once YOLO & python wrappers
****
[YOLO](https://pjreddie.com/darknet/yolo/) <br>
[YOLO - python](https://github.com/madhawav/YOLO3-4-Py) <br>
[YOLO - py - docker](https://github.com/madhawav/YOLO3-4-Py/tree/master/docker) <br>
[darknetpy pypi](https://pypi.org/project/darknetpy/) <br>
[darknetpy GitHub](https://github.com/danielgatis/darknetpy) <br>
[lightnet GitHub](https://github.com/explosion/lightnet) <br>

****

In [None]:
%matplotlib inline
import os
import sys

import glob
import pylab
import pandas as pd
import pydicom
import numpy as np

from PIL import Image

# Put ../src/ on the import path so the project helper module can be loaded.
sys.path.insert(1, '../src/')
import kaggle_wrangler as kgwr

# Root of the downloaded Kaggle data and its train / test DICOM folders.
kaggle_data_dir = '../data/all'
train_data_dir, test_data_dir = (
    os.path.join(kaggle_data_dir, images_sub_dir)
    for images_sub_dir in ('stage_1_train_images', 'stage_1_test_images')
)

# os.listdir(kaggle_data_dir)

## following directions:
[exploratory data analysis](https://www.kaggle.com/peterchang77/exploratory-data-analysis) <br>

## extracting the boxed parts or lungs into thumbnails
    * 1) Get the list of training labels dataframe and the detailed class info dataframe.
    * 2) For each patientId construct an image file name for each box with these parts:
        * patientId
        * box number, if there are one or more boxes
        * class code
    * 3) Save the boxed image for each box or lung if no boxes
        * arbitrarily select the left and right lung.
        * exactly select each box

In [None]:
# Caution: these tables contain duplicate patient ids, so a row is not
# the same thing as a patient.
labels_df = pd.read_csv(os.path.join(kaggle_data_dir, 'stage_1_train_labels.csv'))
class_df = pd.read_csv(os.path.join(kaggle_data_dir, 'stage_1_detailed_class_info.csv'))

# Collapse the label rows into one dictionary entry per patient, then fix a
# sorted patient order for the cells that follow.
parsed = kgwr.parse_data(labels_df, train_data_dir)
pt_Id_list = sorted(parsed)

In [None]:
import time
t0 = time.time()

out_dir_local = os.path.abspath('../data/train_data_select')
# Image.save() does not create a missing folder, so make sure it exists first.
os.makedirs(out_dir_local, exist_ok=True)
all_files_written_list = []

with_pneumonia = 0
without_pneumonia = 0
class_conflicted = 0
# Patients for which no thumbnail could be written, kept per branch:
# "Positive" = handled by the Target == 1 branch, "Negative" = Target == 0 branch.
conflicted_Positive_patientID_list = []
conflicted_Negative_patientID_list = []
count = 0
max_count = 100000
for patientID in pt_Id_list:
    # Test the cap before doing any work so at most max_count patients are processed.
    if count >= max_count:
        break
    count += 1

    Target_rows = labels_df[labels_df['patientId'] == patientID]
    if any(Target_rows['Target'] == 0):
        # A single Target == 0 row is enough to treat the patient as negative;
        # this check deliberately runs before the positive one.
        without_pneumonia += 1
        written_files_list = kgwr.write_negative_test_images(patientID, parsed, class_df, out_dir=out_dir_local)
        if written_files_list:
            all_files_written_list.extend(written_files_list)
        else:
            # Nothing written for a negative patient -> record it as a negative conflict.
            conflicted_Negative_patientID_list.append(patientID)
            class_conflicted += 1

    elif any(Target_rows['Target'] == 1):
        with_pneumonia += 1
        written_files_list = kgwr.write_box_images(patientID, parsed, class_df, out_dir=out_dir_local)
        if written_files_list:
            all_files_written_list.extend(written_files_list)
        else:
            # Nothing written for a positive patient -> record it as a positive conflict.
            conflicted_Positive_patientID_list.append(patientID)
            class_conflicted += 1
    else:
        # The patient has neither a 0 nor a 1 label.
        class_conflicted += 1

    # Progress heartbeat every 1000 patients.
    if np.mod(count, 1000) == 0:
        print('count', count)

print('Total files written',len(all_files_written_list))
print('In total time %0.3f'%(time.time() - t0))

print('\nwith_pneumonia', with_pneumonia, 
      '\nwithout_pneumonia', without_pneumonia, 
      '\nclass_conflicted', class_conflicted)

print('writing Conflicted PatientID lists')

# One patient id per line; both files land in the current working directory.
with open('conflicted_Negatives.txt', 'w') as f:
    for item in conflicted_Negative_patientID_list:
        f.write("%s\n" % item)

with open('conflicted_Positives.txt', 'w') as f:
    for item in conflicted_Positive_patientID_list:
        f.write("%s\n" % item)


```bash
Total files written 30980
In total time 1671.489

with_pneumonia 5659 
without_pneumonia 20025 
class_conflicted 9458
```

In [7]:
# Count what the extraction cell produced. The folder is taken from
# out_dir_local because the previously hard-coded '../data/train_data_selected'
# did not match the '../data/train_data_select' folder that cell writes to.
all_files_written = os.listdir(out_dir_local)
len(all_files_written)

30980

In [None]:
# Walk the first 20 patients and draw every one that has a Target == 0 row.
for n in range(20):
    patientID = pt_Id_list[n]
    Target_rows = labels_df[labels_df['patientId'] == patientID]
    if not any(Target_rows['Target'] == 0):
        continue
    if not os.path.isfile(parsed[patientID]['dicom']):
        print('file Not found')
        continue
    print('number',n, '\t\tpatientID:', patientID)
    pylab.figure()
    kgwr.draw(parsed[patientID])
    


In [None]:
# patientID is not set in this cell; it is whatever an earlier cell left behind.
print(patientID)
# out_dir=None makes the helper fall back to the current working directory.
written_files_list = kgwr.write_box_images(patientID, parsed, class_df, out_dir=None)
for full_file in written_files_list:
    # Show only the file-name part of each returned entry.
    print(os.path.basename(full_file))

```python
import time
t0 = time.time()
out_dir_local = os.path.abspath('train_data')
all_files_written_list = []
for patientID in pt_Id_list:
    # write_box_images returns an empty list when nothing was written, so
    # extending with its result needs no special case (the former body-less
    # `else:` branch was a syntax error).
    written_files_list = kgwr.write_box_images(patientID, parsed, class_df, out_dir=out_dir_local)
    all_files_written_list.extend(written_files_list)
print(len(all_files_written_list))
print('tt', time.time() - t0)
```

In [None]:
%%writefile ../src/kaggle_wrangler.py

import os
import glob
import pylab
import pandas as pd
import pydicom
import numpy as np

from PIL import Image

# Extension appended to every thumbnail file name.
file_ext = '.png'

# Short, file-name-friendly code for each value of the 'class' column.
class_codes_dict = {
    'No Lung Opacity / Not Normal': 'NoLuOpNotNorm',
    'Lung Opacity': 'LuOp',
    'Normal': 'Normal',
}

def write_negative_test_images(patientID, parsedict, class_df, out_dir=None):
    """Save two fixed crops of a patient's image as PNG thumbnails.

    Usage: written_files_list = write_negative_test_images(patientID, parsedict, class_df, out_dir=None)

    The crop windows are arbitrary fractions of the image size. Note that the
    'L' / 'R' tag in the file name is the patient's point of view, whereas the
    crops are taken from the X-ray point of view: the lower-column crop is
    saved as '_R_' and the higher-column crop as '_L_'.

    Args:
        patientID: key into ``parsedict``; also the file-name prefix.
        parsedict: mapping whose ``parsedict[patientID]['dicom']`` is the
            path of the DICOM file to read.
        class_df: dataframe handed to ``get_class_code`` to look up the class.
        out_dir: folder that receives the PNG files; defaults to the current
            working directory. It is not created here.

    Returns:
        List of the file names (without folder) that were written. The list is
        empty when the DICOM file does not exist.
    """
    if out_dir is None:
        out_dir = os.getcwd()

    # arbitrary cropping boxes, as fractions of the image height / width
    row_crop = np.array([0.2, 0.7])
    col_crop = np.array([0.23, 0.46, 0.54, 0.77])

    written_files_list = []
    class_code = get_class_code(patientID, class_df) + '_Negative'
    dcm_file_name = parsedict[patientID]['dicom']
    if not os.path.isfile(dcm_file_name):
        return written_files_list

    # dcmread is the supported reader; read_file was only a deprecated alias
    # of it and is no longer available in recent pydicom releases.
    im = pydicom.dcmread(dcm_file_name).pixel_array
    row_bounds = np.int_(row_crop * im.shape[0])
    col_bounds = np.int_(col_crop * im.shape[1])
    row_start, row_stop = row_bounds

    # (file-name tag, first column, column after the last) for each crop,
    # in the order the files are written.
    crops = (('_R_', col_bounds[0], col_bounds[1]),
             ('_L_', col_bounds[2], col_bounds[3]))
    for side_tag, col_start, col_stop in crops:
        crop_file_name = patientID + side_tag + class_code + file_ext
        crop_im = Image.fromarray(im[row_start:row_stop, col_start:col_stop])
        crop_im.save(os.path.join(out_dir, crop_file_name), 'png')
        written_files_list.append(crop_file_name)

    return written_files_list

def write_box_images(patientID, parsedict, class_df, out_dir=None):
    """Save one PNG thumbnail per bounding box of a patient.

    Usage: written_files_list = write_box_images(patientID, parsedict, class_df, out_dir=None)

    Args:
        patientID: key into ``parsedict``; also the file-name prefix.
        parsedict: mapping with ``parsedict[patientID]['dicom']`` (path of the
            DICOM file) and ``parsedict[patientID]['boxes']`` (list of boxes,
            read through ``get_boxes``).
        class_df: dataframe handed to ``get_class_code`` to look up the class.
        out_dir: folder that receives the PNG files; defaults to the current
            working directory. It is not created here.

    Returns:
        List of the file names (without folder) that were written. The list is
        empty when the patient has no boxes or the DICOM file does not exist.
    """
    if out_dir is None:
        out_dir = os.getcwd()

    written_files_list = []
    boxes_dict = get_boxes(patientID, parsedict)
    number_of_boxes = boxes_dict['number_of_boxes']
    if number_of_boxes <= 0:
        return written_files_list

    class_code = get_class_code(patientID, class_df) + '_Positive'
    dcm_file_name = parsedict[patientID]['dicom']
    if not os.path.isfile(dcm_file_name):
        return written_files_list

    # dcmread is the supported reader; read_file was only a deprecated alias
    # of it and is no longer available in recent pydicom releases.
    im = pydicom.dcmread(dcm_file_name).pixel_array
    for box_number in range(number_of_boxes):
        # Each box is used as [first row, first column, row count, column count],
        # i.e. the [y, x, height, width] layout built by parse_data.
        row_start, col_start, n_rows, n_cols = boxes_dict[box_number]
        box_file_name = get_thumbnail_name(patientID, class_code, box_number)
        box_im = Image.fromarray(im[row_start:row_start + n_rows,
                                    col_start:col_start + n_cols])
        box_im.save(os.path.join(out_dir, box_file_name), "png")
        written_files_list.append(box_file_name)

    return written_files_list

def get_thumbnail_name(patientID, class_code, box_number):
    """Return the thumbnail file name ``<patientID>_<box_number>_<class_code><file_ext>``.

    ``box_number`` is formatted as an integer; the extension comes from the
    module-level ``file_ext``.
    """
    name_parts = (patientID, '_%i_' % box_number, class_code, file_ext)
    return ''.join(name_parts)

def get_class_code(patientID, class_info_df):
    """Return the short class code of a patient.

    The first row of ``class_info_df`` whose 'patientId' equals ``patientID``
    supplies the 'class' value, which is then translated through
    ``class_codes_dict``. An IndexError is raised when no row matches.
    """
    is_patient = class_info_df['patientId'] == patientID
    first_label = class_info_df.index[is_patient].tolist()[0]
    class_name = class_info_df.loc[first_label]['class']
    return class_codes_dict[class_name]

def get_boxes(patientID, parsed_data):
    """Return a patient's boxes as integer arrays keyed by box number.

    The result always holds 'number_of_boxes'; each box from
    ``parsed_data[patientID]['boxes']`` is additionally stored under its
    0-based position, converted to an integer numpy array.
    """
    raw_boxes = parsed_data[patientID]['boxes']
    boxes_dict = {'number_of_boxes': len(raw_boxes)}
    for position, raw_box in enumerate(raw_boxes):
        boxes_dict[position] = np.int_(np.array(raw_box))
    return boxes_dict

# https://www.kaggle.com/peterchang77/exploratory-data-analysis

def parse_data(df, images_dir):
    """
    Turn the label rows of a Pandas dataframe into one nested-dictionary
    entry per patient:
      parsed = {
        'patientId-00': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        },
        'patientId-01': {
            'dicom': path/to/dicom/file,
            'label': either 0 or 1 for normal or pneumonia,
            'boxes': list of box(es)
        }, ...
      }
    The label is the 'Target' of the first row seen for a patient, and each
    box is stored as [y, x, height, width].
    """
    parsed = {}
    for _, row in df.iterrows():
        pid = row['patientId']
        # --- Create the patient entry on first sight, otherwise reuse it
        entry = parsed.setdefault(pid, {
            'dicom': os.path.join(images_dir, '%s.dcm'% pid),
            'label': row['Target'],
            'boxes': []})
        # --- Only patients labelled 1 (opacity present) collect boxes
        if entry['label'] == 1:
            entry['boxes'].append(
                [row['y'], row['x'], row['height'], row['width']])

    return parsed

def draw(data):
    """Method to draw single patient with bounding box(es) if present

    ``data`` is one patient entry: ``data['dicom']`` is the path of the DICOM
    file and ``data['boxes']`` the list of boxes to overlay. The image is
    shown on the current pylab figure with the axes hidden.
    """
    # --- Open DICOM file (dcmread is the supported reader; read_file was only
    # --- a deprecated alias that recent pydicom releases no longer provide)
    d = pydicom.dcmread(data['dicom'])
    im = d.pixel_array

    # --- Convert from single-channel grayscale to 3-channel RGB
    im = np.stack([im] * 3, axis=2)

    # --- Add boxes with random color if present
    for box in data['boxes']:
        rgb = np.floor(np.random.rand(3) * 256).astype('int')
        im = overlay_box(im=im, box=box, rgb=rgb, stroke=6)

    pylab.imshow(im, cmap=pylab.cm.gist_gray)
    pylab.axis('off')

def overlay_box(im, box, rgb, stroke=1):
    """Paint the outline of one box onto ``im`` and return ``im``.

    ``box`` is [y, x, height, width]; its values are truncated to integers.
    The image is modified in place. Each edge is ``stroke`` pixels thick, and
    the bottom and right edges start at y + height and x + width.
    """
    # --- Integer coordinates of the box corners
    top, left, box_height, box_width = [int(value) for value in box]
    bottom = top + box_height
    right = left + box_width

    # --- Horizontal edges (top, then bottom)
    for edge_row in (top, bottom):
        im[edge_row:edge_row + stroke, left:right] = rgb

    # --- Vertical edges (left, then right)
    for edge_col in (left, right):
        im[top:bottom, edge_col:edge_col + stroke] = rgb

    return im