In [None]:
import numpy as np
import pandas as pd
import os

import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.io as pio
from plotly.subplots import make_subplots
# setting default template to plotly_white for all visualizations
pio.templates.default = "plotly_white"
%matplotlib inline
import gc

import utils_eda as uteda

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

## Load and inspect data

In [None]:
# Root folder of the siim-covid19-detection input data.
PATH = './input/siim-covid19-detection'

# Read the three csv files in one pass; the order of the file names matches
# the order of the names being assigned.
submission, image_df, study_df = (
    pd.read_csv(os.path.join(PATH, csv_name), index_col=None)
    for csv_name in ('sample_submission.csv', 'train_image_level.csv', 'train_study_level.csv')
)

# Never truncate columns or cell contents when a dataframe is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

print(f"Train image level csv shape : {image_df.shape}\nTrain study level csv shape : {study_df.shape}")

In [None]:
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; the row count stays the cell's output value.
display(image_df.head(2))
len(image_df)

In [None]:
# Only the last expression of a cell is rendered, so the preview has to be
# displayed explicitly; the row count stays the cell's output value.
display(study_df.head(2))
len(study_df)

In [None]:
# Get a list of all the files found below the train and the test directories
# (uteda.get_files is a project helper; presumably it walks the directory
# recursively and returns a list of file paths - TODO confirm)
from tqdm.auto import tqdm

all_files = []
trn_files = uteda.get_files(PATH+'/train')
test_files = uteda.get_files(PATH+'/test')

# NOTE(review): the 1263 / 1214 split in the message below is hard-coded text, it is not computed here
print(f'number train dcms={len(trn_files)}, number test dcms from test dir={len(test_files)},\nsample_submission.csv has {len(submission)} entries, this includes 1263 images + 1214 studies ')
# train files first, then test files
all_files = trn_files+test_files
# ds=dcmread(all_files[0])
# dir(ds)
# ds.keys

In [None]:
def getinfo(ds,col='id'):
    '''
    Print how the suffixes of an id-like column are distributed.

    Every value of ds[col] is split on '_' and the second piece is kept
    (for ex. 'abc_image' -> 'image'). The shape of the resulting series, the
    set of distinct suffixes and the number of rows per suffix are printed,
    followed by blank lines.

    param:  ds - dataframe whose column `col` holds strings with at least one '_'
                 (a value without '_' raises IndexError)
            col - name of the column to inspect, 'id' by default
    returns: None, everything is printed
    '''
    def suffix_of(value):
        # second '_' separated piece of the value
        return value.split('_')[1]

    out = ds[col].map(suffix_of)
    print (f'total records={out.shape}')
    print(f'unique vals={set(out)}')
    for val in set(out):
        # number of rows carrying this suffix
        tot = sum(1 for item in out if item == val)
        print(f'sum {val} = {tot}')
    print('\n')


In [None]:
# Same suffix report for each of the three frames, in this order.
for frame in (image_df, study_df, submission):
    getinfo(frame)

## Where are the submission files? In the test dir!

# Clean data

In [None]:
# Drop the '_image' / '_study' suffix: keep only what precedes the first '_'.
for frame in (image_df, study_df):
    frame['id'] = frame['id'].map(lambda raw_id: raw_id.split('_')[0])

# Find the bounding box distribution

In [None]:
# Count the bounding boxes of every training image: the number of '{'
# characters in the `boxes` string is taken as the number of boxes, and
# rows whose `boxes` entry is not a string count as 0.
numb_bboxes_per_row = [
    entry.count('{') if type(entry) is str else 0
    for entry in image_df.boxes
]
m = max(numb_bboxes_per_row)
print(f'Maximum number of bounding boxes={m}\n')

# Histogram: how many images exist for every box count from 0 up to the maximum.
tots = 0
for i in range(m + 1):
    tot = numb_bboxes_per_row.count(i)
    print(f'number images with {i} bounding boxes={tot}')
    tots += tot
print(f'\nTotal images={tots}, total with bounding boxes={tots-numb_bboxes_per_row.count(0)}')
# numb_bboxes_per_row

# Create df that ONLY contains images that have numb_boxes bounding boxes

In [None]:
# Number of bounding boxes a row must have to be kept when the filtered
# dataframe is built (change this value to explore other box counts)
numb_boxes=2

In [None]:
def get_rows_with_numb_boxes(df, numb_boxes):
    '''
     Keep only the rows of df that hold exactly numb_boxes bounding boxes.
     The boxes of a row are counted by counting the '{' characters of its
     `boxes` string; an entry that is not a string (images without bounding
     boxes have a blank field) counts as 0 boxes.
     param:  df - loaded from train_image_level.csv, needs a `boxes` column
             numb_boxes - number of bounding boxes a row must have to be returned
     returns: dataframe with the matching rows, original index labels kept
    '''
    def count_boxes(entry):
        # one '{' per box
        return entry.count('{') if type(entry) is str else 0

    keep = [count_boxes(entry) == numb_boxes for entry in df.boxes]
    return df[keep]

In [None]:
# Keep only the image-level rows that have exactly numb_boxes bounding boxes
df1=get_rows_with_numb_boxes(image_df,numb_boxes)
# df1=get_rows(image_df,8)#max bounding boxes, just 1 of these
print(f'Dataframe contains { len(df1)} rows with {numb_boxes} bounding boxes')
# df1.head()

In [None]:
# Renumber the rows 0..n-1 and drop the old non sequential index, so the rows
# can be addressed as df1.loc[i].
# Rebinding the name (instead of inplace=True) avoids mutating a frame that was
# produced by filtering image_df and keeps this cell safe to re-run.
df1 = df1.reset_index(drop=True)
df1.head()

# Utility functions

In [None]:
import utils_eda as uteda
from PIL import Image
import os
import json
def copy_dicom_img_to_dir(row, pth_dicom_fles, pth_destdir):
    '''
    Convert the DICOM file referenced by one dataframe row to a PNG in pth_destdir.

    row - pandas series with at least the 'StudyInstanceUID' and 'id' entries
    pth_dicom_fles - root dir that holds one sub dir per study,
                     'input/siim-covid19-detection/train/' for ex
    pth_destdir - where all images will wind up (like './test_tmp/');
                  created, including missing parents, when it does not exist
    return the shape of the converted pixel array, to be logged
           (presumably (height, width) - depends on uteda.dicom2array, TODO confirm)
    raises FileNotFoundError when no DICOM file can be found for the row

    ex.
    # copy imag to test dir
    for i in range(MAX_ROWS):
         copy_dicom_img_to_dir(df1.loc[i],TRAIN_DIR ,TEST_DIR )

    '''
    dcm_file = row.loc['id']

    # create a path to the study; os.path.join works with or without a
    # trailing separator on pth_dicom_fles
    pth = os.path.join(pth_dicom_fles, row.loc['StudyInstanceUID'])

    # get all dicom files from the study
    dcms = uteda.get_files(pth)

    if len(dcms) > 1:
        # several images in the study: keep the one whose path contains the id
        dcms = [fle for fle in dcms if dcm_file in fle]

    if len(dcms) == 0:
        # fail with a readable message instead of an IndexError on dcms[0]
        raise FileNotFoundError(f'No DICOM file found for image {dcm_file} under {pth}')

    # get the image
    img = uteda.dicom2array(dcms[0])

    # make sure the destination exists; exist_ok avoids the check-then-create
    # race and makedirs also creates missing parent directories
    os.makedirs(pth_destdir, exist_ok=True)

    Image.fromarray(img).save(os.path.join(pth_destdir, dcm_file + '.png'))
    return img.shape

def load_img( pth_destdir, imagename):
    '''
    Open one image file with Image.open and hand the result back.

    pth_destdir - directory prefix; it is concatenated as-is with imagename,
                  so it has to end with a path separator
    imagename - file name including its suffix
    ex.
    nme = df1.loc[0,'id']+'.png'
    im= load_img( TEST_DIR, nme)
    '''
    full_path = pth_destdir + imagename
    return Image.open(full_path)
    
def get_boxes(row):
    '''
    Parse the bounding box string of one row into a list of dicts.

    row - pandas series with a 'boxes' entry; a null entry (image without
          bounding boxes) gives an empty list
    ex.
    # get boxes
    all_boxes=[]
    for i in range(MAX_ROWS):
        all_boxes.append(get_boxes(df1.loc[i]))
    '''
    raw = row.loc['boxes']
    if pd.isnull(raw):
        return []

    # the string uses single quotes, json only accepts double quotes
    return json.loads(raw.replace("'", '"'))

# Determine maximum predictions to make from df1 result set
## Copy images to test dir and get image bounding boxes

In [None]:
# Upper bound on how many rows of df1 are processed: at most 20, fewer when df1 is shorter
MAX_ROWS=min(20,len(df1))

In [None]:
#lets get the boxes for the first MAX_ROWS rows of df1 and save the images associated with the dicom files
import json
TRAIN_DIR = './input/siim-covid19-detection/train/'
TEST_DIR = './test_tmp/'

# ground truth boxes, one list per row
all_boxes = [get_boxes(df1.loc[row_idx]) for row_idx in range(MAX_ROWS)]

# convert the matching dicom files to png files inside TEST_DIR
for row_idx in range(MAX_ROWS):
    copy_dicom_img_to_dir(df1.loc[row_idx], TRAIN_DIR, TEST_DIR)

print(all_boxes)

# Plot the image and only ground truth bounding boxes

In [None]:
# First row of df1: load the png that was written to TEST_DIR as an array
nme = df1.loc[0, 'id'] + '.png'
im = np.array(load_img(TEST_DIR, nme))
# two-value unpacking: the array is expected to be 2-D
height, width = im.shape

# ground truth boxes of the same row
boxes = get_boxes(df1.loc[0])

#plot it with the b_boxes
uteda.plot_img_with_bboxes(im, nme, boxes, size=15)

# Load a trained yolov5 model and run predictions on files in TEST_DIR.


In [None]:
import os
cwd = os.path.abspath(os.getcwd())

# Absolute paths are built from the current working directory.
# MODEL1_PATH_2CLASSES ='/artifacts/run_3bh5hck7_model:v199/best.pt'#use alone
MODEL_PATH_1CLASS = f"{cwd}/best_exp10_yolov5l_img512_1class.pt" #can ensemble with below
MODEL_PATH_1CLASS_KAGGLE = f"{cwd}/best_from_kaggle_sub_1class.pt"
TEST_DIR = f"{cwd}/test_tmp/"

#used for debugging yolov5/detect.py in seperate pycharm session
for weights_path in (MODEL_PATH_1CLASS, MODEL_PATH_1CLASS_KAGGLE):
    print(weights_path)

# Run models, or ensembles of models


In [None]:
# model_sel maps a run name to the weights used for that run:
# keys = name of the predict output dir
# vals = model weights we are using per run (the ensemble entry lists both
#        weight files separated by a blank)
run_names = ['ENSEMBLE', 'MODEL_PATH_1CLASS', 'MODEL_PATH_1CLASS_KAGGLE']
model_names = [
    ' '.join((MODEL_PATH_1CLASS, MODEL_PATH_1CLASS_KAGGLE)),
    MODEL_PATH_1CLASS,
    MODEL_PATH_1CLASS_KAGGLE,
]
model_sel = {run: weights for run, weights in zip(run_names, model_names)}
model_sel

In [None]:
# detect.py is run from inside the yolov5 directory, so change into it first.
# NOTE(review): if this cell is interrupted before the final %cd, the kernel
# stays inside yolov5 and a re-run of this cell starts from the wrong directory.
%cd yolov5

# One detect.py run per entry of model_sel: key = run / output dir name, val = weights argument
for key, val in model_sel.items():   
    # remove the previous output of this run so old label files are not mixed with new ones
    %rm -rf ./runs/detect/{key}
    # NOTE(review): the command line is built by plain concatenation, so a path
    # containing blanks would be split into several arguments
    params="--weights " + val+" --source " + TEST_DIR+ "  --img 512 --conf 0.281 --iou-thres 0.5 --max-det 8  --save-txt  --save-conf --name " + key + " --exist-ok"

    !python detect.py {params}
# #get back to correct dir
%cd ..

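## Labels are saved to runs/detect/{model_sel key}/labels. In each line the first value is the class and the last is the confidence; the four values in between are x_center, y_center, width and height. The lines look like the following: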
1 0.326869 0.343108 0.252468 0.463613 0.402857 <br>
1 0.695874 0.373716 0.246474 0.478596 0.450281


In [None]:
#list images we tested on: every file found anywhere below TEST_DIR
all_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(TEST_DIR)
    for filename in filenames
]
print(all_files)

# Get an image, the ground truth bounding boxes, and 1 or more predicted bounding boxes and display

<iframe src="https://www.kaggle.com/embed/rajsengo/beginner-eda-siim-covid-19-detection?cellId=14&cellIds=13&kernelSessionId=64636915" height="300" style="margin: 0 auto; width: 100%; max-width: 950px;" frameborder="0" scrolling="auto" title="[Beginner EDA] SIIM COVID-19 Detection"></iframe>

In [None]:
OUT_DIR='./yolov5/runs/detect/'

# The submission requires xmin, ymin, xmax, ymax format.
# YOLOv5 returns x_center, y_center, width, height
def correct_bbox_format(bboxes, orig_width, orig_height):
    '''
    Convert relative centre/size boxes to absolute corner boxes in pixels.

    bboxes - iterable of sequences; of every sequence the entries 1..5 are read as
             x_center, y_center, width, height (fractions of the image size)
             and confidence; entry 0 is ignored
    orig_width, orig_height - image size in pixels used to scale the fractions

    returns a list of [xmin, ymin, xmax, ymax, conf] lists. Centre and size are
    rounded to whole pixels first and the rounded half size is then subtracted /
    added, so all four coordinates are ints.
    '''
    def to_pixels(fraction, extent):
        # scale a fraction to a whole number of pixels
        return int(np.round(fraction * extent))

    def convert(b):
        xc = to_pixels(b[1], orig_width)
        yc = to_pixels(b[2], orig_height)
        half_w = int(np.round(to_pixels(b[3], orig_width) / 2))
        half_h = int(np.round(to_pixels(b[4], orig_height) / 2))
        return [xc - half_w, yc - half_h, xc + half_w, yc + half_h, b[5]]

    return [convert(b) for b in bboxes]

def get_pred_bboxes(img_name, run_names, run_dir,orig_width=None, orig_height=None):
    '''
    Get all the predicted bounding boxes of one image that are stored in
    multiple run directories.

    img_name - image file name + suffix (for ex. 'abc.png')
    run_names - names of the run directories to read; the result keeps this order
    run_dir - NOTE(review): accepted for interface compatibility but not used,
              the label files are always read from the module level OUT_DIR
    orig_width, orig_height - image size in pixels; when either one is None the
              image is loaded from TEST_DIR to measure it

    returns a list with one entry per run name: a list of dicts with the keys
            "x1","y1","x2","y2","conf" (an empty list when the run has no label
            file for the image)
    '''
    if orig_width is None or orig_height is None:
        im = np.array(load_img(TEST_DIR, img_name))
        # only the two leading axes are the image size, so an array with a
        # trailing channel axis works as well
        orig_height, orig_width = im.shape[:2]

    # label files are named after the image without its suffix; splitext only
    # removes the last suffix, so names containing further dots stay intact
    stem = os.path.splitext(img_name)[0]

    keys = ["x1","y1","x2","y2","conf"]
    results = []
    for run_name in run_names:
        #file to open
        fle = OUT_DIR + run_name + '/labels/' + stem + '.txt'
        if not os.path.isfile(fle):
            print(f'Missing label file for image {img_name} for run {run_name}' )
            results.append([])
            continue

        #convert bounding boxes into lists of floats
        with open(fle) as f:
            # one box per line, numbers separated by whitespace; split() copes
            # with repeated blanks and empty lines are skipped
            pred_boxes_and_confidence = [
                [float(tok) for tok in lne.split()]
                for lne in f
                if lne.strip()
            ]
        pred_boxes = correct_bbox_format(pred_boxes_and_confidence, orig_width, orig_height)

        #convert to a dict
        results.append([dict(zip(keys, b)) for b in pred_boxes])
    return results
    

# Tying it together, show images, ground truth and all predictions

In [None]:
def generate_images(df,max_rows, test_dir,out_dir, run_names, show_image):
    '''
    Plot the first max_rows images of df together with their ground truth and
    predicted bounding boxes.

    df - dataframe with an 'id' and a 'boxes' column
    max_rows - number of leading rows to process
    test_dir - directory prefix that holds '<id>.png' for every processed row
    out_dir - forwarded to get_pred_bboxes as its run_dir argument
    run_names - prediction runs whose boxes are fetched and plotted
    show_image - forwarded to uteda.plot_img_with_bboxes
    '''
    for i in range(max_rows):
        # positional access: works whatever the index labels of df are
        row = df.iloc[i]

        img_name = row.loc['id'] + '.png'

        img = np.array(load_img(test_dir, img_name))
        # the two leading axes are height and width, also when a channel axis follows
        height, width = img.shape[:2]
        # print(f'Height={height} Width={width}')

        #ground truth bounding boxes
        gt_boxes = get_boxes(row)

        #get predicted bounding boxes, one list per run name
        results = get_pred_bboxes(img_name, run_names, out_dir, orig_width=width, orig_height=height)

        results1 = dict(zip(run_names, results))

        #plot it with the b_boxes
        uteda.plot_img_with_bboxes(img, img_name, gt_boxes, results1, size=15, show_image=show_image)

## Show an image

In [None]:
OUTPUT_IMAGES_DIR='output/'

In [None]:
#lets see an image
generate_images(df1, 1, TEST_DIR,OUTPUT_IMAGES_DIR, run_names=run_names, show_image=True)

## Save all marked up images

In [None]:
#now lets generate images for all the rows we predicted
generate_images(df1, MAX_ROWS, TEST_DIR,OUTPUT_IMAGES_DIR, run_names=run_names, show_image=False)

# Display

In [None]:
# Load every file found in OUTPUT_IMAGES_DIR as a numpy array
fls=uteda.get_files(OUTPUT_IMAGES_DIR)

imgs=[]
for fle in fls:
    # Image.open keeps the file open; np.array() reads the pixel data while the
    # handle is still valid and the with block then releases it, so no handles
    # pile up when many images are loaded
    with Image.open(fle) as img:
        imgs.append(np.array(img))

In [None]:
uteda.plot_imgs(imgs, cols=2, size=20, is_rgb=True, title="", cmap='gray', img_size=(500,500))