In [1]:
import numpy as np
import pandas as pd
import os

import plotly.graph_objects as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
import plotly.io as pio
from plotly.subplots import make_subplots
# setting default template to plotly_white for all visualizations
pio.templates.default = "plotly_white"
%matplotlib inline
import gc

import utils_eda as uteda

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Load and inspect data

In [2]:
PATH = './input/siim-covid19-detection'
submission = pd.read_csv(os.path.join(PATH,'sample_submission.csv'), index_col=None)
image_df = pd.read_csv(os.path.join(PATH,'train_image_level.csv'), index_col=None)
study_df = pd.read_csv(os.path.join(PATH,'train_study_level.csv'), index_col=None)
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_colwidth', None)
print(f"Train image level csv shape : {image_df.shape}\nTrain study level csv shape : {study_df.shape}")

Train image level csv shape : (6334, 4)
Train study level csv shape : (6054, 5)


In [3]:
image_df.head(2)
len(image_df)

6334

In [4]:
study_df.head(2)
len(study_df)

6054

In [5]:
#get a list of all the files
from tqdm.auto import tqdm

all_files = []
trn_files = uteda.get_files(PATH+'/train')
test_files = uteda.get_files(PATH+'/test')

print(f'number train dcms={len(trn_files)}, number test dcms from test dir={len(test_files)},\nsample_submission.csv has {len(submission)} entries, this includes 1263 images + 1214 studies ')
all_files = trn_files+test_files
# ds=dcmread(all_files[0])
# dir(ds)
# ds.keys

number train dcms=6334, number test dcms from test dir=1263,
sample_submission.csv has 2477 entries, this includes 1263 images + 1214 studies 


In [6]:
def getinfo(ds,col='id'):
    #lets see if the ds file has only studies
    #and what the mix is
    #assumme we are looking at id column
    out = ds[col].map(lambda x:x.split('_')[1])
    print (f'total records={out.shape}')
    print(f'unique vals={set(out)}')
    for val in (set(out)):
        f = lambda x:x==val
        tot = sum(map(f,out))
        print(f'sum {val} = {tot}')
    print('\n')


In [7]:
getinfo(image_df)
getinfo(study_df)
getinfo(submission)

total records=(6334,)
unique vals={'image'}
sum image = 6334


total records=(6054,)
unique vals={'study'}
sum study = 6054


total records=(2477,)
unique vals={'image', 'study'}
sum image = 1263
sum study = 1214




best_from_kaggle_sub## Where are submission files? In test dir!

# Clean data

In [8]:
#get rid of _image and _study
image_df['id']=image_df['id'].map(lambda x: x.split('_')[0])
study_df['id']=study_df['id'].map(lambda x: x.split('_')[0])

# Find the bounding box distribution

In [9]:
#lets find max number bounding boxes in train set bounding boxes
f=lambda x:x.count('{') if type(x) is str else 0
bbox_distribution=list(map(f,image_df.boxes))

## Show the bounding box distribution

In [10]:
m=max(bbox_distribution)
print(f'Maximum number of bounding boxes={m}\n')
tots=0

percentages=[]
for i in range(0, m+1):
    tot = bbox_distribution.count(i)
    print(f'number images with {i} bounding boxes={tot}')
    percentages.append(tot)
    tots+=tot
print(f'\nTotal images={tots}, total with bounding boxes={tots-bbox_distribution.count(0)}\n')

percentages=[x/tots for x in percentages]
for i,pct in enumerate(percentages):
    print(f'{"{:.2f}".format(pct)} % has {i} bounding boxes')

Maximum number of bounding boxes=8

number images with 0 bounding boxes=2040
number images with 1 bounding boxes=973
number images with 2 bounding boxes=3113
number images with 3 bounding boxes=183
number images with 4 bounding boxes=23
number images with 5 bounding boxes=1
number images with 6 bounding boxes=0
number images with 7 bounding boxes=0
number images with 8 bounding boxes=1

Total images=6334, total with bounding boxes=4294

0.32 % has 0 bounding boxes
0.15 % has 1 bounding boxes
0.49 % has 2 bounding boxes
0.03 % has 3 bounding boxes
0.00 % has 4 bounding boxes
0.00 % has 5 bounding boxes
0.00 % has 6 bounding boxes
0.00 % has 7 bounding boxes
0.00 % has 8 bounding boxes


# Create dataframe with rows of interest, 3 ways.

In [11]:
MAX_ROWS_WITH_THIS_MANY_BBOXES=10
NUMBER_OF_BOXES=5  
DO_TINY_DATASET_ON_OVERTRAINED_MODEL=False

### 1. dataframe that ONLY contains images that have NUMB_BOXES bounding boxes

In [12]:
def get_rows_with_numb_boxes(df, numb_bboxes_wanted):
    '''
     find dcm files with numb_boxes bounding boxes
     works by finding number of dicts by checking for the first { char in the dict
     param:  df- loaded from train_image_level.csv (there are some images with no bounding boxes
     so the boxes field is blank)
             numb_bboxes_wanted-return df with rows with this many bounding boxes 
     returns: dataframe
    '''
    if 'numb_boxes' not in df.columns:
        f=lambda x:x.count('{') if type(x) is str else 0
        df['numb_boxes'] = list(map(f,df.boxes))
    
    mask= map(lambda x:x==numb_bboxes_wanted, df['numb_boxes'])
    df1=df[list(mask)]
    df1.reset_index(inplace=True, drop=True)
    return df1

In [13]:
df1=get_rows_with_numb_boxes(image_df,NUMBER_OF_BOXES)
# df1=get_rows(image_df,8)#max bounding boxes, just 1 of these
print(f'Dataframe contains { len(df1)} rows with {NUMBER_OF_BOXES} bounding boxes')
# df1.head()

Dataframe contains 1 rows with 5 bounding boxes


### 2. dataframe with even distribution of bounding boxes, up to MAX_ROWS_WITH_THIS_MANY_BBOXES each) (not representative of the above bbounding box distribution though)

In [14]:
%%time
df1=pd.DataFrame()
for i in range((max(bbox_distribution)+1)):
    df1=df1.append(get_rows_with_numb_boxes(image_df,i)[:MAX_ROWS_WITH_THIS_MANY_BBOXES])
df1.reset_index(inplace=True, drop=True)
print(f'Dataframe contains { len(df1)} rows with {NUMBER_OF_BOXES} bounding boxes')
# df1

Dataframe contains 52 rows with 5 bounding boxes
CPU times: user 15.4 ms, sys: 0 ns, total: 15.4 ms
Wall time: 14.5 ms


### 3. something totally different, build a tiny dataset, use it to verify model can be overfit from training

In [15]:
if DO_TINY_DATASET_ON_OVERTRAINED_MODEL==True:
    #create the tiny dataset
    TINY_PATH='tmp/covid_small/images/train'
    fls = uteda.get_files(TINY_PATH)
    f=lambda x:x.split('/')[-1]
    fls = [f(fle) for fle in fls]
    fls = [(lambda x: x.split('.')[0])(x) for x in fls]
    # fls

    %%time
    df1=image_df[image_df['id'].isin(fls)]

    # Load meta.csv file
    # Original dimensions are required to scale the bounding box coordinates appropriately.
    meta_df = pd.read_csv('input/siim-covid19-resized-to-512px-png/meta.csv')

    cols=meta_df.columns
    cols=[col for col in cols]
    cols[0]='id'
    meta_df.columns=cols
    len(meta_df)

    # Merge both the dataframes
    df1 = df1.merge(meta_df, on='id',how="left")
    df1.head(2)
    df1
else:
    print("Not doing tiny dataset")

Not doing tiny dataset


# Utility functions

In [16]:
import utils_eda as uteda
from PIL import Image
import os
import json
def copy_dicom_img_to_dir(row, pth_dicom_fles, pth_destdir):
    '''
    row - pandas series
    pth_dicom_fles - 'input/siim-covid19-detection/train/' for ex
    pth_destdir - where all images will wind up (like ''./test_tmp/'')
    return im.shape(height,width) to be logged
    
    ex.
    # copy imag to test dir
    for i in range(MAX_ROWS):
         copy_dicom_img_to_dir(df1.loc[i],TRAIN_DIR ,TEST_DIR )
    
    '''       
    study=  row.loc['StudyInstanceUID']
    dcm_file=row.loc['id']
    
    #create a path to the study
    pth =  pth_dicom_fles + row.loc['StudyInstanceUID']

    #get all dicom files from the study
    dcms = uteda.get_files(pth) 

    if (len(dcms)>1):
        #find the correct image
        dcms=list(filter(lambda x:dcm_file in x, dcms))
   
    #get the image
    img=uteda.dicom2array(dcms[0])
    
    #save it to path
    if not os.path.exists(pth_destdir):
        os.mkdir(pth_destdir)

    im = Image.fromarray(img)     
    im.save(pth_destdir+dcm_file+'.png')
    return img.shape

def load_img( pth_destdir, imagename):
    '''
    just loads an image
    ex.
    nme = df1.loc[0,'id']+'.png'
    im= load_img( TEST_DIR, nme)
    '''
    return Image.open(pth_destdir + imagename)
    
def get_boxes(row):
    '''
    Convert the string that contaings bounding boxes 
    into a list of dicts and return
    ex.
    # get boxes
    all_boxes=[]
    for i in range(MAX_ROWS):
        all_boxes.append(get_boxes(df1.loc[i]))
    '''

    if (pd.isnull(row.loc['boxes'])):
        return []
    
    boxes=row.loc['boxes'].replace('\'','"')
    return json.loads(boxes)

# Determine maximum predictions to make from df1 result set
## Copy images to test dir and get image bounding boxes

In [17]:
MAX_ROWS=len(df1)
MAX_ROWS
DO_TINY_DATASET_ON_OVERTRAINED_MODEL

False

In [18]:
%%time
if DO_TINY_DATASET_ON_OVERTRAINED_MODEL == True:
    print("Doing tiny dataset")
else:
    # NOTE: DO NOT run this if you are doing the tiny dataset route above, it already has files elsewhere to use 
    #lets get the boxes for the first MAX_ROWS rows of df1 and save the images associated with the dicom files
    import json
    TRAIN_DIR='./input/siim-covid19-detection/train/'
    TEST_DIR='./test_tmp/'

    # get boxes
    all_boxes=[]
    for i in tqdm(range(MAX_ROWS)):
        all_boxes.append(get_boxes(df1.loc[i]))

    # clear dest dir first
    if os.path.exists(TEST_DIR):
        for file in os.scandir(TEST_DIR):
            if os.path.isfile(file):
                os.remove(file.path)

    # copy imag to test dir
    for i in tqdm(range(MAX_ROWS)):
         copy_dicom_img_to_dir(df1.loc[i],TRAIN_DIR ,TEST_DIR )

    # print(all_boxes)   

    
        

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Load a trained yolov5 model and run predictions on files in TEST_DIR.


In [19]:
import os
cwd = os.path.abspath(os.getcwd())

# MODEL1_PATH_2CLASSES ='/artifacts/run_3bh5hck7_model:v199/best.pt'#use alone
MODEL_PATH_1CLASS=cwd +"/best_exp11_yolov5l_img512_1class.pt" #can ensemble with below
MODEL_PATH_2CLASS=cwd +"/best_exp10_yolov5s_img256_1class.pt" #can ensemble with below
MODEL_PATH_3CLASS=cwd +"/best_exp12_yolov5m_img512_1class.pt" #can ensemble with below
MODEL_PATH_4CLASS_KAGGLE = cwd +'/best_from_kaggle_sub_1class.pt'

#overfit to prove model viability and that image prediction code is correct
MODEL_OVERFIT_ON_TINY_DATA=cwd+"/tmp/yolov5/kaggle-siim-covid_tiny/exp7/weights/best.pt"
TEST_DIR=cwd+'/test_tmp/'

#used for debugging yolov5/detect.py in seperate pycharm session
print(MODEL_PATH_1CLASS)
print(MODEL_PATH_2CLASS)
print(MODEL_PATH_3CLASS)
print(MODEL_PATH_4CLASS_KAGGLE)

/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp11_yolov5l_img512_1class.pt
/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp10_yolov5s_img256_1class.pt
/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp12_yolov5m_img512_1class.pt
/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_from_kaggle_sub_1class.pt


# Run models, or ensembles of models


In [20]:
if(DO_TINY_DATASET_ON_OVERTRAINED_MODEL == True):
    #NOTE: use the following 3 to test on overtrained model, otherwise 
    # comment all 3 out and uncomment above
    run_names=['MODEL_OVERFIT_ON_TINY_DATA']
    model_names=[ MODEL_OVERFIT_ON_TINY_DATA]
    TEST_DIR=cwd+'/tmp/covid_small/images/train/'
else:       
    # keys= name of the predict output dir
    # vals=model weights we are using per run 
    run_names=['MODEL_PATH_1CLASS','MODEL_PATH_2CLASS','MODEL_PATH_3CLASS','MODEL_PATH_4CLASS_KAGGLE']
    model_names=[ MODEL_PATH_1CLASS, MODEL_PATH_2CLASS,MODEL_PATH_3CLASS, MODEL_PATH_4CLASS_KAGGLE]

# model_names=[ MODEL_PATH_1CLASS + " "+MODEL_PATH_2CLASS, MODEL_PATH_1CLASS,MODEL_PATH_2CLASS, MODEL_PATH_3CLASS_KAGGLE]
model_sel=dict(zip(run_names,model_names))
model_sel

{'MODEL_PATH_1CLASS': '/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp11_yolov5l_img512_1class.pt',
 'MODEL_PATH_2CLASS': '/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp10_yolov5s_img256_1class.pt',
 'MODEL_PATH_3CLASS': '/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_exp12_yolov5m_img512_1class.pt',
 'MODEL_PATH_4CLASS_KAGGLE': '/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/best_from_kaggle_sub_1class.pt'}

In [21]:
%%time
%cd tmp/yolov5

for key, val in model_sel.items():   
    %rm -rf ./runs/detect/{key}
#     params="--weights " + val+" --source " + TEST_DIR+ "  --img 512 --conf 0.281 --iou-thres 0.5 --max-det 8  --save-txt  --save-conf --name " + key + " --exist-ok"
#CHANGING CONFIDENCE SCORE
    params="--weights " + val+" --source " + TEST_DIR+ "  --img 512 --conf 0.081 --iou-thres 0.5 --max-det 4  --save-txt  --save-conf --name " + key + " --exist-ok"

    !python detect.py {params} >out.txt
# #get back to correct dir
%cd ../..

/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/yolov5
YOLOv5 🚀 v5.0-303-g3bef77f torch 1.9.0+cu102 CUDA:0 (TITAN Xp, 12194.0625MB)

Fusing layers... 
  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
Model Summary: 392 layers, 46600566 parameters, 0 gradients, 114.1 GFLOPs
libpng error: Read Error
Traceback (most recent call last):
  File "/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/yolov5/detect.py", line 230, in <module>
    main(opt)
  File "/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/yolov5/detect.py", line 225, in main
    run(**vars(opt))
  File "/home/keith/anaconda3/envs/p39/lib/python3.9/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
    return func(*args, **kwargs)
  File "/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/yolov5/detect.py", line 95, in run
    for path, img, im0s, vid_cap in dataset:
  File "/home/keith/AA_jupyter_tuts/kaggle_SIIM_COVID_Detection/yolov5/utils/da

## Labels saved to runs/detect/{model_sel key}/labels, 1st is class, last is confidence.  Looks like following
1 0.326869 0.343108 0.252468 0.463613 0.402857 <br>
1 0.695874 0.373716 0.246474 0.478596 0.450281


In [22]:
# #list images we tested on
# all_files = []
# for dirname,_,filenames in os.walk(TEST_DIR):
#     for filename in filenames:
#         all_files.append(os.path.join(dirname, filename))
# print(all_files)

# Get an image, the ground truth bounding boxes, and 1 or more predicted bounding boxes and display

> 📍 Note: 1 is class id (opacity), the first four float numbers are `x_center`, `y_center`, `width` and `height`. The final float value is `confidence`.

In [23]:
prediction_files = os.listdir(PRED_PATH)
print('Number of test images predicted as opaque: ', len(prediction_files))

NameError: name 'PRED_PATH' is not defined

> 📍 Out of 1263 test images, 583 were predicted with `opacity` label and thus we have that many prediction txt files.

<iframe src="https://www.kaggle.com/embed/rajsengo/beginner-eda-siim-covid-19-detection?cellId=14&cellIds=13&kernelSessionId=64636915" height="300" style="margin: 0 auto; width: 100%; max-width: 950px;" frameborder="0" scrolling="auto" title="[Beginner EDA] SIIM COVID-19 Detection"></iframe>

In [None]:
OUT_DIR_PREDS='./yolov5/runs/detect/'

# The submisison requires xmin, ymin, xmax, ymax format. 
# YOLOv5 returns x_center, y_center, width, height
def correct_bbox_format(bboxes, orig_width, orig_height):
    correct_bboxes = []
    for b in bboxes:
        xc, yc = int(np.round(b[1]*orig_width)), int(np.round(b[2]*orig_height))
        w, h = int(np.round(b[3]*orig_width)), int(np.round(b[4]*orig_height))

        xmin= xc - int(np.round(w/2))
        ymin= yc - int(np.round(h/2))
        xmax= xc + int(np.round(w/2))
        ymax= yc + int(np.round(h/2))
        conf= b[5]
        
        correct_bboxes.append([xmin, ymin, xmax, ymax, conf])
        
    return correct_bboxes

def get_pred_bboxes(img_name, run_names, run_dir,orig_width=None, orig_height=None):
    '''
    img_name name + suffix
    get all the bounding boxes for img that are stored in multiple run directories
    '''
    if(orig_width is None or orig_height is None):
        im= load_img( TEST_DIR, img_name)
        im=np.array(im)
        orig_height,orig_width = im.shape
    
    results=[]
    for dir in run_names:
        #convert bounding boxes into lists of floats
        pred_boxes_and_confidence=[]
        
        #file to open
        fle = OUT_DIR_PREDS+dir+ '/labels/' + img_name.split('.')[0] +'.txt'
        if not os.path.isfile(fle):
            print(f'Missing label file for image {img_name} for run {dir}' )
            results.append([])
            continue
            
        with open(fle) as f:
            lines=f.readlines()
            for lne in lines:
                lne=lne.replace('\n','')
                lne="[" +lne.replace(' ',',') +"]"
                lne=json.loads(lne)               
                pred_boxes_and_confidence.append(lne)
        pred_boxes = correct_bbox_format(pred_boxes_and_confidence,orig_width,orig_height)
        
        #convert to a dict
        keys=["x1","y1","x2","y2","conf"]
        for i,b in enumerate(pred_boxes):
            pred_boxes[i]=dict(zip(keys,b))
        results.append(pred_boxes)
    return results
    

# Tying it together, show images, ground truth and all predictions

In [None]:
# Scale the bounding boxes according to the size of the resized image. 
def scale_bbox(row, bboxes, img_x, img_y):
    # Get scaling factor
    scale_x = img_x/row.dim1
    scale_y = img_y/row.dim0
    
    for bbox in bboxes:
        bbox['x'] = int(np.round(bbox['x']*scale_x, 4))
        bbox['y'] = int(np.round(bbox['y']*scale_y, 4))
        bbox['width'] = int(np.round(bbox['width']*(scale_x), 4))
        bbox['height']= int(np.round(bbox['height']*scale_y, 4))
        
    return bboxes

def generate_images(df,max_rows, test_dir,out_dir, run_names, show_image, img_suffix='.png'):
#     print(out_dir)
    for i in tqdm(range(max_rows)):
        row=df.loc[i]

        img_name = row.loc['id']+img_suffix

        img= load_img( test_dir, img_name)
        img=np.array(img)
        height,width = img.shape
#         print(f'Height={height} Width={width}')     

        #ground truth bounding boxes
        gt_boxes=get_boxes(row)
#         print(gt_boxes)

        #lets see if df1 has better h,w info
        if 'dim0' in row:
            gt_boxes = scale_bbox(row, gt_boxes, width, height)
              
        #get predicted bounding boxes
        results = get_pred_bboxes(img_name, run_names, out_dir, orig_width=width, orig_height=height)
#         print(results)
        
        results1=dict(zip(run_names,results)) 

        #plot it with the b_boxes
        uteda.plot_img_with_bboxes(img,img_name, gt_boxes,results1, size=15, out_path=out_dir,show_image=show_image)

## Show an image

In [None]:
OUTPUT_IMAGES_DIR='output_'+str([*model_sel][0])+'_gt_bboxes/'
OUTPUT_IMAGES_DIR
# TEST_DIR

In [None]:
df1

In [None]:
img_suffix= '.jpg' if DO_TINY_DATASET_ON_OVERTRAINED_MODEL == True else '.png'
img_suffix

In [None]:
#lets see a single image
generate_images(df1, 1, TEST_DIR,OUTPUT_IMAGES_DIR, run_names=run_names, show_image=True, img_suffix=img_suffix)

In [None]:
#lets see a single image
generate_images(df1, 1, TEST_DIR,OUTPUT_IMAGES_DIR, run_names=run_names, show_image=True, img_suffix=img_suffix)

## Save all marked up images

In [None]:
OUTPUT_IMAGES_DIR

In [None]:
%%time
# clear dest dir first
if os.path.exists(OUTPUT_IMAGES_DIR):
    for file in os.scandir(OUTPUT_IMAGES_DIR):
        if os.path.isfile(file):
            os.remove(file.path)

#now lets generate images for all the rows we predicted for later display
generate_images(df1, MAX_ROWS, TEST_DIR,OUTPUT_IMAGES_DIR, run_names=run_names, show_image=False, img_suffix=img_suffix)

# Display bunch of images

In [None]:
df1.columns

In [None]:
# filters what we see, set to None to see all
show_only_with_this_many_GT_boxes=0

# get a list of files of interest
fls=df1.loc[df1['numb_boxes']==show_only_with_this_many_GT_boxes,'id'].tolist()
fls =[OUTPUT_IMAGES_DIR+l+'.png'  for l in fls]

#or just iterate over entire directory of interest
# fls=uteda.get_files(OUTPUT_IMAGES_DIR)

imgs=[]
for fle in fls:
    img = Image.open(fle)
    img=np.array(img)
    imgs.append(img)

In [None]:
uteda.plot_imgs(imgs, cols=3, size=30, is_rgb=True, title="", cmap='gray', img_size=(600,600))