In [1]:
import sys; sys.path.insert(0, '..')

import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from database.dataset import *
from mc_candidate_proposal.hough_mc import HoughCalcificationDetection
from feature_extraction.feature_extraction import CandidatesFeatureExtraction
import feature_extraction.haar_features.haar_modules as hm
from general_utils.plots import plot_froc, plot_bootstrap_froc
from metrics.metrics import froc_curve, froc_curve_bootstrap
from mc_candidate_proposal.candidate_utils import balance_candidates
from metrics.metrics_utils import (get_froc_df_of_img, get_froc_df_of_many_imgs_features,
                                   get_tp_fp_fn_center_patch_criteria)

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): `pd` is not imported explicitly in this cell; presumably it is
# provided by the star import from `database.dataset` - confirm.
pd.options.mode.chained_assignment = None
# Matplotlib's "tab10" colormap, kept as a shared palette for plots.
cmap = plt.get_cmap("tab10")
import os
import sys
import shutil
from tqdm import tqdm

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


### DL patch extraction

In [2]:
# Patch-level ('rois') view of the INBreast train and validation partitions.
# The settings are gathered in one mapping so they can be inspected or reused.
roi_dataset_kwargs = {
    # what to load and what each sample carries
    'partitions': ['train', 'validation'],
    'level': 'rois',
    'return_lesions_mask': True,
    'max_lesion_diam_mm': None,
    'normalize': None,
    'use_muscle_mask': False,
    # patch-related settings
    'extract_patches': False,
    'extract_patches_method': 'all',
    'patch_size': 224,
    'stride': 100,
    'min_breast_fraction_roi': 0.5,
    # presumably -1 means "use all available workers" - TODO confirm
    'n_jobs': -1,
}
db = INBreast_Dataset(**roi_dataset_kwargs)

INFO:root:Start extracting patches
100%|██████████| 205/205 [02:08<00:00,  1.60it/s]


In [3]:
# Column names of the dataset's metadata table.
db.df.columns

Index(['calcification', 'asymmetry', 'cluster', 'distortion', 'mass', 'normal',
       'breast_fraction', 'patch_bbox', 'filename', 'mask_filename', 'case_id',
       'img_id', 'side', 'view', 'acr', 'birads', 'label'],
      dtype='object')

In [4]:
# Display the metadata table itself (pandas truncates the middle rows).
db.df

Unnamed: 0,calcification,asymmetry,cluster,distortion,mass,normal,breast_fraction,patch_bbox,filename,mask_filename,case_id,img_id,side,view,acr,birads,label
0,0.000000,0,0.0,0,0,0,0.934291,"[[0, 0], [224, 224]]",20586908/20586908_roi_0.png,empty_mask,6c613a14b80a8591,20586908,R,CC,2,2,normal
333,0.003468,0,0.0,0,0,0,1.000000,"[[900, 2100], [1124, 2324]]",20586908/20586908_roi_333.png,20586908/20586908_roi_333_mask.png,6c613a14b80a8591,20586908,R,CC,2,2,abnormal
332,0.000000,0,0.0,0,0,0,1.000000,"[[800, 2100], [1024, 2324]]",20586908/20586908_roi_332.png,empty_mask,6c613a14b80a8591,20586908,R,CC,2,2,normal
331,0.000060,0,0.0,0,0,0,1.000000,"[[700, 2100], [924, 2324]]",20586908/20586908_roi_331.png,20586908/20586908_roi_331_mask.png,6c613a14b80a8591,20586908,R,CC,2,2,abnormal
330,0.000060,0,0.0,0,0,0,1.000000,"[[600, 2100], [824, 2324]]",20586908/20586908_roi_330.png,20586908/20586908_roi_330_mask.png,6c613a14b80a8591,20586908,R,CC,2,2,abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66949,0.000020,0,0.0,0,0,0,1.000000,"[[700, 900], [924, 1124]]",30011674/30011674_roi_88.png,30011674/30011674_roi_88_mask.png,6968748e66837bc7,30011674,L,CC,1,5,abnormal
66948,0.000020,0,0.0,0,0,0,1.000000,"[[600, 900], [824, 1124]]",30011674/30011674_roi_87.png,30011674/30011674_roi_87_mask.png,6968748e66837bc7,30011674,L,CC,1,5,abnormal
66947,0.000020,0,0.0,0,0,0,1.000000,"[[500, 900], [724, 1124]]",30011674/30011674_roi_86.png,30011674/30011674_roi_86_mask.png,6968748e66837bc7,30011674,L,CC,1,5,abnormal
66953,0.000000,0,0.0,0,0,0,1.000000,"[[1100, 900], [1324, 1124]]",30011674/30011674_roi_92.png,empty_mask,6968748e66837bc7,30011674,L,CC,1,5,normal


In [8]:
TRAIN_SPLIT_SIZE = 0.7
# Fixed seed: the split is later materialised on disk (patch folders and CSVs),
# so it must be identical on every run. With an unseeded draw, re-running the
# notebook would copy a different split into the same folders.
SPLIT_SEED = 42

# Split at case level (not patch level) so that all patches of a given case
# end up on the same side of the split.
unique_cases = db.df.case_id.unique()
n_train_cases = int(TRAIN_SPLIT_SIZE * len(unique_cases))
split_rng = np.random.default_rng(SPLIT_SEED)
train_cases = split_rng.choice(unique_cases, n_train_cases, replace=False)

in_train = db.df.case_id.isin(train_cases)
train_df = db.df.loc[in_train]
test_df = db.df.loc[~in_train]

# Sanity checks: the two subsets partition the table and share no case.
assert len(train_df) + len(test_df) == len(db.df)
assert set(train_df.case_id).isdisjoint(test_df.case_id)
print(train_df.shape,test_df.shape)

(47477, 17) (19660, 17)


In [11]:
# Number of negative patches kept per positive patch.
NEGATIVE_RATE = 5
# Fixed seed so the same negatives are drawn on every run.
NEGATIVE_SAMPLING_SEED = 42

# A patch is positive when it has a mask file, i.e. its mask_filename is not
# the 'empty_mask' placeholder.
has_mask = train_df.mask_filename != 'empty_mask'
positive_train_df = train_df.loc[has_mask]
negative_train_df = train_df.loc[train_df.mask_filename == 'empty_mask']

# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the request at the number of negatives available.
n_negatives = min(NEGATIVE_RATE * len(positive_train_df), len(negative_train_df))
balanced_train_df = pd.concat([
    positive_train_df,
    negative_train_df.sample(n_negatives, random_state=NEGATIVE_SAMPLING_SEED),
])



In [18]:
# Copy every patch of the balanced training table into a class folder
# (positive / negative) under the training-set directory.
# NOTE(review): absolute, user-specific paths; consider a configurable base dir.
TRAIN_PATCHES_SRC_DIR = '/home/acortinau/projects/data_rois/patches/'
TRAIN_SET_DIR = '/home/acortinau/projects/p224_database/train_set/'

# shutil.copyfile does not create missing parent directories.
# NOTE(review): files left over from a previous run are not removed here.
for class_name in ('positive', 'negative'):
    os.makedirs(TRAIN_SET_DIR + class_name, exist_ok=True)

# iterrows() is a generator without a length, so tqdm needs `total` to show
# a percentage and ETA.
for i,row in tqdm(balanced_train_df.iterrows(), total=len(balanced_train_df)):
    filename = row['filename']
    # Rows whose mask is not the 'empty_mask' placeholder are positives.
    class_name = 'positive' if row['mask_filename'] != 'empty_mask' else 'negative'
    src_path = TRAIN_PATCHES_SRC_DIR + filename
    # Keep only the second '/'-separated component of `filename` as the output name.
    dst_path = TRAIN_SET_DIR + class_name + '/' + filename.split('/')[1]
    shutil.copyfile(src_path, dst_path)


21864it [00:12, 1766.83it/s]


In [19]:
# Copy every patch of the held-out table into a class folder
# (positive / negative) under the validation-set directory.
# NOTE(review): absolute, user-specific paths; consider a configurable base dir.
VAL_PATCHES_SRC_DIR = '/home/acortinau/projects/data_rois/patches/'
VAL_SET_DIR = '/home/acortinau/projects/p224_database/val_set/'

# shutil.copyfile does not create missing parent directories.
# NOTE(review): files left over from a previous run are not removed here.
for class_name in ('positive', 'negative'):
    os.makedirs(VAL_SET_DIR + class_name, exist_ok=True)

for i,row in tqdm(test_df.iterrows(), total=len(test_df)):
    filename = row['filename']
    # Rows whose mask is not the 'empty_mask' placeholder are positives.
    class_name = 'positive' if row['mask_filename'] != 'empty_mask' else 'negative'
    src_path = VAL_PATCHES_SRC_DIR + filename
    # Keep only the second '/'-separated component of `filename` as the output name.
    dst_path = VAL_SET_DIR + class_name + '/' + filename.split('/')[1]
    shutil.copyfile(src_path, dst_path)

100%|██████████| 19660/19660 [00:22<00:00, 884.75it/s] 


In [21]:
# Write the split metadata tables to disk as CSV.
# NOTE(review): the unbalanced `train_df` is written inside the val_set folder,
# while that folder's patches come from `test_df` - confirm this is intended.
csv_exports = (
    (balanced_train_df, '/home/acortinau/projects/p224_database/train_set/balanced_train_df.csv'),
    (train_df, '/home/acortinau/projects/p224_database/val_set/train_df.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)

In [22]:
# Save the complete, unsplit metadata table at the root of the patch database.
db.df.to_csv('/home/acortinau/projects/p224_database/all_df.csv')

### GM detection test

In [2]:
from mc_candidate_proposal.morphology_mc import MorphologyCalcificationDetection

In [3]:
# Image-level view of the train and validation partitions.
# NOTE(review): this rebinds `db`, which the patch-extraction section above
# also uses for the ROI-level dataset.
db = INBreast_Dataset(
    level='image',
    partitions=['train', 'validation'],
    lesion_types=['calcification', 'cluster'],
    keep_just_images_of_lesion_type=False,
    return_lesions_mask=True,
    max_lesion_diam_mm=None,
    cropped_imgs=True,
    use_muscle_mask=True,
)

# Folder passed to the detector as `rbd_img_path`, located under the parent of
# the current working directory.
rbd_path = Path.cwd().parent.joinpath('data/gsm_imgs')

# Detector settings, one per line for readability.
detector_kwargs = dict(
    rbd_img_path=rbd_path,
    threshold=0.96,
    min_distance=6,
    area=14*14,
    store_intermediate=True,
    filter_muscle_region=True,
)
hd = MorphologyCalcificationDetection(**detector_kwargs)

In [6]:
# Run the detector on a single dataset item.
# NOTE(review): the cells below are labelled "patient 0" / "patient 1";
# presumably they were produced by re-running this cell with a different
# `idx` - confirm.
idx = 1
db_sample = db[idx]
image_id = db.df.iloc[idx].img_id

# Pull the image and its two masks out of the sample dictionary.
image, image_mask, muscle_mask = (
    db_sample[key] for key in ('img', 'lesion_mask', 'muscle_mask')
)

candidates = hd.detect(image, image_id, muscle_mask=muscle_mask)


In [5]:
# Patient 0, size filter on: distinct values of the third candidate column
# with their counts, followed by the total number of candidates.
values, counts = np.unique(candidates[:, 2], return_counts=True)
print((values, counts))
print(len(candidates))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([1209,  352,  127,   63,   38,   22,   10,   22,    6,    6,    6]))
1861


In [7]:
# Patient 0, no filter: distinct values of the third candidate column with
# their counts.
values, counts = np.unique(candidates[:, 2], return_counts=True)
(values, counts)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16, 17,
        40]),
 array([1209,  352,  127,   63,   38,   22,   10,   22,    6,    6,    6,
           3,    2,    1,    2,    1,    1,    1]))

In [10]:
# Patient 1, no filter: distinct values of the third candidate column with
# their counts, followed by the total number of candidates.
values, counts = np.unique(candidates[:, 2], return_counts=True)
print((values, counts))
print(len(candidates))


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 21,
       22, 23]), array([640, 237,  94,  46,  25,  22,   9,  13,   5,   7,   3,   7,   1,
         1,   2,   1,   2,   1,   1]))
1117


In [7]:
# Patient 1, size filter on: distinct values of the third candidate column
# with their counts, followed by the total number of candidates.
values, counts = np.unique(candidates[:, 2], return_counts=True)
print((values, counts))
print(len(candidates))

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([640, 237,  94,  46,  25,  22,   9,  13,   5,   7,   3]))
1101
