## Feature Extraction Test

In [1]:
import sys
sys.path.insert(0, '../..')
from metrics.metrics_utils import get_tp_fp_fn_center_patch_criteria
from general_utils.plots import plot_bboxes_over_image, simple_im_show, simple_im_show2
from candidate_proposal.hough_mc import HoughCalcificationDetection
from candidate_proposal.morphology_mc import MorphologyCalcificationDetection, GSM_DEFAULT_PARAMS
from feature_extraction.feature_extraction import  CandidatesFeatureExtraction_MP
from database.dataset import *
import feature_extraction.haar_features.haar_modules as hm

import pickle

### Database initialization

In [2]:
# Dataset handle used by every extraction loop in this notebook. Samples are
# indexed by position (db[idx]) and the loops read their 'img' and
# 'lesion_mask' entries; per-image metadata is read from db.df.
db = INBreast_Dataset(
    return_lesions_mask=True,  # the loops below read db_sample['lesion_mask']
    level='image',
    max_lesion_diam_mm=None,
    extract_patches=False,
    partitions=['train', 'validation'],  # only these two partitions are requested
    lesion_types=['calcification', 'cluster'],
    cropped_imgs=True,
    keep_just_images_of_lesion_type=False,
    use_muscle_mask=False,
    ignore_diameter_px=15  # NOTE(review): presumably a lesion-size threshold in pixels — confirm in INBreast_Dataset
)


### Detectors Parameters and initialization

In [3]:
# Feature extractor built with its default arguments; the extraction loops call
# cfe.extract_features(...) and read cfe.feature_names.
cfe = CandidatesFeatureExtraction_MP()

In [4]:
# Positional indices of every dataset sample; the loops slice this list to pick a batch.
db_range = list(range(len(db)))

### Detector initialization

## FE

### Hough

In [9]:
# Hough-based candidate detector built with its default arguments; used via hd.detect(...).
hd = HoughCalcificationDetection()

In [10]:
# Hough candidates, batch 1: for every image detect candidates, label them
# against the lesion mask, extract their features, and finally persist the
# feature table, the false negatives and the ignored candidates as feather files.
# NOTE(review): this cell iterates the whole of db_range while the batch-2 cell
# starts at index 110, so the two batches overlap — confirm the intended split
# before merging them.
data_path = Path.cwd().parent.parent.parent/'data/features/hough'
# Create the output folder up front so a multi-hour run cannot fail at save time.
data_path.mkdir(parents=True, exist_ok=True)

fdf = []
fns_df = []
ingnored_df = []

batch = 1

for idx in tqdm(db_range[:]):
    # extracting data
    db_sample = db[idx]
    image = db_sample['img']
    image_id = db.df.iloc[idx].img_id
    image_mask = db_sample['lesion_mask']

    # candidate selection
    candidates = hd.detect(image, image_id, load_processed_images=True, hough2=False)
    # labeling of candidates:
    tp, fp, fn, ignored_candidates = get_tp_fp_fn_center_patch_criteria(
        candidates, image_mask, None, 14)

    # Only TP and FP candidates get features; FNs are stored separately below.
    candidates = pd.concat([tp, fp], axis=0, ignore_index=True)

    # Boolean target: True where the candidate was labelled 'TP'.
    labels = candidates['label'].eq('TP').to_numpy()

    # Features are extracted once per distinct 'repeted_idxs' value (first row kept).
    unique_candidates = candidates.drop_duplicates(subset='repeted_idxs')
    X = cfe.extract_features(
        unique_candidates.loc[:, ['x', 'y', 'radius']].values.astype(int), image)

    # Get features dfs
    X = pd.DataFrame(data=X, columns=cfe.feature_names)
    X.index = unique_candidates.index

    # Broadcast the per-unique-candidate features back to every candidate row.
    # NOTE(review): this lookup assumes each 'repeted_idxs' value equals the
    # post-concat row label of the first row carrying it — confirm against
    # get_tp_fp_fn_center_patch_criteria.
    repeated_idxs = candidates.repeted_idxs.tolist()
    # .copy() so the column assignments below never write into a view of X.
    res = X.loc[repeated_idxs, :].copy()
    res['img_id'] = image_id
    res['repeted_idxs'] = repeated_idxs
    res['matching_gt'] = candidates.matching_gt.tolist()
    res['label'] = labels

    # Generate a fn dataframe to compute frocs (assign returns a new frame, so
    # the object returned by the labelling helper is not mutated).
    fn = fn.assign(img_id=image_id)
    fns_df.append(fn)
    fdf.append(res)
    ingnored_df.append(ignored_candidates)

# ignore_index=True yields the default RangeIndex that to_feather requires.
all_data_df = pd.concat(fdf, ignore_index=True)
fns_df = pd.concat(fns_df, ignore_index=True)
ingnored_df = pd.concat(ingnored_df, ignore_index=True)

all_data_df.to_feather(str(data_path/f'all_data_df_{batch}_haar.f'))
fns_df.to_feather(str(data_path/f'fns_df_{batch}_haar.f'))
ingnored_df.to_feather(str(data_path/f'ingnored_df_{batch}_haar.f'))

 77%|███████▋  | 161/210 [2:46:11<1:00:33, 74.15s/it]

: 

: 

In [None]:
# Release the previous batch's results before the next batch is computed.
# Plain `del` raises NameError as soon as one name is missing (e.g. when the
# extraction cell was interrupted before `all_data_df` was created), leaving
# the remaining objects alive; popping from the namespace tolerates that.
for _stale in ('all_data_df', 'fns_df', 'fdf', 'ingnored_df'):
    globals().pop(_stale, None)
del _stale

In [18]:
# Hough candidates, batch 2: same pipeline as batch 1 (detect, label, extract
# features, persist as feather files), restricted to db_range[110:].
# NOTE(review): the batch-1 cell iterates the whole of db_range, so images from
# index 110 onwards are covered by both batches — confirm the intended split
# before merging them.
data_path = Path.cwd().parent.parent.parent/'data/features/hough'
# Create the output folder up front so a long run cannot fail at save time.
data_path.mkdir(parents=True, exist_ok=True)

fdf = []
fns_df = []
ingnored_df = []

batch = 2

for idx in tqdm(db_range[110:]):
    # extracting data
    db_sample = db[idx]
    image = db_sample['img']
    image_id = db.df.iloc[idx].img_id
    image_mask = db_sample['lesion_mask']

    # candidate selection
    candidates = hd.detect(image, image_id, load_processed_images=True, hough2=False)
    # labeling of candidates:
    tp, fp, fn, ignored_candidates = get_tp_fp_fn_center_patch_criteria(
        candidates, image_mask, None, 14)

    # Only TP and FP candidates get features; FNs are stored separately below.
    candidates = pd.concat([tp, fp], axis=0, ignore_index=True)

    # Boolean target: True where the candidate was labelled 'TP'.
    labels = candidates['label'].eq('TP').to_numpy()

    # Features are extracted once per distinct 'repeted_idxs' value (first row kept).
    unique_candidates = candidates.drop_duplicates(subset='repeted_idxs')
    X = cfe.extract_features(
        unique_candidates.loc[:, ['x', 'y', 'radius']].values.astype(int), image)

    # Get features dfs
    X = pd.DataFrame(data=X, columns=cfe.feature_names)
    X.index = unique_candidates.index

    # Broadcast the per-unique-candidate features back to every candidate row.
    # NOTE(review): this lookup assumes each 'repeted_idxs' value equals the
    # post-concat row label of the first row carrying it — confirm against
    # get_tp_fp_fn_center_patch_criteria.
    repeated_idxs = candidates.repeted_idxs.tolist()
    # .copy() so the column assignments below never write into a view of X.
    res = X.loc[repeated_idxs, :].copy()
    res['img_id'] = image_id
    res['repeted_idxs'] = repeated_idxs
    res['matching_gt'] = candidates.matching_gt.tolist()
    res['label'] = labels

    # Generate a fn dataframe to compute frocs (assign returns a new frame, so
    # the object returned by the labelling helper is not mutated).
    fn = fn.assign(img_id=image_id)
    fns_df.append(fn)
    fdf.append(res)
    ingnored_df.append(ignored_candidates)

# ignore_index=True yields the default RangeIndex that to_feather requires.
all_data_df = pd.concat(fdf, ignore_index=True)
fns_df = pd.concat(fns_df, ignore_index=True)
ingnored_df = pd.concat(ingnored_df, ignore_index=True)

all_data_df.to_feather(str(data_path/f'all_data_df_{batch}_haar.f'))
fns_df.to_feather(str(data_path/f'fns_df_{batch}_haar.f'))
ingnored_df.to_feather(str(data_path/f'ingnored_df_{batch}_haar.f'))

100%|██████████| 1/1 [01:07<00:00, 67.71s/it]


In [21]:
# Show where the ignored-candidates table of the current batch was written
# (print() stringifies the Path itself).
print(data_path / f'ingnored_df_{batch}_haar.f')

/home/vzalevskyi/projects/data/features/hough/ingnored_df_2_haar.f


In [19]:
# Image-id sets: validation-partition images and the ids returned by
# db.get_normal_imgs_ids().
is_validation_img = db.df.partition == 'validation'
val_img_ids = set(db.df.loc[is_validation_img, 'img_id'].unique())
normal_img_ids = set(db.get_normal_imgs_ids())

# Row masks over the candidate table, keyed on each candidate's image id.
candidate_img_ids = all_data_df.img_id
validation_mask = candidate_img_ids.isin(val_img_ids)
validation_normals_mask = candidate_img_ids.isin(val_img_ids & normal_img_ids)
normals_mask = candidate_img_ids.isin(normal_img_ids)

# Sensitivity = TP / (TP + FN): TPs are the True labels in the candidate
# table, FNs are the rows of fns_df.
tp_all = all_data_df.label.sum()
fn_all = len(fns_df)
print(f'General train+val sensitivity: {tp_all/(tp_all + fn_all)}')

# Same ratio restricted to validation images.
tp_val = all_data_df[validation_mask].label.sum()
fn_val = len(fns_df[fns_df.img_id.isin(val_img_ids)])
print(f'General val sensitivity: {tp_val/(tp_val + fn_val)}')

General train+val sensitivity: 0.5
General val sensitivity: 0.5


### Morphology

In [5]:
# Morphology-based candidate detector configured from the imported GSM_DEFAULT_PARAMS.
md = MorphologyCalcificationDetection(**GSM_DEFAULT_PARAMS)

# Fresh feature extractor for the morphology run (rebinds the `cfe` created earlier).
cfe = CandidatesFeatureExtraction_MP()

In [6]:
# Morphology candidates: for every image detect candidates, label them against
# the lesion mask (with use_euclidean_dist=True), extract their features, and
# finally persist the feature table, the false negatives and the ignored
# candidates as feather files.
data_path = Path.cwd().parent.parent.parent/'data/features/morph'
# Create the output folder up front so a multi-hour run cannot fail at save time.
data_path.mkdir(parents=True, exist_ok=True)

fdf = []
fns_df = []
ingnored_df = []

for idx in tqdm(db_range[:]):
    # extracting data
    db_sample = db[idx]
    image = db_sample['img']
    image_id = db.df.iloc[idx].img_id
    image_mask = db_sample['lesion_mask']

    # candidate selection
    candidates = md.detect(image, image_id)
    # labeling of candidates:
    tp, fp, fn, ignored_candidates = get_tp_fp_fn_center_patch_criteria(
        candidates, image_mask, None, 14, use_euclidean_dist=True)

    # Only TP and FP candidates get features; FNs are stored separately below.
    candidates = pd.concat([tp, fp], axis=0, ignore_index=True)

    # Boolean target: True where the candidate was labelled 'TP'.
    labels = candidates['label'].eq('TP').to_numpy()

    # Features are extracted once per distinct 'repeted_idxs' value (first row kept).
    unique_candidates = candidates.drop_duplicates(subset='repeted_idxs')
    X = cfe.extract_features(
        unique_candidates.loc[:, ['x', 'y', 'radius']].values.astype(int), image)

    # Get features dfs
    X = pd.DataFrame(data=X, columns=cfe.feature_names)
    X.index = unique_candidates.index

    # Broadcast the per-unique-candidate features back to every candidate row.
    # NOTE(review): this lookup assumes each 'repeted_idxs' value equals the
    # post-concat row label of the first row carrying it — confirm against
    # get_tp_fp_fn_center_patch_criteria.
    repeated_idxs = candidates.repeted_idxs.tolist()
    # .copy() so the column assignments below never write into a view of X.
    res = X.loc[repeated_idxs, :].copy()
    res['img_id'] = image_id
    res['repeted_idxs'] = repeated_idxs
    res['matching_gt'] = candidates.matching_gt.tolist()
    res['label'] = labels

    # Generate a fn dataframe to compute frocs (assign returns a new frame, so
    # the object returned by the labelling helper is not mutated).
    fn = fn.assign(img_id=image_id)
    fns_df.append(fn)
    fdf.append(res)
    ingnored_df.append(ignored_candidates)

# ignore_index=True yields the default RangeIndex that to_feather requires.
all_data_df = pd.concat(fdf, ignore_index=True)
fns_df = pd.concat(fns_df, ignore_index=True)
ingnored_df = pd.concat(ingnored_df, ignore_index=True)

all_data_df.to_feather(str(data_path/'all_data_df.f'))
fns_df.to_feather(str(data_path/'fns_df.f'))
ingnored_df.to_feather(str(data_path/'ingnored_df.f'))

100%|██████████| 210/210 [2:26:57<00:00, 41.99s/it]  


In [7]:
# Report where the morphology feature table was written
# (print() stringifies the Path itself).
print("Saved to \n", data_path / 'all_data_df.f')

Saved to 
 /home/vzalevskyi/projects/data/features/morph/all_data_df.f


In [8]:
# Image-id sets: validation-partition images and the ids returned by
# db.get_normal_imgs_ids().
is_validation_img = db.df.partition == 'validation'
val_img_ids = set(db.df.loc[is_validation_img, 'img_id'].unique())
normal_img_ids = set(db.get_normal_imgs_ids())

# Row masks over the candidate table, keyed on each candidate's image id.
candidate_img_ids = all_data_df.img_id
validation_mask = candidate_img_ids.isin(val_img_ids)
validation_normals_mask = candidate_img_ids.isin(val_img_ids & normal_img_ids)
normals_mask = candidate_img_ids.isin(normal_img_ids)

# Sensitivity = TP / (TP + FN): TPs are the True labels in the candidate
# table, FNs are the rows of fns_df.
tp_all = all_data_df.label.sum()
fn_all = len(fns_df)
print(f'General train+val sensitivity: {tp_all/(tp_all + fn_all)}')

# Same ratio restricted to validation images.
tp_val = all_data_df[validation_mask].label.sum()
fn_val = len(fns_df[fns_df.img_id.isin(val_img_ids)])
print(f'General val sensitivity: {tp_val/(tp_val + fn_val)}')

General train+val sensitivity: 0.7957466697826595
General val sensitivity: 0.7951807228915663


## Merging batches

Merging different batches of features

In [20]:
# Concatenate the two per-batch Hough feature tables and write them out as a
# single feather file. The folder is derived from the working directory, like
# in the extraction cells, instead of a machine-specific absolute path.
hough_dir = Path.cwd().parent.parent.parent/'data/features/hough'
batch_files = [
    hough_dir/'final_features_hough1.f',
    hough_dir/'final_features_hough2.f',
]

# The per-batch frames only live inside the comprehension, so nothing has to
# be deleted by hand once the concatenation is done.
features_data = pd.concat([pd.read_feather(str(path)) for path in batch_files])

# to_feather needs a default index; drop=True discards the old one directly
# instead of materialising it as an 'index' column and removing it again.
features_data.reset_index(drop=True).to_feather(str(hough_dir/'features_hough.f'))


FileNotFoundError: [Errno 2] No such file or directory: '/home/vzalevskyi/projects/data/features/hough/final_features_hough1.f'

In [7]:
# Disabled, kept for reference only: concatenates the per-batch false-negative
# and ignored-candidate tables of the Hough run and writes one merged feather
# file for each.
# fnsdf1 =  pd.read_feather('/home/vzalevskyi/projects/data/features/hough/fns_df_1.f')
# fnsdf2 =  pd.read_feather('/home/vzalevskyi/projects/data/features/hough/fns_df_2.f')

# ignored1 = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/ingnored_df_1.f')
# ignored2 = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/ingnored_df_2.f')

# fnsdf = pd.concat([fnsdf1, fnsdf2]).reset_index().drop(columns=['index'])
# ignored = pd.concat([ignored1, ignored2]).reset_index().drop(columns=['index'])

# fnsdf.to_feather('/home/vzalevskyi/projects/data/features/hough/fnsdf_hough.f')
# ignored.to_feather('/home/vzalevskyi/projects/data/features/hough/ignored_hough.f')

Merging Haar features with the rest of the features

In [24]:
# Disabled, kept for reference only: inner-joins the batch-2 feature table with
# its Haar counterpart on (img_id, repeted_idxs, matching_gt), drops the
# duplicated '_y' columns and writes final_features_hough2.f.
# NOTE(review): the `'_y' in x` filter would also drop any genuine feature whose
# name merely contains '_y'; prefer `x.endswith('_y')` if this is re-enabled.
# features_data1 = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/all_data_df_2.f')
# features_data1_haar = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/all_data_df_2_haar.f')


# merged_data = features_data1.merge(features_data1_haar, on=['img_id', 'repeted_idxs', 'matching_gt'], how='inner', suffixes=(None, '_y'), validate='one_to_one')
# merged_data = merged_data.drop(columns=[x for x in merged_data.columns if '_y' in x])

# merged_data.to_feather('/home/vzalevskyi/projects/data/features/hough/final_features_hough2.f')

# Sanity check that both runs produced the same false negatives for batch 1.
# fns_df_rest = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/fns_df_1.f')
# fns_df_haar = pd.read_feather('/home/vzalevskyi/projects/data/features/hough/fns_df_1_haar.f')

# fns_df_rest == fns_df_haar