# Prepare result table:

In [1]:
%load_ext autoreload
%autoreload 2
import collections
import os
import pandas as pd
import numpy as np
import pickle
import json
import sys
import tensorflow as tf
from vis.utils import utils
from loguru  import logger
from tqdm import tqdm
from screening.validation.crossval import crossval_table, crossval_ref_filter, crossval_max_value_filter
from pprint import pprint

physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices)>0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
tf.config.run_functions_eagerly(False)

2024-07-13 19:01:40.075346: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-07-13 19:01:41.206364: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-13 19:01:52.653408: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-07-13 19:01:52.717084: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node 

## Best models:

In [2]:

def create_op_dict( op_name, extra_suf="" ):

    d = collections.OrderedDict( {
                'max_sp'           : f'summary{extra_suf}/max_sp',
                'auc'              : f'summary{extra_suf}/auc',
                #'acc'              : f'summary{extra_suf}/acc',
                #'pd'               : f'summary{extra_suf}/pd',
                #'fa'               : f'summary{extra_suf}/fa',
                'sens'             : f'summary{extra_suf}/sensitivity',
                'spec'             : f'summary{extra_suf}/specificity',
                'threshold'        : f'summary{extra_suf}/threshold',
                'roc'              : f'summary{extra_suf}/roc',
                'roc_val'          : f'summary{extra_suf}/roc_val',
                'roc_op'           : f'summary{extra_suf}/roc_op',
                'roc_test'         : f'summary{extra_suf}/roc_test',

                'min_spec_sens_reached' : f'{op_name}{extra_suf}/min_spec_sens_reached',

                'max_sp_val'       : f'summary{extra_suf}/max_sp_val',
                'auc_val'          : f'summary{extra_suf}/auc_val',
                #'acc_val'          : f'summary{extra_suf}/acc_val',
                #'pd_val'           : f'summary{extra_suf}/pd_val',
                #'fa_val'           : f'summary{extra_suf}/fa_val',
                'sens_val'         : f'summary{extra_suf}/sensitivity_val',
                'spec_val'         : f'summary{extra_suf}/specificity_val', 

                'max_sp_test'      : f'summary{extra_suf}/max_sp_test',
                'auc_test'         : f'summary{extra_suf}/auc_test',
                #'acc_test'         : f'summary{extra_suf}/acc_test',
                #'pd_test'          : f'summary{extra_suf}/pd_test',
                #'fa_test'          : f'summary{extra_suf}/fa_test',
                'sens_test'        : f'summary{extra_suf}/sensitivity_test',
                'spec_test'        : f'summary{extra_suf}/specificity_test', 

                'max_sp_op'        : f'summary{extra_suf}/max_sp_op',
                'auc_op'           : f'summary{extra_suf}/auc_op',
                #'acc_op'           : f'summary{extra_suf}/acc_op',
                #'pd_op'            : f'summary{extra_suf}/pd_op',
                #'fa_op'            : f'summary{extra_suf}/fa_op',
                'sens_op'          : f'summary{extra_suf}/sensitivity_op',
                'spec_op'          : f'summary{extra_suf}/specificity_op', 

                'sp_index'         : f'{op_name}{extra_suf}/sp_index',
                'sens_at'          : f'{op_name}{extra_suf}/sensitivity',
                'spec_at'          : f'{op_name}{extra_suf}/specificity',
                'acc_at'           : f'{op_name}{extra_suf}/acc',
                'threshold_at'     : f'{op_name}{extra_suf}/threshold',

                'sp_index_val'     : f'{op_name}{extra_suf}/sp_index_val',
                'sens_at_val'      : f'{op_name}{extra_suf}/sensitivity_val',
                'spec_at_val'      : f'{op_name}{extra_suf}/specificity_val',
                #'acc_at_val'       : f'{op_name}{extra_suf}/acc_val',
                'threshold_at_val' : f'{op_name}{extra_suf}/threshold_val',

                'sp_index_test'    : f'{op_name}{extra_suf}/sp_index_test',
                'sens_at_test'     : f'{op_name}{extra_suf}/sensitivity_test',
                'spec_at_test'     : f'{op_name}{extra_suf}/specificity_test',
                #'acc_at_test'      : f'{op_name}{extra_suf}/acc_test',
                'threshold_at_test': f'{op_name}{extra_suf}/threshold_test',

                'sp_index_op'      : f'{op_name}{extra_suf}/sp_index_op',
                'sens_at_op'       : f'{op_name}{extra_suf}/sensitivity_op',
                'spec_at_op'       : f'{op_name}{extra_suf}/specificity_op',
                #'acc_at_op'        : f'{op_name}{extra_suf}/acc_op',
                'threshold_at_op'  : f'{op_name}{extra_suf}/threshold_op',

                'inference'        : f'{op_name}{extra_suf}/inference',
        })
    return d


#extra_suf='_val'
extra_suf=''
conf_dict = collections.OrderedDict(
    {
        'sens90'   : create_op_dict( 'sens90'  , extra_suf=extra_suf ),
        'max_sp'   : create_op_dict( 'max_sp' , extra_suf=extra_suf),
        'spec70'   : create_op_dict( 'spec70'  , extra_suf=extra_suf),
    }
)
pprint(conf_dict)

OrderedDict([('sens90',
              OrderedDict([('max_sp', 'summary/max_sp'),
                           ('auc', 'summary/auc'),
                           ('sens', 'summary/sensitivity'),
                           ('spec', 'summary/specificity'),
                           ('threshold', 'summary/threshold'),
                           ('roc', 'summary/roc'),
                           ('roc_val', 'summary/roc_val'),
                           ('roc_op', 'summary/roc_op'),
                           ('roc_test', 'summary/roc_test'),
                           ('min_spec_sens_reached',
                            'sens90/min_spec_sens_reached'),
                           ('max_sp_val', 'summary/max_sp_val'),
                           ('auc_val', 'summary/auc_val'),
                           ('sens_val', 'summary/sensitivity_val'),
                           ('spec_val', 'summary/specificity_val'),
                           ('max_sp_test', 'summary/max_sp_test'),
                

In [3]:
basepath='/mnt/brics_data/joao.pinto'


models = [
    ( 'user.philipp.gaspar.convnets_v0.altogether.shenzhen_santacasa.exp_wgan_p2p.67de4190c1.r1'                , 'v0.alto.sh-sc.ewp'         ),
    ( 'user.philipp.gaspar.convnets_v0.altogether.shenzhen_santacasa.exp_wgan_p2p_cycle.a19a3a4f8c.r1'          , 'v0.alto.sh-sc.ewpc'        ),
    ( 'user.philipp.gaspar.convnets_v0.altogether.shenzhen_santacasa_manaus.exp_wgan_p2p.0d13030165.r1'         , 'v0.alto.sh-sc-ma.ewp'      ),
    ( 'user.philipp.gaspar.convnets_v0.altogether.shenzhen_santacasa_manaus.exp_wgan_p2p_cycle.c5143abd1b.r1'   , 'v0.alto.sh-sc-ma.ewpc'     ),
    
    ( 'user.philipp.gaspar.convnets_v0.baseline.shenzhen_santacasa.exp.989f87bed5.r1'                           , 'v0.base.sh-sc.e'           ),
    ( 'user.philipp.gaspar.convnets_v0.baseline.shenzhen_santacasa_manaus.exp.ffe6cbee11.r1'                    , 'v0.base.sh-sc-ma.e'        ),
    
    ( 'user.philipp.gaspar.convnets_v0.interleaved.shenzhen_santacasa.exp_wgan_p2p.e540d24b4b.r1'               , 'v0.inte.sh-sc.ewp'         ),
    ( 'user.philipp.gaspar.convnets_v0.interleaved.shenzhen_santacasa.exp_wgan_p2p_cycle.a19a3a4f8c.r1'         , 'v0.inte.sh-sc.ewpc'        ),
    ( 'user.philipp.gaspar.convnets_v0.interleaved.shenzhen_santacasa_manaus.exp_wgan_p2p.ac79954ba0.r1'        , 'v0.inte.sh-sc-ma.ewp'      ),
    ( 'user.philipp.gaspar.convnets_v0.interleaved.shenzhen_santacasa_manaus.exp_wgan_p2p_cycle.c5143abd1b.r1'  , 'v0.inte.sh-sc-ma.ewpc'     ),

    ( 'user.philipp.gaspar.convnets_v1.baseline.shenzhen_santacasa.exp.20240303.r1'                             , 'v1.base.sh-sc.e'           ),
    ( 'user.philipp.gaspar.convnets_v1.interleaved.shenzhen_santacasa.exp_wgan_p2p.20240303.r1'                 , 'v1.inte.sh-sc.ewp'         ),
    ( 'user.philipp.gaspar.convnets_v1.altogether.shenzhen_santacasa.exp_wgan_p2p.20240303.r1'                  , 'v1.alto.sh-sc.ewp'         ),
    ( 'user.philipp.gaspar.convnets_v1.baseline.shenzhen_santacasa_manaus.exp.20240303.r1'                      , 'v1.base.sh-sc-ma.e'        ),
    ( 'user.philipp.gaspar.convnets_v1.interleaved.shenzhen_santacasa_manaus.exp_wgan_p2p.20240303.r1'          , 'v1.inte.sh-sc-ma.ewp'      ),
    ( 'user.philipp.gaspar.convnets_v1.altogether.shenzhen_santacasa_manaus.exp_wgan_p2p.20240303.r1'           , 'v1.alto.sh-sc-ma.ewp'      ),
]



In [4]:
cv = crossval_table( conf_dict )
#for path, train_tag in models:
#    cv.fill( basepath+'/'+path , train_tag )
cv.table = pd.read_pickle("table_results.pkl")
table = cv.table
#table.to_pickle("table_results.pkl")

In [5]:
table.head()

Unnamed: 0,train_tag,op_name,test,sort,file_name,max_sp,auc,sens,spec,threshold,...,threshold_at_val,sp_index_test,sens_at_test,spec_at_test,threshold_at_test,sp_index_op,sens_at_op,spec_at_op,threshold_at_op,inference
0,v0.alto.sh-sc.ewp,sens90,3,0,/mnt/brics_data/joao.pinto/user.philipp.gaspar...,0.886746,0.944909,0.887129,0.886364,0.441392,...,0.34974,0.782621,0.727273,0.84,0.34974,0.872353,0.897959,0.847118,0.34974,"{'russia': {'sp_index': 0.5123920778165847, 'f..."
1,v0.alto.sh-sc.ewp,max_sp,3,0,/mnt/brics_data/joao.pinto/user.philipp.gaspar...,0.886746,0.944909,0.887129,0.886364,0.441392,...,0.34974,0.782621,0.727273,0.84,0.34974,0.872353,0.897959,0.847118,0.34974,"{'russia': {'sp_index': 0.5123920778165847, 'f..."
2,v0.alto.sh-sc.ewp,spec70,3,0,/mnt/brics_data/joao.pinto/user.philipp.gaspar...,0.886746,0.944909,0.887129,0.886364,0.441392,...,0.156788,0.768306,0.818182,0.72,0.156788,0.822598,0.944341,0.709273,0.156788,"{'russia': {'sp_index': 0.6983964089075902, 'f..."
3,v0.alto.sh-sc.ewp,sens90,4,0,/mnt/brics_data/joao.pinto/user.philipp.gaspar...,0.944462,0.984971,0.924,0.965147,0.294554,...,0.425267,0.760254,0.666667,0.86,0.425267,0.933962,0.895131,0.973618,0.425267,"{'russia': {'sp_index': 0.6983964089075902, 'f..."
4,v0.alto.sh-sc.ewp,max_sp,4,0,/mnt/brics_data/joao.pinto/user.philipp.gaspar...,0.944462,0.984971,0.924,0.965147,0.294554,...,0.294554,0.766816,0.69697,0.84,0.294554,0.940772,0.919476,0.962312,0.294554,"{'russia': {'sp_index': 0.7183985753211187, 'f..."


In [6]:

def apply_filters( table):

    best_sorts_list = []
    best_tests_list = []

    filters = {
        'sens90' : crossval_ref_filter(0.9, 'sens_op', 'spec_op', test_key='spec_op'),
        'max_sp' : crossval_max_value_filter(sort_key='max_sp_op', test_key='max_sp_op'),
        'spec70' : crossval_ref_filter(0.7, 'spec_op', 'sens_op', test_key='sens_op'),
    }
    
    for op_name in table.op_name.unique():
        best_sorts = filters[op_name].filter_sorts( table.loc[table.op_name==op_name])
        best_tests = filters[op_name].filter_tests( best_sorts )
        best_sorts_list.append(best_sorts)
        best_tests_list.append(best_tests)
    return pd.concat(best_sorts_list, axis='rows'), pd.concat(best_tests_list, axis='rows')

best_sorts, best_tests = apply_filters(table)


In [7]:
best_tests[['train_tag','op_name','test','sort','sens_val','spec_val','sens_op','spec_op','sens_test','spec_test']]

Unnamed: 0,train_tag,op_name,test,sort,sens_val,spec_val,sens_op,spec_op,sens_test,spec_test
795,v0.alto.sh-sc-ma.ewp,sens90,0,8,0.868421,0.887097,0.920474,0.965278,0.763158,0.938462
822,v0.alto.sh-sc-ma.ewpc,sens90,7,0,0.684211,0.892308,0.935691,0.967767,0.763158,0.888889
3,v0.alto.sh-sc.ewp,sens90,4,0,0.852941,0.92,0.919476,0.962312,0.69697,0.84
357,v0.alto.sh-sc.ewpc,sens90,9,7,0.823529,0.895833,0.884547,0.957265,0.794118,0.770833
1362,v0.base.sh-sc-ma.e,sens90,7,0,0.868421,0.923077,0.908012,0.936097,0.921053,0.84127
1230,v0.base.sh-sc.e,sens90,2,7,0.794118,0.9375,0.891089,0.936795,0.636364,0.9
2181,v0.inte.sh-sc-ma.ewp,sens90,1,2,0.648649,0.923077,0.880471,0.953373,0.736842,0.830769
2574,v0.inte.sh-sc-ma.ewpc,sens90,3,6,0.842105,0.84127,0.730645,0.826291,0.72973,0.892308
1641,v0.inte.sh-sc.ewp,sens90,8,8,0.852941,0.916667,0.914339,0.960445,0.823529,0.9375
2058,v0.inte.sh-sc.ewpc,sens90,6,0,0.823529,0.84,0.787719,0.862999,0.823529,0.857143


## Prepare datasets:

In [25]:
def read_datasets(basepath : str="/mnt/brics_data/public/datasets"):

    datasets = [
        f"{basepath}/Shenzhen/china/raw/Shenzhen_china_table_from_raw.csv",
        f"{basepath}/SantaCasa/imageamento_anonimizado_valid/raw/SantaCasa_imageamento_anonimizado_valid_table_from_raw.csv",
        f"{basepath}/Manaus/manaus/raw/Manaus_manaus_table_from_raw.csv",
        f"{basepath}/Caxias/caxias/raw/images.csv",
        f"{basepath}/Indonesia/indonesia/raw/images.csv",
        f"{basepath}/Russia/russia/raw/images.csv",
        f"{basepath}/Rio/fiocruz/raw/Rio_fiocruz_table_from_raw.csv",
    ]

    autocrop = [False,False,False,False,False,False,False,True]

    blacklists = [
        f"{basepath}/Caxias/caxias/raw/blacklist.pkl",#
        f"{basepath}/Indonesia/indonesia/raw/blacklist.pkl", #
        f"{basepath}/Russia/russia/raw/blacklist.pkl", #
    ]
    
    data_list = []
    for idx,path in enumerate(datasets):
        raw_path = '/'.join(path.split('/')[:-1])
        data=pd.read_csv(path, index_col=0)
        def append_basepath(row):
            return f"{raw_path}/{row.image_path}"
        data['image_path'] = data.apply(lambda row : append_basepath(row), axis='columns')
        data['autocrop']=autocrop[idx]
        data_list.append( data )
    data = pd.concat(data_list,axis='rows')
    blacklist = []
    for path in blacklists:
        images = pickle.load(open(path,'rb'))['black_list']
        blacklist.extend(images)
    #pprint(blacklist)
    data = data[~data['project_id'].isin(blacklist)]
    data=data.reset_index()
    return data

data = read_datasets()

In [26]:
data.head()

Unnamed: 0,index,dataset_name,project_id,image_path,insertion_date,metadata,target,autocrop
0,135,china,china_CHNCXR_0001_0_E464A8,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 45, 'has_tb': False,...",0,False
1,323,china,china_CHNCXR_0002_0_961172,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 63, 'has_tb': False,...",0,False
2,102,china,china_CHNCXR_0003_0_BA565D,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'female', 'age': 48, 'has_tb': Fals...",0,False
3,229,china,china_CHNCXR_0004_0_96C984,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 58, 'has_tb': False,...",0,False
4,37,china,china_CHNCXR_0005_0_B6ECEF,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 28, 'has_tb': False,...",0,False


## Prediction

In [10]:

def load_model( path ):

    def preproc_for_convnets( path ,channels=3, image_shape=(256,256), crop : bool=False):
        image_encoded = tf.io.read_file(path)
        image = tf.io.decode_jpeg(image_encoded, channels=channels)
        # image = tf.image.rgb_to_grayscale(image)
        image = tf.cast(image, dtype=tf.float32) / tf.constant(255., dtype=tf.float32)
        if crop:
            shape = tf.shape(image) 
            image = tf.image.crop_to_bounding_box(image, 0,0,shape[0]-70,shape[1])
        image = tf.image.resize(image, image_shape, method='nearest')
        return image.numpy()
    

    preproc = {
        'convnets': preproc_for_convnets
    }

    logger.info(f"reading file from {path}...")
    with open(path, 'rb') as f:
        d = pickle.load(f)
        name = d["__name__"]
        version = d["__version__"]
        if name == "convnets":
            logger.info(f"strategy is {name}...")
            if version == 1:
                metadata = d['metadata']
                model = d["model"]
                model = tf.keras.models.model_from_json( json.dumps( d['model']['sequence'], separators=(',',':')) )
                model.set_weights( d['model']['weights'] )
                sort = metadata['sort']; test = metadata['test']
                logger.info(f"sort = {sort} and test = {test}")
                history = d['history']
                threshold = {}
                logger.info("current operation points:")
                for key, values in history.items():
                    if (type(values)==collections.OrderedDict) and (key!='summary') and (not 'val' in key): # get all thresholds dicts
                        threshold[key]=values['threshold']
                        logger.info(f"   {key} : {values['threshold']}")
                meta = d['metadata']
                tag = f"{name}-{meta['type']}-test{test}-sort{sort}"
                logger.info(f"model tag : {tag}")
                return model, preproc[name], threshold, tag
            else:
                logger.error(f"version {version} not supported.")
        elif name == "oneclass-svm":
            logger.error("not implemented yet.")
        else:
            logger.error(f"name {name} not supported.")

def add(d, key,value):
    if key in d.keys():
        d[key].append(value)
    else:
        d[key] = [value]  

In [11]:

for row_idx, row in best_tests.iterrows():
    d = {}
    path = row.file_name
    train_tag = row.train_tag

    if os.path.exists(f"{train_tag}_outputs.csv"):
        continue
    model, preproc, threshold, _ = load_model(path)
    with tqdm(total=len(data), file=sys.stdout) as pbar:
        #images= []
        for idx, row_data in data.iterrows():
            img = preproc(row_data.image_path, crop=row_data.autocrop)
            img = np.expand_dims(img, 0)
            #images.append(img)
            output = model.predict(img, verbose=False)[0][0]
            add(d, 'project_id', row_data.project_id )
            add(d, 'dataset_name' , row_data.dataset_name)
            add(d, f'{train_tag}_output', output )
            for key, value in threshold.items():
                add(d,f"{train_tag}_{key}_accept", output>value) 
            pbar.set_description('processed: %d' % (1 + idx))
            pbar.update(1)
    predictions = pd.DataFrame(d)
    predictions.to_csv(f"{train_tag}_outputs.csv")

## Final data:

In [42]:

final_data=data.copy()
for row_idx, row in best_tests.iterrows():
    path = row.file_name
    train_tag = row.train_tag
    path=f"{train_tag}_outputs.csv"
    predictions = pd.read_csv(path)
    #predictions.drop_duplicates(subset=['project_id'],inplace=True)
    #predictions.to_csv(path, index=False)
    predictions.drop(columns=['dataset_name'],inplace=True)
    #print(len(predictions))
    #final_data=pd.concat([final_data, predictions],axis='columns')
    final_data=pd.merge(final_data, predictions, on='project_id', how='outer')

In [43]:
#final_data.drop(columns=['autocrop'],inplace=True)
final_data.target=final_data.target.astype(bool)

In [44]:
final_data.head()

Unnamed: 0,index,dataset_name,project_id,image_path,insertion_date,metadata,target,autocrop,v0.alto.sh-sc-ma.ewp_output_x,v0.alto.sh-sc-ma.ewp_sens90_accept_x,...,v1.base.sh-sc.e_max_sp_accept,v1.base.sh-sc.e_spec70_accept,v1.inte.sh-sc-ma.ewp_output,v1.inte.sh-sc-ma.ewp_sens90_accept,v1.inte.sh-sc-ma.ewp_max_sp_accept,v1.inte.sh-sc-ma.ewp_spec70_accept,v1.inte.sh-sc.ewp_output,v1.inte.sh-sc.ewp_sens90_accept,v1.inte.sh-sc.ewp_max_sp_accept,v1.inte.sh-sc.ewp_spec70_accept
0,135,china,china_CHNCXR_0001_0_E464A8,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 45, 'has_tb': False,...",False,False,0.001588,False,...,False,False,0.017651,False,False,False,0.023274,False,False,False
1,323,china,china_CHNCXR_0002_0_961172,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 63, 'has_tb': False,...",False,False,0.000239,False,...,False,False,0.015422,False,False,False,0.00115,False,False,False
2,102,china,china_CHNCXR_0003_0_BA565D,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'female', 'age': 48, 'has_tb': Fals...",False,False,0.00318,False,...,False,True,0.067521,False,False,False,0.062431,False,False,False
3,229,china,china_CHNCXR_0004_0_96C984,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 58, 'has_tb': False,...",False,False,0.00784,False,...,False,False,0.02243,False,False,False,0.084395,False,False,False
4,37,china,china_CHNCXR_0005_0_B6ECEF,/mnt/brics_data/public/datasets/Shenzhen/china...,2021-08-17,"{'gender': 'male', 'age': 28, 'has_tb': False,...",False,False,0.000756,False,...,False,False,0.172292,False,False,True,0.027037,False,False,False


In [45]:
final_data.to_csv('final_result.csv',index=False)

In [46]:
len(final_data)

2526

In [29]:
len(data)

2526