In [1]:
import os,gc
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from functools import partial
import albumentations as albu

from sklearn.metrics import roc_auc_score

from kaggle_datasets import KaggleDatasets 
from sklearn.model_selection import train_test_split 
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.applications.efficientnet as efn 
from tensorflow.keras.applications import ResNet152

In [2]:
def auto_select_accelerator():
    """Pick a distribution strategy, preferring a TPU when one can be resolved.

    Returns:
        tuple: ``(strategy, tpu_ok)``. ``strategy`` is a TPU strategy when the
        TPU cluster was resolved and initialised, otherwise whatever
        ``tf.distribute.get_strategy()`` returns. ``tpu_ok`` is ``True`` only
        in the TPU case.
    """
    tpu_ok = False
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        # Prefer the stable class; `tf.distribute.experimental.TPUStrategy` is
        # deprecated. Fall back to it only on builds without the stable name.
        strategy_cls = getattr(tf.distribute, "TPUStrategy", None)
        if strategy_cls is None:
            strategy_cls = tf.distribute.experimental.TPUStrategy
        strategy = strategy_cls(tpu)
        tpu_ok = True
        print("Running on TPU:", tpu.master())
    except ValueError:
        # A ValueError during TPU setup is treated as "no TPU available".
        strategy = tf.distribute.get_strategy()
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    return strategy, tpu_ok

# tf.data's autotune sentinel, passed below as num_parallel_calls / prefetch size.
AUTO = tf.data.experimental.AUTOTUNE
strategy, tpu_ok = auto_select_accelerator()

# With a TPU the competition data is addressed through its GCS path;
# otherwise it is read from the local input directory.
load_dir = (
    KaggleDatasets().get_gcs_path("ranzcr-clip-catheter-line-classification")
    if tpu_ok
    else "../input/ranzcr-clip-catheter-line-classification"
)

Running on TPU: grpc://10.0.0.2:8470
Running on 8 replicas


In [3]:
# Target column names. Their order fixes the order of the stacked target
# vector and of the prediction CSV columns.
labels = np.array([
    # endotracheal tube
    "ETT - Abnormal",
    "ETT - Borderline",
    "ETT - Normal",
    # nasogastric tube
    "NGT - Abnormal",
    "NGT - Borderline",
    "NGT - Incompletely Imaged",
    "NGT - Normal",
    # central venous catheter
    "CVC - Abnormal",
    "CVC - Borderline",
    "CVC - Normal",
    # Swan-Ganz catheter
    "Swan Ganz Catheter Present",
])

# When True, the epoch count below drops to 1.
DEBUG = False


class CONFIG:
    """Notebook-wide settings, read as plain class attributes."""

    version = 1                                      # tag embedded in the prediction CSV names
    batchsize = 16 * strategy.num_replicas_in_sync   # 16 per replica
    epochs = 1 if DEBUG else 16
    imsize = (512, 512)
    shuffle = 2048
    seed = 123                                       # passed to seed_everything()
    lr = 1e-4
    min_lr = 5e-6
    max_lr = 5e-4
    n_labels = 11                                    # width of the model's output layer
    n_folds = 5
    tta = 2                                          # augmented passes averaged per sample


print(f"EPOCHS : {CONFIG.epochs}")
print(f"BATCHSIZE : {CONFIG.batchsize}")

EPOCHS : 16
BATCHSIZE : 128


In [4]:
## Utils 
def seed_everything(seed):
    """Seed NumPy's global RNG and TensorFlow, and export PYTHONHASHSEED.

    Args:
        seed: Seed value handed unchanged to NumPy and TensorFlow; its string
            form is written to the ``PYTHONHASHSEED`` environment variable.
    """
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = f"{seed}"
    tf.random.set_seed(seed)
    
## decoder 
def _decode_image(img, size):
    """Decode JPEG bytes into a float32 3-channel image resized to ``size``.

    Args:
        img: Encoded JPEG contents.
        size: ``(height, width)`` handed to ``tf.image.resize``.

    Returns:
        The decoded image, cast to float32, divided by 255 and resized.
    """
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.cast(img, tf.float32)  # cast to float32 before scaling
    img = img / 255.0
    return tf.image.resize(img, size)


def decoder_image512(img):
    """Decode JPEG bytes into a float32 image of size 512x512."""
    return _decode_image(img, (512, 512))


def decoder_image768(img):
    """Decode JPEG bytes into a float32 image of size 768x768."""
    return _decode_image(img, (768, 768))


## Augmentation
def augmenter(img, label):
    """Apply a random horizontal flip, then a random vertical flip, to ``img``.

    Args:
        img: Image to augment.
        label: Passed through untouched.

    Returns:
        tuple: ``(augmented_image, label)``.
    """
    flipped = tf.image.random_flip_left_right(img)
    flipped = tf.image.random_flip_up_down(flipped)
    return flipped, label

## For TFRecord 
def _labeled_tfrec_format():
    """Build the feature spec of one labeled record.

    The spec holds the encoded image, one scalar int64 per entry of the
    module-level ``labels`` array, and the two string identifier fields.
    Deriving the label keys from ``labels`` keeps the parser and the target
    stacking in agreement.
    """
    spec = {"image": tf.io.FixedLenFeature([], tf.string)}
    for name in labels:
        spec[str(name)] = tf.io.FixedLenFeature([], tf.int64)
    spec["StudyInstanceUID"] = tf.io.FixedLenFeature([], tf.string)
    spec["PatientID"] = tf.io.FixedLenFeature([], tf.string)
    return spec


def _read_labeled_tfrecord(example, decoder):
    """Parse one serialized record into ``(image, targets)`` using ``decoder``.

    Args:
        example: One serialized record.
        decoder: Callable turning the record's encoded image into an image.

    Returns:
        tuple: the decoded image and the label values stacked as float32 in
        the order of ``labels``.
    """
    parsed = tf.io.parse_single_example(example, _labeled_tfrec_format())
    image = decoder(parsed["image"])
    targets = tf.stack([tf.cast(parsed[str(name)], tf.float32) for name in labels])
    return image, targets


def read_labeled_tfrecord512(example):
    """Parse one labeled record, decoding its image with ``decoder_image512``."""
    return _read_labeled_tfrecord(example, decoder_image512)


def read_labeled_tfrecord768(example):
    """Parse one labeled record, decoding its image with ``decoder_image768``."""
    return _read_labeled_tfrecord(example, decoder_image768)

def _make_dataset(paths, parse_fn, cache_dir, augment, repeat, shuffle):
    """Build the batched input pipeline shared by both image resolutions.

    Stage order: read records -> parse -> optional cache -> optional
    augmentation -> optional repeat -> optional shuffle -> batch -> prefetch.
    Caching sits before augmentation so the random flips are not cached.

    Args:
        paths: TFRecord file name(s) handed to ``tf.data.TFRecordDataset``.
        parse_fn: Function mapping one serialized record to ``(image, targets)``.
        cache_dir: Falsy to disable caching; otherwise a path that is created
            if needed and passed to ``Dataset.cache``.
        augment: When truthy, map ``augmenter`` over the parsed examples.
        repeat: When truthy, repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; falsy disables shuffling.

    Returns:
        The dataset, batched to ``CONFIG.batchsize`` with prefetching enabled.
    """
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    dset = tf.data.TFRecordDataset(paths)
    dset = dset.map(parse_fn, num_parallel_calls=AUTO)
    if cache_dir:
        dset = dset.cache(cache_dir)
    if augment:
        dset = dset.map(augmenter, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        dset = dset.shuffle(shuffle)
    return dset.batch(CONFIG.batchsize).prefetch(AUTO)


def make_dataset512(paths,cache_dir=False,augment=False,repeat=False,shuffle=0):
    """Build the input pipeline whose records are parsed by ``read_labeled_tfrecord512``."""
    return _make_dataset(paths, read_labeled_tfrecord512, cache_dir, augment, repeat, shuffle)


def make_dataset768(paths,cache_dir=False,augment=False,repeat=False,shuffle=0):
    """Build the input pipeline whose records are parsed by ``read_labeled_tfrecord768``."""
    return _make_dataset(paths, read_labeled_tfrecord768, cache_dir, augment, repeat, shuffle)

In [5]:
## Metrics
def mean_roc_auc(targets,probabilities):
    """Average the per-column ROC AUC over the first ``CONFIG.n_labels`` columns.

    Args:
        targets: 2-D array of ground-truth labels, one column per label.
        probabilities: 2-D array of predicted scores with the same column layout.

    Returns:
        The unweighted mean of the column-wise ``roc_auc_score`` values.
    """
    per_label = []
    for col in range(CONFIG.n_labels):
        per_label.append(roc_auc_score(targets[:, col], probabilities[:, col]))
    return np.average(per_label)


def create_model(imsize):
    """Build the ResNet152 classifier inside the distribution strategy scope.

    Args:
        imsize: ``(height, width)`` of the input images; a channel axis of 3
            is appended.

    Returns:
        A ``tf.keras.Sequential`` model: ResNet152 without its top and without
        preloaded weights, global average pooling, then a sigmoid Dense layer
        with ``CONFIG.n_labels`` units.
    """
    with strategy.scope():
        backbone = ResNet152(input_shape=(*imsize, 3), weights=None, include_top=False)
        pooling = tf.keras.layers.GlobalAveragePooling2D()
        head = tf.keras.layers.Dense(CONFIG.n_labels, activation="sigmoid")
        model = tf.keras.Sequential([backbone, pooling, head])
    return model

In [None]:
def _predict_tta(imsize, weights_path, tfrec_path, make_dataset, valid_size):
    """Load one saved model and return its TTA-averaged predictions for one fold file.

    Args:
        imsize: ``(height, width)`` the model is built for.
        weights_path: Weights file loaded into the freshly built model.
        tfrec_path: TFRecord file holding the fold's validation examples.
        make_dataset: Dataset factory (``make_dataset512`` / ``make_dataset768``).
        valid_size: Number of validation examples in the fold.

    Returns:
        Array of shape ``(valid_size, CONFIG.n_labels)``: the mean over
        ``CONFIG.tta`` augmented passes.
    """
    n_preds = CONFIG.tta * valid_size
    # Ceil-divide so the repeated dataset yields at least `n_preds` rows.
    steps = (n_preds + CONFIG.batchsize - 1) // CONFIG.batchsize

    model = create_model(imsize)
    model.load_weights(weights_path)
    tta_dset = make_dataset(tfrec_path, augment=True, repeat=True)
    # The dataset is already batched; Keras documents that `batch_size` should
    # not be given for tf.data inputs, so it is left out here.
    pred_tta = model.predict(tta_dset, steps=steps, verbose=1)[:n_preds]
    # The un-shuffled, repeated dataset is read pass after pass, so a
    # Fortran-order reshape makes stacked[i, j] the j-th pass over sample i.
    stacked = pred_tta.reshape((valid_size, CONFIG.tta, CONFIG.n_labels), order="F")
    return np.mean(stacked, axis=1)


def ensemble(path1,path2,df,n_folds=None):
    """Score the average of the 512px CLAHE model and the 768px model per fold.

    For every fold, both saved models predict the fold's validation TFRecord
    with test-time augmentation, each model's predictions are written to a
    CSV, and the mean ROC AUC of the two-model average is printed. The mean
    of the fold scores is printed at the end.

    Args:
        path1: Directory holding the ``model_nb13_4_{fold}.h5`` weights (512px).
        path2: Directory holding the ``model_nb13_7_{fold}.h5`` weights (768px).
        df: DataFrame with a ``fold`` column and one column per entry of
            ``labels``. NOTE(review): rows of a fold are assumed to be in the
            same order as the records of that fold's TFRecord file - confirm.
        n_folds: Number of folds to evaluate; defaults to ``CONFIG.n_folds``,
            looked up at call time.

    Returns:
        None. Results are printed and written to CSV files.
    """
    if n_folds is None:
        n_folds = CONFIG.n_folds
    scores = []
    seed_everything(CONFIG.seed)
    load_dir1 = KaggleDatasets().get_gcs_path("ranzcrclahe")
    load_dir2 = KaggleDatasets().get_gcs_path("ranzcr-768-sgkf")

    for fold in range(n_folds):
        print("-"*50)
        print(f"FOLD : {fold}")

        # Take the validation size from the fold's own rows rather than from a
        # notebook-level global: it must equal the number of predictions for
        # the AUC computation below to be valid.
        valid_labels = df[df.fold == fold][labels].values
        valid_size = len(valid_labels)

        # TTA
        print("TTA...")
        pred1 = _predict_tta(
            (512, 512),
            path1 + f"/model_nb13_4_{fold}.h5",
            load_dir1 + f"/train_clahe_512_{fold}.tfrec",
            make_dataset512,
            valid_size,
        )
        pd.DataFrame(pred1,columns=labels).to_csv(f"pred_clahe_{CONFIG.version}_{fold}.csv",index=False)

        pred2 = _predict_tta(
            (768, 768),
            path2 + f"/model_nb13_7_{fold}.h5",
            load_dir2 + f"/train_768_{fold}.tfrec",
            make_dataset768,
            valid_size,
        )
        pd.DataFrame(pred2,columns=labels).to_csv(f"pred768_{CONFIG.version}_{fold}.csv",index=False)

        pred = (pred1 + pred2)/2
        auc = mean_roc_auc(valid_labels,pred)

        print(f"FOLD : AUC {auc}")
        scores.append(auc)
        # Models and datasets were local to _predict_tta; collect them now.
        gc.collect()

    scores = np.array(scores)
    print(f"SCORES : {scores.mean()}")

In [None]:
# Table read from train_folds.csv; ensemble() filters it by its `fold` column.
df = pd.read_csv("../input/ranzcr-sgkf-data/train_folds.csv")
# One count per fold, indexed by fold number (presumably the number of
# training rows of each fold - TODO confirm).
fold_size = [24080, 24042, 24092, 24065, 24053]
ensemble(path1="../input/model-clahe-512", path2="../input/nb13-7", df=df)