# Appendicitis Data Preprocessing

In this notebook, we prepare the pediatric appendicitis dataset for model training. 

In [None]:
# Imports
import sys
import copy
import glob
import json
import re
import os
import numpy as np
import pandas as pd
from shutil import copyfile
from sklearn.impute import KNNImputer
from sklearn.model_selection import GroupShuffleSplit, StratifiedKFold
sys.path.insert(0, '../')
from datasets.preprocessing import preprocess
from datasets.generate_app_data import generate_files
from DeepFill.run_preprocessing import deep_fill

In [None]:
# Notebook-wide configuration
# TODO: fill in relevant directories

# Location of the original, unprocessed ultrasound images
IMAGE_DIR = '...'
# Where the unprocessed ultrasound images of the training set are saved
TRAIN_IMAGE_DIR = '...'
# Where the unprocessed ultrasound images of the test set are saved
TEST_IMAGE_DIR = '...'
# Where the DeepFill model writes its temporary output
DEPFILL_TEMP_DIR = '...'
# Where the preprocessed ultrasound images are saved
PREPROC_IMAGE_DIR = '...'
# Excel file holding the tabular data
CLINICAL_DATA_FILE = '...'
# Where the data dictionaries are saved
OUTPUT_DIR = '...'
# File naming the images that contain multiple US snapshots
BLACKLIST_FILE = '...'

# Padding applied to the images: 'constant', 'speckle', 'reflect' or 'resize'
PADDING_MODE = 'constant'

# Variable to predict: 'diagnosis', 'treatment' or 'complications'
TARGET_LABEL = 'diagnosis'

# Names of the features retrieved from the tabular data; this list is also
# used as the column order of the feature DataFrame
TABULAR_FEATURES = [
    'Age',
    'Sex',
    'Height',
    'Weight',
    'BMI',
    'Alvarado_Score',
    'Paedriatic_Appendicitis_Score',
    'Peritonitis',
    'Migratory_Pain',
    'Lower_Right_Abd_Pain',
    'Contralateral_Rebound_Tenderness',
    'Coughing_Pain',
    'Psoas_Sign',
    'Nausea',
    'Loss_of_Appetite',
    'Body_Temperature',
    'Dysuria',
    'Stool',
    'WBC_Count',
    'Neutrophil_Percentage',
    'CRP',
    'Ketones_in_Urine',
    'RBC_in_Urine',
    'WBC_in_Urine',
    'Appendix_on_US',
    'Appendix_Diameter',
    'Free_Fluids',
    'Appendix_Wall_Layers',
    'Target_Sign',
    'Perfusion',
    'Perforation',
    'Surrounding_Tissue_Reaction',
    'Pathological_Lymph_Nodes',
    'Bowel_Wall_Thickening',
    'Ileus',
    'Coprostasis',
    'Meteorism',
    'Enteritis',
    'Appendicular_Abscess',
    'Conglomerate_of_Bowel_Loops',
    'Gynecological_Findings',
]

# Features that take continuous values (these are the ones standardized later)
REAL_VALUED = [
    'Age',
    'Height',
    'Weight',
    'BMI',
    'Body_Temperature',
    'WBC_Count',
    'Neutrophil_Percentage',
    'CRP',
    'Appendix_Diameter',
]

# Features used as concepts
# NOTE: includes 'Appendix_Diameter', which is also listed in REAL_VALUED
CONCEPTS = [
    'Appendix_on_US',
    'Free_Fluids',
    'Appendix_Wall_Layers',
    'Target_Sign',
    'Surrounding_Tissue_Reaction',
    'Pathological_Lymph_Nodes',
    'Bowel_Wall_Thickening',
    'Coprostasis',
    'Meteorism',
    'Enteritis',
    'Appendix_Diameter',
    'Perforation',
    'Appendicular_Abscess',
    'Conglomerate_of_Bowel_Loops',
    'Gynecological_Findings',
]

# Create the train/test image directories if they do not exist.
# exist_ok=True replaces the separate existence check, so there is no window
# between checking and creating, and a path that exists as a regular file is
# reported instead of being silently accepted.
os.makedirs(TRAIN_IMAGE_DIR, exist_ok=True)
os.makedirs(TEST_IMAGE_DIR, exist_ok=True)

## Test Set

First, an independent test set is reserved.

In [None]:
def _patient_codes(image_dir):
    """Return an integer array with one patient code per file in `image_dir`.

    The patient code is the part of the file name that precedes the first
    '_', ' ' or '.'; it must be an integer (otherwise `int` raises ValueError).
    """
    codes = []
    for path in glob.glob(os.path.join(image_dir, '*')):
        # basename works with either path separator, unlike splitting on '/'
        name = os.path.basename(path)
        codes.append(int(re.split(r'_| |\.', name)[0]))
    return np.array(codes, dtype=int)


def test_split(train_image_dir, test_image_dir):
    """Check that the training and test sets form disjoint sets of subjects.

    Prints the number of patients and images found in each directory, followed
    by a confirmation when no patient code occurs in both directories, or a
    warning listing the shared patient codes otherwise.

    Args:
        train_image_dir: directory holding the training images.
        test_image_dir: directory holding the test images.
    """
    train_groups = _patient_codes(train_image_dir)
    test_groups = _patient_codes(test_image_dir)

    print("\nNumber of patients in train set: ", len(np.unique(train_groups)))
    print("Number of patients in test set: ", len(np.unique(test_groups)))
    print("\nNumber of images in train set: ", len(train_groups))
    print("Number of images in test set: ", len(test_groups))

    overlap = np.intersect1d(train_groups, test_groups)
    if len(overlap) == 0:
        print("\nPatients in train and test set don't overlap!")
    else:
        # Report the failure explicitly instead of staying silent
        print("\nWARNING: patients in train and test set overlap: ", overlap.tolist())


# The name starts with `test_`, so pytest would try to collect this helper as a
# test case if it is ever imported into a test module; opt out explicitly.
test_split.__test__ = False

In [None]:
# Split the original dataset
image_file_list = glob.glob(IMAGE_DIR + '/*')
image_names = []
groups = []
for file in image_file_list:
    # basename works with either path separator, unlike splitting on '/'
    name = os.path.basename(file)
    # The patient code is the part of the name before the first '_', ' ' or '.'
    patient_code = re.split(r'_| |\.', name)[0]
    image_names.append(name)
    groups.append(int(patient_code))
groups = np.array(groups, dtype=int)
print("Total number of patients having US images: ", len(np.unique(groups)))
print("Total number of images: ", len(image_file_list))

# TODO: insert the relevant directory
# Load the list from a CSV file containing patient codes for the predefined test set
# NOTE: set to None to generate another train-test split
test_set_codes = np.genfromtxt('...')

# Copy train and test images to new folders
if test_set_codes is not None:
    # Build the lookup set once instead of scanning the array for every image
    test_codes = set(np.asarray(test_set_codes).ravel().tolist())
    for image_name, group in zip(image_names, groups):
        target_dir = TEST_IMAGE_DIR if int(group) in test_codes else TRAIN_IMAGE_DIR
        copyfile(os.path.join(IMAGE_DIR, image_name), os.path.join(target_dir, image_name))

else:
    # Random 90/10 split by patient: all images sharing a patient code end up
    # on the same side of the split
    gss = GroupShuffleSplit(n_splits=1, train_size=.9, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) index pair
    train_idx, test_idx = next(gss.split(image_names, groups=groups))

    train_positions = set(train_idx.tolist())
    for image_idx, image_name in enumerate(image_names):
        target_dir = TRAIN_IMAGE_DIR if image_idx in train_positions else TEST_IMAGE_DIR
        copyfile(os.path.join(IMAGE_DIR, image_name), os.path.join(target_dir, image_name))

# Verify that patients in the splits do not overlap
test_split(TRAIN_IMAGE_DIR, TEST_IMAGE_DIR)

## DeepFill 

Before cropping the images to the required dimensions, markers and annotations can be filled in automatically with DeepFill. All file extensions will be changed from `.bmp` to `.png`, and all spaces in the file names will be replaced with `'_'`.

In [None]:
# Directory with pattern templates to be removed (shared by both splits)
_deepfill_patterns = "../DeepFill/preproc/patterns"

# One row per split: raw dataset directory, DeepFill temporary sub-directory,
# DeepFill mask sub-directory, DeepFill output sub-directory
_deepfill_jobs = (
    (TRAIN_IMAGE_DIR, 'train', 'train_mask', 'deepfilled_train'),
    (TEST_IMAGE_DIR, 'test', 'test_mask', 'deepfilled_test'),
)

# Run DeepFill on the training images first, then on the test images
for _raw_dir, _temp_name, _mask_name, _output_name in _deepfill_jobs:
    deep_fill(
        _raw_dir,
        os.path.join(DEPFILL_TEMP_DIR, _temp_name),
        os.path.join(DEPFILL_TEMP_DIR, _mask_name),
        os.path.join(PREPROC_IMAGE_DIR, _output_name),
        _deepfill_patterns,
    )

## Cropping

Next, we crop the images using the specified padding mode.

In [None]:
def _make_crop_config(source_subdir, target_subdir):
    # Assemble the settings dictionary passed to `preprocess` for one split:
    # input images, output location, padding mode and the two boolean flags
    return dict(
        images=os.path.join(PREPROC_IMAGE_DIR, source_subdir),
        target=os.path.join(PREPROC_IMAGE_DIR, PADDING_MODE + target_subdir),
        padding=PADDING_MODE,
        debug=False,
        no_black_triangles=False,
    )


config_deepfill_train = _make_crop_config('deepfilled_train', '_padding/deepfilled_cropped_train/')
config_deepfill_test = _make_crop_config('deepfilled_test', '_padding/deepfilled_cropped_test/')

# Crop the training images first, then the test images
for _crop_config in (config_deepfill_train, config_deepfill_test):
    preprocess(_crop_config)

## Dictionary File Generation

Next, dictionary files are generated for the training and test sets. A list of images to be excluded from the dataset can be passed as an argument.

In [None]:
import warnings
warnings.filterwarnings('ignore')

def build_config(image_dir, target_dir, target_file, blacklist, concepts):
    """Build the configuration dictionary for generating a data dictionary.

    The label column and its positive/negative values are selected by the
    module-level TARGET_LABEL.

    Args:
        image_dir: stored under 'image_dir'.
        target_dir: stored under 'output_dir'.
        target_file: stored under 'output_file'.
        blacklist: stored under 'blacklist'.
        concepts: stored under 'concepts'.

    Returns:
        A dict with the keys 'info_file', 'image_dir', 'output_dir',
        'output_file', 'blacklist', 'label', 'true_label', 'false_label'
        and 'concepts'.

    Raises:
        ValueError: if TARGET_LABEL is not 'diagnosis', 'treatment' or
            'complications'. A mistyped target is rejected instead of being
            treated as 'complications'.
    """
    # target label -> (label column, values counted as positive, values counted as negative)
    label_specs = {
        'diagnosis': ('Diagnosis', ['appendicitis'], ['no appendicitis']),
        'treatment': ('Management',
                      ['primary surgical', 'secondary surgical', 'simultaneous appendectomy'],
                      ['conservative']),
        'complications': ('Severity', ['complicated'], ['uncomplicated']),
    }
    if TARGET_LABEL not in label_specs:
        raise ValueError(
            f"Unsupported TARGET_LABEL {TARGET_LABEL!r}; expected one of {sorted(label_specs)}")
    label_col_name, true_label, false_label = label_specs[TARGET_LABEL]

    config = {
        'info_file': CLINICAL_DATA_FILE,
        'image_dir': image_dir,
        'output_dir': target_dir,
        'output_file': target_file,
        'blacklist': blacklist,
        'label': label_col_name,
        'true_label': true_label,
        'false_label': false_label,
        'concepts': concepts
    }

    return config

# Generate the data dictionaries from the cropped images.
# The image directories are derived from PADDING_MODE (rather than hard-coding
# 'constant_padding') so that they match the `target` directories written by
# the cropping step for every padding mode.
config_gen_deepfilled_cropped_train = build_config(
    image_dir=os.path.join(PREPROC_IMAGE_DIR, PADDING_MODE + '_padding/deepfilled_cropped_train'),
    target_dir=os.path.join(OUTPUT_DIR, TARGET_LABEL),
    target_file='app_data_train',
    blacklist=BLACKLIST_FILE,
    concepts=CONCEPTS)

config_gen_deepfilled_cropped_test = build_config(
    image_dir=os.path.join(PREPROC_IMAGE_DIR, PADDING_MODE + '_padding/deepfilled_cropped_test'),
    target_dir=os.path.join(OUTPUT_DIR, TARGET_LABEL),
    target_file='app_data_test',
    blacklist=BLACKLIST_FILE,
    concepts=CONCEPTS)

generate_files(config_gen_deepfilled_cropped_train)

generate_files(config_gen_deepfilled_cropped_test)

## Imputation of missing values 

Tabular variables contain missing values that are imputed using the $k$-nearest neighbors method. To prevent data leakage, validation/test data are not used when fitting the imputer.

In [None]:
def decompose_dict(f):
    """Split a data dictionary into keys, image names, features and labels.

    Every value of `f` is indexed as a sequence: element 0 is collected into
    `image_names`, element 1 into `y` and element 2 (one value per entry of
    TABULAR_FEATURES) into a row of `X`. Any further elements are ignored.

    Args:
        f: the data dictionary (mapping key -> sequence as described above).

    Returns:
        A tuple (keys, image_names, X, y) where `keys` and `image_names` are
        lists in the iteration order of `f`, `X` is a DataFrame with the
        columns TABULAR_FEATURES and `y` is a NumPy array.
    """
    # Materialize keys and values once; rebuilding these lists on every
    # iteration made the original loop quadratic in the number of entries
    keys = list(f)
    entries = list(f.values())
    image_names = [entry[0] for entry in entries]
    y = np.array([entry[1] for entry in entries])
    X = pd.DataFrame([entry[2] for entry in entries], columns=TABULAR_FEATURES)
    return keys, image_names, X, y

def impute_folds(keys, image_names, X, y, dataset_name):
    """Impute and standardize the tabular data for 5-fold cross-validation.

    For every fold of a stratified, shuffled 5-fold split (random_state=42),
    a 1-nearest-neighbor imputer is fitted on the training rows only and then
    applied to the validation rows. The REAL_VALUED columns are standardized
    with the mean and standard deviation of the imputed training rows. The
    result is written as JSON to
    OUTPUT_DIR/TARGET_LABEL/imputed/fold<k>/<dataset_name>.

    Args:
        keys: keys of the data dictionary, aligned with the rows of `X`.
        image_names: per-key image name entries, aligned with `keys`.
        X: DataFrame of tabular features, possibly with missing values.
        y: array of labels, aligned with `keys`.
        dataset_name: name of the output file written in each fold directory.
    """
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_ids, val_ids) in enumerate(kfold.split(X, y)):
        X_imp = copy.deepcopy(X)
        imputer = KNNImputer(n_neighbors=1)
        # Fit on the training rows only so that validation data does not leak
        X_imp.iloc[train_ids] = imputer.fit_transform(X_imp.iloc[train_ids])
        X_imp.iloc[val_ids] = imputer.transform(X_imp.iloc[val_ids])
        X_imp_norm = copy.deepcopy(X_imp)
        # Standardize real-valued variables with training-fold statistics
        for real_var in REAL_VALUED:
            train_values = X_imp[real_var].iloc[train_ids].values
            mean = np.mean(train_values)
            std = np.std(train_values)
            if std == 0:
                # Constant column: only center it, avoiding division by zero
                std = 1.0
            X_imp_norm[real_var] = (X_imp[real_var] - mean) / std
        concepts = X_imp_norm[CONCEPTS]
        # Entry layout: [image name(s), label, feature vector, concept vector]
        mapping = {
            key: [image_names[idx], int(y[idx]), X_imp_norm.iloc[idx].values.tolist(),
                  concepts.iloc[idx].values.tolist()]
            for idx, key in enumerate(keys)
        }
        fold_dir = os.path.join(OUTPUT_DIR, TARGET_LABEL, 'imputed', f'fold{fold}')
        os.makedirs(fold_dir, exist_ok=True)
        with open(os.path.join(fold_dir, dataset_name), 'w') as out_file:
            json.dump(mapping, out_file)
            
def impute_final(keys, image_names, X, y, dataset_name, keys_test, image_names_test, X_test, y_test,
                 test_dataset_name):
    """Impute and standardize the tabular data for the train-test split.

    A 1-nearest-neighbor imputer is fitted on the training data only and then
    applied to the test data. The REAL_VALUED columns of both sets are
    standardized with the mean and standard deviation of the imputed training
    data. Both results are written as JSON to
    OUTPUT_DIR/TARGET_LABEL/imputed/final/.

    Args:
        keys: keys of the training data dictionary, aligned with the rows of `X`.
        image_names: per-key image name entries of the training set.
        X: DataFrame of training features, possibly with missing values.
        y: array of training labels.
        dataset_name: name of the output file for the training set.
        keys_test: keys of the test data dictionary, aligned with `X_test`.
        image_names_test: per-key image name entries of the test set.
        X_test: DataFrame of test features, possibly with missing values.
        y_test: array of test labels.
        test_dataset_name: name of the output file for the test set.
    """
    imputer = KNNImputer(n_neighbors=1)

    # Fit on the training data only so that test data does not leak
    X = pd.DataFrame(imputer.fit_transform(X), columns=TABULAR_FEATURES)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=TABULAR_FEATURES)
    X_norm = copy.deepcopy(X)
    X_test_norm = copy.deepcopy(X_test)
    # Standardize real-valued variables with training-set statistics
    for real_var in REAL_VALUED:
        mean = np.mean(X[real_var].values)
        std = np.std(X[real_var].values)
        if std == 0:
            # Constant column: only center it, avoiding division by zero
            std = 1.0
        X_norm[real_var] = (X[real_var] - mean) / std
        X_test_norm[real_var] = (X_test[real_var] - mean) / std
    concepts = X_norm[CONCEPTS]
    concepts_test = X_test_norm[CONCEPTS]
    # Entry layout: [image name(s), label, feature vector, concept vector]
    mapping = {
        key: [image_names[idx], int(y[idx]), X_norm.iloc[idx].values.tolist(),
              concepts.iloc[idx].values.tolist()]
        for idx, key in enumerate(keys)
    }
    mapping_test = {
        key: [image_names_test[idx], int(y_test[idx]), X_test_norm.iloc[idx].values.tolist(),
              concepts_test.iloc[idx].values.tolist()]
        for idx, key in enumerate(keys_test)
    }

    final_dir = os.path.join(OUTPUT_DIR, TARGET_LABEL, 'imputed', 'final')
    os.makedirs(final_dir, exist_ok=True)

    with open(os.path.join(final_dir, dataset_name), 'w') as out_file:
        json.dump(mapping, out_file)
    with open(os.path.join(final_dir, test_dataset_name), 'w') as out_file:
        json.dump(mapping_test, out_file)

In [None]:
def _load_data_dict(name):
    # Read the JSON data dictionary called `name` for the current target label
    with open(os.path.join(OUTPUT_DIR, TARGET_LABEL, name)) as dict_file:
        return json.load(dict_file)


# Training set: per-fold imputation for the cross-validation
dataset_name = 'app_data_train'
data_dict = _load_data_dict(dataset_name)
keys, image_names, X, y = decompose_dict(data_dict)
impute_folds(keys=keys, image_names=image_names, X=X, y=y, dataset_name=dataset_name)

# Training and test sets: imputation for the final train-test split
test_dataset_name = 'app_data_test'
data_dict_test = _load_data_dict(test_dataset_name)
keys_test, image_names_test, X_test, y_test = decompose_dict(data_dict_test)
impute_final(
    keys=keys,
    image_names=image_names,
    X=X,
    y=y,
    dataset_name=dataset_name,
    keys_test=keys_test,
    image_names_test=image_names_test,
    X_test=X_test,
    y_test=y_test,
    test_dataset_name=test_dataset_name,
)