# Preprocessing of KneeMRI and Data Augmentation

In [1]:
import os
import pickle
import platform
from glob import glob

import numpy as np
import pandas as pd
import utils

In [2]:
# Root folder that holds the KneeMRI volumetric data
kneemri_data_dir = 'Data/KneeMRI'

# The metadata csv file sits directly inside the data folder
kneemri_metadata_csv_path = kneemri_data_dir + '/metadata.csv'

In [3]:
# On Windows, rewrite both configured paths to use backslash separators
if platform.system() == "Windows":
    kneemri_data_dir, kneemri_metadata_csv_path = (
        configured_path.replace('/', '\\')
        for configured_path in (kneemri_data_dir, kneemri_metadata_csv_path)
    )

In [4]:
# Dataset label meanings, keyed by the integer label value (0, 1, 2)
kneemri_labels = dict(enumerate(('healthy', 'partially ruptured', 'completely ruptured')))

In [5]:
# Collect every "vol*" entry under the data directory, in sorted order.
# The separator in the glob pattern depends on the platform.
mri_vol_paths = sorted(
    glob(kneemri_data_dir + ("\\vol*" if platform.system() == "Windows" else "/vol*"))
)

In [6]:
# Display the collected volume directories as a sanity check
mri_vol_paths

['Data/KneeMRI/vol01',
 'Data/KneeMRI/vol02',
 'Data/KneeMRI/vol03',
 'Data/KneeMRI/vol04',
 'Data/KneeMRI/vol05',
 'Data/KneeMRI/vol06',
 'Data/KneeMRI/vol07',
 'Data/KneeMRI/vol08',
 'Data/KneeMRI/vol09',
 'Data/KneeMRI/vol10']

In [7]:
# names=True makes genfromtxt interpret the first row of the csv file as column names.
# The dtype string describes ten 'i4' (4 byte signed integer) columns followed by
# one 'U20' (unicode string of at most 20 characters) column.
metadata = np.genfromtxt(
    kneemri_metadata_csv_path,
    delimiter=',',
    names=True,
    dtype=','.join(['i4'] * 10 + ['U20']),
)

In [8]:
# Wrap the structured array in a DataFrame; the field names read from the
# csv header become the column names
metadata_df = pd.DataFrame(metadata)

In [9]:
# Display the metadata table (one row per exam volume)
metadata_df

Unnamed: 0,examId,seriesNo,aclDiagnosis,kneeLR,roiX,roiY,roiZ,roiHeight,roiWidth,roiDepth,volumeFilename
0,329637,8,0,1,139,184,14,74,72,3,329637-8.pck
1,390116,9,0,0,113,105,10,83,98,6,390116-9.pck
2,404663,8,1,1,120,117,15,101,115,2,404663-8.pck
3,406320,9,0,0,117,124,12,91,80,3,406320-9.pck
4,412857,8,0,1,122,105,14,83,98,4,412857-8.pck
...,...,...,...,...,...,...,...,...,...,...,...
912,1027212,5,1,1,113,127,16,101,99,3,1027212-5.pck
913,1028019,5,1,1,105,102,14,95,100,3,1028019-5.pck
914,1028028,5,0,0,118,84,15,100,100,2,1028028-5.pck
915,1028069,5,0,0,105,97,15,103,106,4,1028069-5.pck


In [10]:
def preprocess_mri_vols(kneemri_data_paths, overwrite=False):
    """
    This function preprocesses all the MRI volumes in KneeMRI
    and stores them under 'Preprocessed_Data' directory.

    For every '*.pck' exam found in the given directories, the pickled volume
    is loaded, cast to float64, passed through utils.preprocess_mri and saved
    with np.save. The output path mirrors the input path, except that the
    first path component is replaced by 'Preprocessed_Data' and everything
    from the first '.' of the file name onwards is replaced by '.npy'
    (e.g. 'Data/KneeMRI/vol01/329637-8.pck' ->
    'Preprocessed_Data/KneeMRI/vol01/329637-8.npy').

    Args:
        kneemri_data_paths (list): List of the directories in KneeMRI dataset
        overwrite (bool, optional): Option to overwrite already preprocessed MRI
    """
    for mri_data_path in kneemri_data_paths:
        # os.path.join picks the right separator, so no platform branch is needed
        for exam in sorted(glob(os.path.join(mri_data_path, "*.pck"))):
            src_parts = os.path.normpath(exam).split(os.sep)

            # Keep the part of the file name before its first '.'
            stem = src_parts[-1].partition('.')[0]

            # Swap the first path component for the output root and keep the
            # remaining sub-directories as they are
            out_dir = os.path.join('Preprocessed_Data', *src_parts[1:-1])
            preprocessed_exam_path = os.path.join(out_dir, stem + '.npy')

            if not overwrite and os.path.exists(preprocessed_exam_path):
                continue  # Already preprocessed

            # NOTE(review): pickle.load can execute arbitrary code; only run
            # this on dataset files from a trusted source.
            with open(exam, 'rb') as file_handler:  # Must use 'rb' as the data is binary
                mri_vol = pickle.load(file_handler)

            # The input file is closed at this point; the rest works in memory
            mri_vol = mri_vol.astype(np.float64)  # Change the dtype to float64
            preprocessed_mri_vol = utils.preprocess_mri(mri_vol)
            os.makedirs(out_dir, exist_ok=True)
            np.save(preprocessed_exam_path, preprocessed_mri_vol)

In [12]:
def augment_mri_vols(kneemri_data_paths, metadata_df, aug_flip_prob=0.95, overwrite=False):
    """
    This function augments MRI volumes in KneeMRI dataset to create more samples
    for labels that have lower number of cases.

    Every '*.pck' exam found in the given directories is looked up in
    `metadata_df` by its file name and, depending on its aclDiagnosis value,
    turned into one or more augmented copies:

        0: one copy (flip + rotation), and only when the random draw
           np.random.rand() is >= aug_flip_prob
        1: two copies (flip; rotation)
        2: five copies (flip; flip + rotation; flip + rotation;
           rotation; rotation)

    Copy number i of '<name>.pck' is preprocessed with utils.preprocess_mri and
    saved as 'Preprocessed_Data/<sub-directories>/aug/<name>-aug-<i>.npy'. Its
    metadata row is the exam's original row with volumeFilename changed to
    '<name>-aug-<i>.pck' and, for copies that include a flip, kneeLR replaced
    by 1 - kneeLR. All these rows are written to 'metadata-aug.csv' in the
    parent directory of the first entry of `kneemri_data_paths`.

    The row of a selected copy is recorded even when its .npy file already
    exists and is not rewritten, so re-running with overwrite=False no longer
    replaces 'metadata-aug.csv' with an empty table. Augmented files of
    healthy exams that were selected in an earlier run but not in this one
    stay on disk and are not listed in the csv.

    Args:
        kneemri_data_paths (list): List of the directories in KneeMRI dataset
        metadata_df (Pandas dataframe): Metadata dataframe for the cases
        aug_flip_prob (float, optional): Augmentation flip probability
        overwrite (bool, optional): Option to overwrite already preprocessed MRI

    Raises:
        IndexError: If an exam file has no row in `metadata_df`, or if
            `kneemri_data_paths` is empty.
    """
    # One (flip, rotate) recipe per augmented copy, keyed by aclDiagnosis.
    # The position of a recipe in its list is the '-aug-<i>' index.
    aug_recipes = {
        0: [(True, True)],
        1: [(True, False), (False, True)],
        2: [(True, False), (True, True), (True, True), (False, True), (False, True)],
    }

    aug_labels_list = []
    for mri_data_path in kneemri_data_paths:
        # os.path.join picks the right separator, so no platform branch is needed
        for exam in sorted(glob(os.path.join(mri_data_path, "*.pck"))):
            src_parts = os.path.normpath(exam).split(os.sep)
            exam_vol_name = src_parts[-1]

            exam_rows = metadata_df[metadata_df['volumeFilename'] == exam_vol_name]
            if exam_rows.empty:
                raise IndexError(f"No metadata row found for exam '{exam_vol_name}'")
            acl_diagnosis = exam_rows['aclDiagnosis'].tolist()[0]
            kneelr_val = exam_rows['kneeLR'].tolist()[0]

            # Healthy exams (label 0) are the majority, so they are augmented
            # only when the draw reaches aug_flip_prob (5% of the time with the
            # default of 0.95). Labels 1 and 2 are always augmented. The draw
            # is made for every exam so the random stream does not depend on
            # the labels.
            selected = np.random.rand() >= aug_flip_prob
            if not (selected or acl_diagnosis in (1, 2)):
                continue

            # Split the file name at its first '.'
            stem, dot, rest = exam_vol_name.partition('.')

            # Swap the first path component for the output root and add an
            # 'aug' directory right above the file
            out_dir = os.path.join('Preprocessed_Data', *src_parts[1:-1], 'aug')

            raw_vol = None  # Loaded lazily, at most once per exam
            for aug_ind, (do_flip, do_rotate) in enumerate(aug_recipes.get(acl_diagnosis, ())):
                aug_labels = exam_rows.copy()
                # Make the new volumeFilename value by adding '-aug-<i>' in the original
                aug_labels.loc[:, 'volumeFilename'] = f"{stem}-aug-{aug_ind}{dot}{rest}"
                if do_flip:
                    # Flip kneeLR value as we do a horizontal flip
                    # NOTE(review): the label is flipped unconditionally; if
                    # utils.random_horizontal_flip only flips with some
                    # probability the label can disagree with the image - confirm.
                    aug_labels.loc[:, 'kneeLR'] = 1 - kneelr_val

                preprocessed_exam_path = os.path.join(out_dir, f"{stem}-aug-{aug_ind}.npy")
                if overwrite or not os.path.exists(preprocessed_exam_path):
                    if raw_vol is None:
                        # NOTE(review): pickle.load can execute arbitrary code;
                        # only run this on dataset files from a trusted source.
                        with open(exam, 'rb') as file_handler:  # Must use 'rb' as the data is binary
                            raw_vol = pickle.load(file_handler)

                    # astype returns a new float64 array, so every copy starts
                    # from an untouched version of the original volume
                    aug_mri_vol = raw_vol.astype(np.float64)
                    if do_flip:
                        aug_mri_vol = utils.random_horizontal_flip(aug_mri_vol)
                    if do_rotate:
                        aug_mri_vol = utils.random_rotation(aug_mri_vol)
                    preprocessed_aug_mri_vol = utils.preprocess_mri(aug_mri_vol)
                    os.makedirs(out_dir, exist_ok=True)
                    np.save(preprocessed_exam_path, preprocessed_aug_mri_vol)

                # Add labels to the augmented samples list (also for files
                # that already existed, so the csv always matches this run)
                aug_labels_list.append(aug_labels.values.tolist()[0])

    aug_labels_df = pd.DataFrame(aug_labels_list, columns=metadata_df.columns)
    # The csv goes next to the volume directories, i.e. into the parent
    # directory of the first data path
    csv_dir = os.path.dirname(os.path.normpath(kneemri_data_paths[0]))
    aug_labels_df.to_csv(os.path.join(csv_dir, "metadata-aug.csv"))
    print(f"For KneeMRI datset we have {len(aug_labels_list)} augmented samples.")

In [14]:
# Preprocess every exam in the collected volume directories; with the default
# overwrite=False, exams whose output file already exists are skipped
preprocess_mri_vols(mri_vol_paths)

In [15]:
# Create the augmented samples and write their labels to 'metadata-aug.csv'.
# NOTE(review): healthy exams are picked with np.random.rand() and no seed is
# set in the visible cells, so the selection (and the sample count printed
# below) presumably changes between runs - confirm whether a seed is wanted.
augment_mri_vols(mri_vol_paths, metadata_df)

For KneeMRI datset we have 650 augmented samples.
