In [1]:
import pandas as pd

# Creating detailed dataframe
Extract additional metadata from the CSV files that list the filepath of each image; this makes dataset handling easier. The detailed dataframe will have these columns:<br>
* sp_id (identifier created from other columns, may be used later for dataset filtering)
* filepath (filepath to each image from original csv files)
* label (numeric label extracted from the study folder name: 1 = positive, 0 = negative)
* body_part (what body part does the X-ray scan belong to, without the XR_ prefix)
* patient (numeric string representing one patient)
* study (number representing study for each patient)
* split (which part of the dataset the image belongs to, train or valid, based on the original data split)


In [2]:
# Index files shipped with MURA v1.1 (one image path per row).
# Adjust these if the dataset lives somewhere else.
TRAIN_DATASET_PATH, VALID_DATASET_PATH = (
    "../MURA-v1.1/train_image_paths.csv",
    "../MURA-v1.1/valid_image_paths.csv",
)

# Where the combined, enriched index produced by this notebook is written.
DETAILED_DATASET_PATH = "../MURA-v1.1/detailed_paths.csv"

In [3]:
# Read both index files; they have no header row, so the single column is
# given the name 'filepath' explicitly.
train_df, valid_df = (
    pd.read_csv(csv_path, header=None, names=['filepath'])
    for csv_path in (TRAIN_DATASET_PATH, VALID_DATASET_PATH)
)

In [4]:
# Quick sanity check of the two loaded index files.
print('Train image count:', len(train_df))
print('Valid image count:', len(valid_df))
# Address the cell by row label and column name. The previous `.loc[0][0]`
# chained an integer key onto a Series indexed by column name, which relies
# on the deprecated positional fallback of Series.__getitem__.
print('Example filepath:', train_df.loc[0, 'filepath'])

Train image count: 36808
Valid image count: 3197
Example filepath: MURA-v1.1/train/XR_SHOULDER/patient00001/study1_positive/image1.png


In [5]:
# Parse data into new columns
# Named groups pull the body part, patient number, study number and label out
# of paths shaped like ".../XR_<BODY_PART>/patient<NNNNN>/study<N>_<label>/...".
rstring = '.*/XR_(?P<body_part>.*)/[a-z]+(?P<patient>[0-9]+)/[a-z]+(?P<study>[0-9]+)_(?P<label>[a-z]+)/'

# Numeric encoding of the label found in the study folder name
LABEL_CODES = {'positive': 1, 'negative': 0}


def _parse_filepaths(paths_df, split):
    """Derive the detail columns for one split from its ``filepath`` column.

    Parameters
    ----------
    paths_df : pandas.DataFrame
        Frame with a ``filepath`` column holding the image paths.
    split : str
        Value stored in the ``split`` column (here ``'train'`` or ``'valid'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``body_part``, ``patient``, ``study``, ``label``, ``split`` and
        ``sp_id`` (in that order), indexed like ``paths_df``. ``label`` is an
        ``int64`` column holding 1 for positive and 0 for negative.

    Raises
    ------
    ValueError
        If any path does not match ``rstring`` or carries a label that is
        neither ``positive`` nor ``negative``.
    """
    cols = paths_df['filepath'].str.extract(rstring, expand=True)

    # str.extract yields NaN for rows that do not match; fail loudly instead
    # of letting half-empty rows flow into the saved CSV.
    unparsed = cols.isna().any(axis=1)
    if unparsed.any():
        example = paths_df.loc[unparsed, 'filepath'].iloc[0]
        raise ValueError(
            f"{int(unparsed.sum())} filepath(s) in the {split!r} split do not "
            f"match the expected layout, e.g. {example!r}"
        )

    # Add "dataset type"
    cols['split'] = split

    # Create study identifiers from other columns:
    # <first two letters of body part><patient>_<study><first letter of label>
    # This must run before the label is converted to a number.
    cols['sp_id'] = (
        cols['body_part'].str.slice(0, 2)
        + cols['patient']
        + "_"
        + cols['study']
        + cols['label'].str.get(0)
    )

    # Change labels to numeric values with one explicit mapping, which gives a
    # real integer column instead of integers stored among strings.
    codes = cols['label'].map(LABEL_CODES)
    if codes.isna().any():
        unknown = sorted(set(cols.loc[codes.isna(), 'label']))
        raise ValueError(f"Unexpected label(s) in the {split!r} split: {unknown}")
    cols['label'] = codes.astype('int64')

    return cols


train_cols = _parse_filepaths(train_df, 'train')
valid_cols = _parse_filepaths(valid_df, 'valid')

# Concat into final dataframes (original filepath next to the parsed columns)
train_expanded = pd.concat([train_df, train_cols], axis=1)
valid_expanded = pd.concat([valid_df, valid_cols], axis=1)

# Create detailed dataframe. ignore_index=True renumbers the rows so every row
# has a unique index label; otherwise the labels of the valid rows would
# repeat the first labels of the train rows.
detailed_df = pd.concat([train_expanded, valid_expanded], ignore_index=True)

# Reorder columns
detailed_df = detailed_df.reindex(columns=['sp_id', 'filepath', 'label', 'body_part', 'patient', 'study', 'split'])

In [6]:
# Persist the combined index as CSV; the row index is written as the unnamed
# first column.
detailed_df.to_csv(path_or_buf=DETAILED_DATASET_PATH, index=True)

In [7]:
# Show the first ten rows to eyeball the parsed columns
display(detailed_df.head(n=10))

Unnamed: 0,sp_id,filepath,label,body_part,patient,study,split
0,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,1,SHOULDER,1,1,train
1,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,1,SHOULDER,1,1,train
2,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,1,SHOULDER,1,1,train
3,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,1,SHOULDER,2,1,train
4,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,1,SHOULDER,2,1,train
5,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,1,SHOULDER,2,1,train
6,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,1,SHOULDER,3,1,train
7,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,1,SHOULDER,3,1,train
8,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,1,SHOULDER,3,1,train
9,SH00004_1p,MURA-v1.1/train/XR_SHOULDER/patient00004/study...,1,SHOULDER,4,1,train
