In [1]:
import pandas as pd
import numpy as np

# Creating detailed dataframe
Extract additional columns from the CSV files that list the file path of each image; these make the dataset easier to handle. The detailed dataframe has these columns:<br>
* sp_id (identifier created from other columns, may be used later for dataset filtering)
* filepath (filepath to each image from original csv files)
* label (positive/negative label, extracted from study folder name)
* body_part (what body part does the X-ray scan belong to, without the XR_ prefix)
* patient (numeric string representing one patient)
* study (number representing study for each patient)
* split (which part of dataset does image belong to based on original data split)

<b>Note: this notebook also creates a second dataset CSV file that includes a custom test split.</b>

In [57]:
# Adjust these locations to match where MURA-v1.1 lives on your machine.

# Inputs: the original CSV files listing one image path per row
TRAIN_DATASET_PATH = "../MURA-v1.1/train_image_paths.csv"
VALID_DATASET_PATH = "../MURA-v1.1/valid_image_paths.csv"

# Outputs: detailed CSV files written by this notebook
# (train/valid only, and train/valid/test with the custom test split)
TRAIN_VALID_DETAILED_DATASET_PATH = "../MURA-v1.1/tv_detailed_paths.csv"
TRAIN_VALID_TEST_DETAILED_DATASET_PATH = "../MURA-v1.1/tvt_detailed_paths.csv"

In [3]:
# Read both path listings; the files have no header row, so the single
# column is named 'filepath' explicitly.
train_df, valid_df = (
    pd.read_csv(csv_path, names=['filepath'])
    for csv_path in (TRAIN_DATASET_PATH, VALID_DATASET_PATH)
)

In [4]:
# Quick sanity check of what was loaded
print('Train image count:', len(train_df))
print('Valid image count:', len(valid_df))
# Look the value up by row label and column name. The previous
# `train_df.loc[0][0]` relied on an integer key falling back to positional
# access on a string-labelled Series, which pandas has deprecated.
print('Example filepath:', train_df.loc[0, 'filepath'])

Train image count: 36808
Valid image count: 3197
Example filepath: MURA-v1.1/train/XR_SHOULDER/patient00001/study1_positive/image1.png


In [5]:
# Parse data into new columns
# Named groups capture: the folder name after "XR_" (body_part), the digits
# of the patient folder, the digits of the study folder, and the label that
# follows the study number.
rstring = '.*/XR_(?P<body_part>.*)/[a-z]+(?P<patient>[0-9]+)/[a-z]+(?P<study>[0-9]+)_(?P<label>[a-z]+)/'


def _extract_path_columns(paths, split):
    """Build the derived columns for one split from its image file paths.

    Parameters
    ----------
    paths : pd.Series
        File path strings to parse with ``rstring``.
    split : str
        Value stored in the ``split`` column for every row.

    Returns
    -------
    pd.DataFrame
        Columns ``body_part``, ``patient``, ``study``, ``label``, ``split``
        and ``sp_id``, indexed like ``paths``.

    Raises
    ------
    ValueError
        If any path does not match ``rstring``. Such rows would otherwise
        carry NaN identifiers and be dropped silently by later groupbys.
    """
    cols = paths.str.extract(rstring, expand=True)

    unmatched = cols.isna().any(axis=1)
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} filepath(s) in the '{split}' split do not match "
            f"the expected layout, e.g. {paths[unmatched].iloc[0]!r}"
        )

    # Add "dataset type"
    cols['split'] = split

    # Study identifier: first two letters of the body part + patient digits
    # + "_" + study number + first letter of the label (e.g. SH00001_1p)
    cols['sp_id'] = (
        cols['body_part'].str.slice(0, 2)
        + cols['patient']
        + "_"
        + cols['study']
        + cols['label'].str.get(0)
    )
    return cols


train_cols = _extract_path_columns(train_df['filepath'], 'train')
valid_cols = _extract_path_columns(valid_df['filepath'], 'valid')

# Concat into final dataframes (original filepath next to the parsed columns)
train_expanded = pd.concat([train_df, train_cols], axis=1)
valid_expanded = pd.concat([valid_df, valid_cols], axis=1)

# Create detailed dataframe; each part keeps its own row index here
detailed_df = pd.concat([train_expanded, valid_expanded])

# Reorder columns
detailed_df = detailed_df.reindex(columns=['sp_id', 'filepath', 'label', 'body_part', 'patient', 'study', 'split'])

### Save created dataframe as csv

In [59]:
# Persist the detailed dataframe. A later cell relabels some rows of
# detailed_df in place as 'test', so run this one first to save only the
# original train/valid splits.
detailed_df.to_csv(path_or_buf=TRAIN_VALID_DETAILED_DATASET_PATH)

In [55]:
# Preview the first ten rows of the detailed dataframe
display(detailed_df.iloc[:10])

Unnamed: 0,sp_id,filepath,label,body_part,patient,study,split
0,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,positive,SHOULDER,1,1,train
1,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,positive,SHOULDER,1,1,train
2,SH00001_1p,MURA-v1.1/train/XR_SHOULDER/patient00001/study...,positive,SHOULDER,1,1,train
3,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,positive,SHOULDER,2,1,train
4,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,positive,SHOULDER,2,1,train
5,SH00002_1p,MURA-v1.1/train/XR_SHOULDER/patient00002/study...,positive,SHOULDER,2,1,train
6,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,positive,SHOULDER,3,1,train
7,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,positive,SHOULDER,3,1,train
8,SH00003_1p,MURA-v1.1/train/XR_SHOULDER/patient00003/study...,positive,SHOULDER,3,1,train
9,SH00004_1p,MURA-v1.1/train/XR_SHOULDER/patient00004/study...,positive,SHOULDER,4,1,train


# Analysis of dataset
Analysing the number of studies per body part so that we can carve a test set out of our training split that resembles the original test set, which is private.

In [8]:
# Count studies per (split, body part, label).
# First collapse images into one entry per study, then count those entries.
study_keys = ['split', 'sp_id', 'body_part', 'label']
group1 = detailed_df[study_keys].groupby(study_keys).size()
study_counts = (
    group1.groupby(['split', 'body_part', 'label'])
    .size()
    .to_frame(name='study_cnt')
    .reset_index()
)

# Keep only the training split; the split column is then redundant
study_counts = study_counts.loc[study_counts['split'] == 'train'].drop(columns=['split'])

# Total number of training studies
all_studies = study_counts['study_cnt'].sum()

# Per-body-part totals, tagged with the pseudo-label 'any'
study_sum = (
    study_counts[['body_part', 'study_cnt']]
    .groupby('body_part')
    .sum()
    .reset_index(level=0)
    .assign(label='any')
)

# Append the totals and order everything by body part
study_counts = (
    pd.concat([study_counts, study_sum], ignore_index=True)
    .sort_values(by=['body_part'])
    .reset_index(drop=True)
)

# Fraction of all training studies, rounded to three decimals
study_counts['share'] = np.round(study_counts['study_cnt'] / all_studies, decimals=3)

study_counts

Unnamed: 0,body_part,label,study_cnt,share
0,ELBOW,negative,1094,0.081
1,ELBOW,positive,660,0.049
2,ELBOW,any,1754,0.13
3,FINGER,negative,1280,0.095
4,FINGER,positive,655,0.049
5,FINGER,any,1935,0.144
6,FOREARM,negative,590,0.044
7,FOREARM,positive,287,0.021
8,FOREARM,any,877,0.065
9,HAND,negative,1497,0.111


In [9]:
# Mean number of images in a study, per body part:
# count images per (body part, study), then average within each body part.
images_per_study = detailed_df[['sp_id', 'body_part', 'filepath']].groupby(['body_part', 'sp_id']).size()
images_per_study.groupby('body_part').mean()

body_part
ELBOW       2.822176
FINGER      2.638389
FOREARM     2.104950
HAND        2.747368
HUMERUS     2.145805
SHOULDER    2.965837
WRIST       2.816067
dtype: float64

# Creating test set

In [10]:
# Target number of studies for the custom test split.
# (Constant name spelling kept as-is; other cells may reference it.)
ORGINAL_TEST_SIZE = 207

# Rows holding the per-body-part totals (label == 'any') and their share
is_total_row = study_counts['label'] == 'any'
shares_df = study_counts.loc[is_total_row].copy()

# Approximate number of test studies per body part. Flooring means the
# per-part sizes can add up to less than ORGINAL_TEST_SIZE.
shares_df['test_size'] = np.floor(shares_df['share'] * ORGINAL_TEST_SIZE).astype(int)

display(shares_df['test_size'].sum())
display(shares_df)

204

Unnamed: 0,body_part,label,study_cnt,share,test_size
2,ELBOW,any,1754,0.13,26
5,FINGER,any,1935,0.144,29
8,FOREARM,any,877,0.065,13
11,HAND,any,2018,0.15,31
14,HUMERUS,any,592,0.044,9
17,SHOULDER,any,2821,0.21,43
20,WRIST,any,3460,0.257,53


In [53]:
# Select random studies from train split, which will be used as a test set
RD_SEED = 27

# Make the cell safe to re-run: rows relabelled by a previous run go back to
# 'train' first. Before this cell, 'split' only holds 'train' and 'valid',
# so any 'test' row was carved out of train by this cell.
detailed_df.loc[detailed_df['split'] == 'test', 'split'] = 'train'

# Create dataframe with train split
# (note: this rebinds train_df, which earlier held the raw path listing)
train_df = detailed_df[detailed_df['split'] == 'train'].copy(deep=True)
# One row per study: sp_id, body part and the number of images in the study
studies_df = train_df[['sp_id', 'body_part']].groupby(['sp_id', 'body_part']).size().to_frame('img_cnt').reset_index()

# Get all body parts
body_parts = train_df.body_part.unique()

# NOTE(review): sampling is done per study (sp_id), not per patient, so other
# studies of a sampled patient may stay in train - confirm this is intended.
test_studies = []
for body_part in body_parts:
    # Get number of studies from previously created dataframe
    sample_size = int(shares_df[shares_df['body_part'] == body_part].test_size.values[0])

    # Select random studies for test split.
    # Fix: the original sampled from `study_list`, which is never defined;
    # the per-study frame built above is `studies_df`.
    candidates = studies_df[studies_df['body_part'] == body_part]
    test_studies.extend(candidates.sample(n=sample_size, random_state=RD_SEED).sp_id.values)

print("Selected", len(test_studies), "studies for test set")

# Change split column for selected study images to "test"
detailed_df.loc[detailed_df['sp_id'].isin(test_studies), 'split'] = 'test'

# Show test split
display(detailed_df[detailed_df['split'] == 'test'])

Selected 204 studies for test set


Unnamed: 0,sp_id,filepath,label,body_part,patient,study,split
132,SH00044_1p,MURA-v1.1/train/XR_SHOULDER/patient00044/study...,positive,SHOULDER,00044,1,test
133,SH00044_1p,MURA-v1.1/train/XR_SHOULDER/patient00044/study...,positive,SHOULDER,00044,1,test
134,SH00044_1p,MURA-v1.1/train/XR_SHOULDER/patient00044/study...,positive,SHOULDER,00044,1,test
185,SH00060_1p,MURA-v1.1/train/XR_SHOULDER/patient00060/study...,positive,SHOULDER,00060,1,test
186,SH00060_1p,MURA-v1.1/train/XR_SHOULDER/patient00060/study...,positive,SHOULDER,00060,1,test
...,...,...,...,...,...,...,...
36213,HA11023_1n,MURA-v1.1/train/XR_HAND/patient11023/study1_ne...,negative,HAND,11023,1,test
36214,HA11023_1n,MURA-v1.1/train/XR_HAND/patient11023/study1_ne...,negative,HAND,11023,1,test
36796,HA11181_1n,MURA-v1.1/train/XR_HAND/patient11181/study1_ne...,negative,HAND,11181,1,test
36797,HA11181_1n,MURA-v1.1/train/XR_HAND/patient11181/study1_ne...,negative,HAND,11181,1,test


### Save created dataframe as csv

In [56]:
# Persist the dataframe whose 'split' column now includes the custom test split
detailed_df.to_csv(path_or_buf=TRAIN_VALID_TEST_DETAILED_DATASET_PATH)

# Calculating weights

In [21]:
# Number of images for every (split, body part, label) combination
count_keys = ['split', 'body_part', 'label']
image_counts = (
    detailed_df[['label', 'body_part', 'split']]
    .groupby(count_keys)
    .size()
    .reset_index(name='count')
)
image_counts

Unnamed: 0,split,body_part,label,count
0,train,ELBOW,negative,2925
1,train,ELBOW,positive,2006
2,train,FINGER,negative,3138
3,train,FINGER,positive,1968
4,train,FOREARM,negative,1164
5,train,FOREARM,positive,661
6,train,HAND,negative,4059
7,train,HAND,positive,1484
8,train,HUMERUS,negative,673
9,train,HUMERUS,positive,599
