### Data Gathering

In [1]:
import pandas as pd
import os

In [2]:
raw_data_path = os.path.join(os.getcwd(), "./raw_data")
processed_data_path = os.path.join(os.getcwd(), "./processed_data")

def get_raw_data_path(filename):
  return os.path.join(raw_data_path, filename)

def get_processed_data_path(filename):
  return os.path.join(processed_data_path, filename)

In [3]:
split_df = pd.read_csv(get_raw_data_path("mimic-cxr-2.0.0-split.csv"))
print(f"shape:", split_df.shape)
split_df

shape: (377110, 4)


Unnamed: 0,dicom_id,study_id,subject_id,split
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032,train
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032,train
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032,train
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032,train
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032,train
...,...,...,...,...
377105,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,57132437,19999733,train
377106,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,57132437,19999733,train
377107,58766883-376a15ce-3b323a28-6af950a0-16b793bd,55368167,19999987,train
377108,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,58621812,19999987,train


In [4]:
image_df = pd.read_csv(get_raw_data_path("cxr-record-list.csv"))

# rename the columns to prevent collision when merging the df
image_df = image_df.rename(columns={"path": "image_path"})


# edit the path to the jpg file by change the .dcm extension to .jpg
def replace_extension(df, column, init_ext, target_ext):
    df[column] = df[column].str.replace(init_ext, target_ext, regex=False)
    return df

image_df = replace_extension(image_df, "image_path", ".dcm", ".jpg")
print(f"shape:", image_df.shape)
image_df

shape: (377110, 4)


Unnamed: 0,subject_id,study_id,dicom_id,image_path
0,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,10000032,50414267,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,10000032,53189527,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,files/p10/p10000032/s53911762/68b5c4b1-227d048...
...,...,...,...,...
377105,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,files/p19/p19999733/s57132437/428e2c18-5721d8f...
377106,19999733,57132437,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,files/p19/p19999733/s57132437/58c403aa-35ff8bd...
377107,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,files/p19/p19999987/s55368167/58766883-376a15c...
377108,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,files/p19/p19999987/s58621812/7ba273af-3d290f8...


In [5]:
report_df = pd.read_csv(get_raw_data_path("cxr-study-list.csv"))
report_df = report_df.rename(columns={"path": "report_path"})
print(f"shape:", report_df.shape)
report_df

shape: (227835, 3)


Unnamed: 0,subject_id,study_id,report_path
0,10000032,50414267,files/p10/p10000032/s50414267.txt
1,10000032,53189527,files/p10/p10000032/s53189527.txt
2,10000032,53911762,files/p10/p10000032/s53911762.txt
3,10000032,56699142,files/p10/p10000032/s56699142.txt
4,10000764,57375967,files/p10/p10000764/s57375967.txt
...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt
227831,19999733,57132437,files/p19/p19999733/s57132437.txt
227832,19999987,55368167,files/p19/p19999987/s55368167.txt
227833,19999987,58621812,files/p19/p19999987/s58621812.txt


In [6]:
result_df = split_df.merge(
    image_df,
    on=["subject_id", "study_id", "dicom_id"],
    how="inner",
    suffixes=("", "_remove"),
)

# remove the duplicate columns
result_df.drop([i for i in result_df.columns if "remove" in i], axis=1, inplace=True)

result_df = report_df.merge(
    result_df,
    on=["subject_id", "study_id"],
    how="inner",
    suffixes=("", "_remove"),
)

# remove the duplicate columns
result_df.drop([i for i in result_df.columns if "remove" in i], axis=1, inplace=True)

result_df

Unnamed: 0,subject_id,study_id,report_path,dicom_id,split,image_path
0,10000032,50414267,files/p10/p10000032/s50414267.txt,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,10000032,50414267,files/p10/p10000032/s50414267.txt,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,10000032,53189527,files/p10/p10000032/s53189527.txt,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,10000032,53189527,files/p10/p10000032/s53189527.txt,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,train,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,10000032,53911762,files/p10/p10000032/s53911762.txt,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,train,files/p10/p10000032/s53911762/68b5c4b1-227d048...
...,...,...,...,...,...,...
377105,19999733,57132437,files/p19/p19999733/s57132437.txt,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,train,files/p19/p19999733/s57132437/428e2c18-5721d8f...
377106,19999733,57132437,files/p19/p19999733/s57132437.txt,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,train,files/p19/p19999733/s57132437/58c403aa-35ff8bd...
377107,19999987,55368167,files/p19/p19999987/s55368167.txt,58766883-376a15ce-3b323a28-6af950a0-16b793bd,train,files/p19/p19999987/s55368167/58766883-376a15c...
377108,19999987,58621812,files/p19/p19999987/s58621812.txt,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,train,files/p19/p19999987/s58621812/7ba273af-3d290f8...


In [7]:
# ### save the df to a csv file
# result_df.to_csv(get_processed_data_path('paths.csv'), index=False)  

### EDA

In [8]:
result_df["split"].unique()

array(['train', 'validate', 'test'], dtype=object)

In [9]:
train_count = (result_df['split'] == 'train').sum()
print("Occurrences of 'train':", train_count)

valid_count = (result_df['split'] == 'validate').sum()
print("Occurrences of 'validate':", valid_count)

test_count = (result_df['split'] == 'test').sum()
print("Occurrences of 'test':", test_count)

Occurrences of 'train': 368960
Occurrences of 'validate': 2991
Occurrences of 'test': 5159


### Explore the number of reports with only 1 image

In [11]:
paths_df = pd.read_csv(get_processed_data_path("paths.csv"))
# paths_df

In [12]:
metadata_df = pd.read_csv(get_raw_data_path("mimic-cxr-2.0.0-metadata.csv"))
# metadata_df

In [13]:
# combined paths and metadata
combined_df = paths_df.merge(
    metadata_df,
    on="study_id",
    how="inner",
    suffixes=("", "_remove"),
)

combined_df.drop([i for i in combined_df.columns if "remove" in i], axis=1, inplace=True)

combined_df

Unnamed: 0,subject_id,study_id,report_path,dicom_id,split,image_path,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,10000032,50414267,files/p10/p10000032/s50414267.txt,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
1,10000032,50414267,files/p10/p10000032/s50414267.txt,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,train,files/p10/p10000032/s50414267/02aa804e-bde0afd...,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
2,10000032,50414267,files/p10/p10000032/s50414267.txt,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,CHEST (PA AND LAT),PA,3056,2544,21800506,213014.531,CHEST (PA AND LAT),postero-anterior,Erect
3,10000032,50414267,files/p10/p10000032/s50414267.txt,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,train,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,CHEST (PA AND LAT),LATERAL,3056,2544,21800506,213014.531,CHEST (PA AND LAT),lateral,Erect
4,10000032,53189527,files/p10/p10000032/s53189527.txt,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,train,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,CHEST (PA AND LAT),PA,3056,2544,21800626,165500.312,CHEST (PA AND LAT),postero-anterior,Erect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729343,19999733,57132437,files/p19/p19999733/s57132437.txt,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,train,files/p19/p19999733/s57132437/58c403aa-35ff8bd...,CHEST (PA AND LAT),PA,3056,2544,21520708,224550.171,CHEST (PA AND LAT),postero-anterior,Erect
729344,19999733,57132437,files/p19/p19999733/s57132437.txt,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,train,files/p19/p19999733/s57132437/58c403aa-35ff8bd...,CHEST (PA AND LAT),LATERAL,3056,2544,21520708,224550.171,CHEST (PA AND LAT),lateral,Erect
729345,19999987,55368167,files/p19/p19999987/s55368167.txt,58766883-376a15ce-3b323a28-6af950a0-16b793bd,train,files/p19/p19999987/s55368167/58766883-376a15c...,CHEST (PORTABLE AP),AP,2544,3056,21451104,51448.218,CHEST (PORTABLE AP),antero-posterior,Erect
729346,19999987,58621812,files/p19/p19999987/s58621812.txt,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,train,files/p19/p19999987/s58621812/7ba273af-3d290f8...,CHEST (PORTABLE AP),AP,3056,2544,21451102,202809.234,CHEST (PORTABLE AP),antero-posterior,Erect


In [14]:
# group the rows by the study_id, and count the number of images per study
grouped_df = paths_df.groupby('study_id')
group_sizes = grouped_df.size()

# Convert to DataFrame with reset_index
group_sizes_df = group_sizes.reset_index(name='size')

print("number of images per study:", group_sizes_df["size"].unique())
group_sizes_df

number of images per study: [ 1  2  3  4  9  5  6  8  7 11]


Unnamed: 0,study_id,size
0,50000014,1
1,50000028,2
2,50000052,1
3,50000103,2
4,50000125,1
...,...,...
227830,59999832,2
227831,59999849,1
227832,59999880,2
227833,59999888,2


In [15]:
# get the studies with only 1 image
single_image_df = group_sizes_df[group_sizes_df["size"] == 1]
single_image_df

Unnamed: 0,study_id,size
0,50000014,1
2,50000052,1
4,50000125,1
5,50000173,1
7,50000198,1
...,...,...
227823,59999594,1
227825,59999682,1
227828,59999807,1
227829,59999824,1


In [16]:
# find the distribution of the type of images
single_image_df.merge(metadata_df, on="study_id", how="inner").groupby("ViewPosition").size()

ViewPosition
AP          100384
AP AXIAL         1
LATERAL         11
LL              15
LPO              1
PA              91
dtype: int64

### Only obtain the AP single image data points

In [40]:
AP_df = combined_df[combined_df["ViewPosition"] == 'AP']
single_image_df = single_image_df.merge(AP_df, on="study_id", how="inner")
single_image_df

Unnamed: 0,study_id,size,subject_id,report_path,dicom_id,split,image_path,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,StudyTime,ProcedureCodeSequence_CodeMeaning,ViewCodeSequence_CodeMeaning,PatientOrientationCodeSequence_CodeMeaning
0,50000014,1,11941242,files/p11/p11941242/s50000014.txt,dffc8ab2-ff37704f-2fb29e6d-51e08075-88bca914,train,files/p11/p11941242/s50000014/dffc8ab2-ff37704...,CHEST (PORTABLE AP),AP,2544,3056,21720525,132301.281,CHEST (PORTABLE AP),antero-posterior,
1,50000052,1,15752761,files/p15/p15752761/s50000052.txt,a09e5a6c-9635efa7-de26c04a-28104ce9-e5f798d6,train,files/p15/p15752761/s50000052/a09e5a6c-9635efa...,CHEST (PORTABLE AP),AP,2544,3056,21660519,33434.562,CHEST (PORTABLE AP),antero-posterior,Erect
2,50000125,1,19309850,files/p19/p19309850/s50000125.txt,dfa001f0-9c3d0a8c-b61096ea-e3b90c21-6228ddff,train,files/p19/p19309850/s50000125/dfa001f0-9c3d0a8...,CHEST (PORTABLE AP),AP,2544,3056,21520129,74253.625,CHEST (PORTABLE AP),antero-posterior,Erect
3,50000173,1,19999287,files/p19/p19999287/s50000173.txt,c8bbb9ff-ecb81ef7-a1a6cecf-f535bd20-bd512ba0,train,files/p19/p19999287/s50000173/c8bbb9ff-ecb81ef...,CHEST (PORTABLE AP),AP,3056,2544,21970808,31549.890,CHEST (PORTABLE AP),antero-posterior,Erect
4,50000198,1,16548129,files/p16/p16548129/s50000198.txt,b66847d6-6848ea1f-58aa0c60-c38316ff-7e2171db,train,files/p16/p16548129/s50000198/b66847d6-6848ea1...,CHEST (PORTABLE AP),AP,3056,2544,21490101,100104.296,CHEST (PORTABLE AP),antero-posterior,Erect
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100379,59999594,1,11278050,files/p11/p11278050/s59999594.txt,2d21fa2b-c28a3633-6a5f9561-37be4527-7a2b8475,train,files/p11/p11278050/s59999594/2d21fa2b-c28a363...,CHEST (PORTABLE AP),AP,2539,3050,21600322,153726.671,CHEST (PORTABLE AP),antero-posterior,
100380,59999682,1,15376482,files/p15/p15376482/s59999682.txt,6e2ad8f2-02cc64d6-b52add4b-526b833c-e2e185b7,train,files/p15/p15376482/s59999682/6e2ad8f2-02cc64d...,CHEST (PORTABLE AP),AP,2736,2544,21851119,80451.437,CHEST (PORTABLE AP),antero-posterior,
100381,59999807,1,19497741,files/p19/p19497741/s59999807.txt,e56dbf08-8b564eb4-831fa870-d590f7c7-423cb05f,train,files/p19/p19497741/s59999807/e56dbf08-8b564eb...,CHEST (PORTABLE AP),AP,2270,2532,21790809,132922.578,CHEST (PORTABLE AP),antero-posterior,Erect
100382,59999824,1,19148695,files/p19/p19148695/s59999824.txt,66bd155f-6b30082b-9a7aa677-b6ff2ac7-d29e2f49,train,files/p19/p19148695/s59999824/66bd155f-6b30082...,CHEST (PORTABLE AP),AP,3056,2544,21470418,140446.046,CHEST (PORTABLE AP),antero-posterior,Erect


### Find the number of single AP image datapoints in each split

In [41]:
print(f'train split shape:\t{single_image_df[single_image_df["split"] == "train"].shape}')
print(f'validate split shape:\t{single_image_df[single_image_df["split"] == "validate"].shape}')
print(f'test split shape:\t{single_image_df[single_image_df["split"] == "test"].shape}')

train split shape:	(97937, 16)
validate split shape:	(820, 16)
test split shape:	(1627, 16)


In [42]:
### save the single image datapoints and their paths
single_image_paths_df = single_image_df[
    ["study_id", "size", "subject_id", "report_path", "dicom_id", "split", "image_path"]
]
# single_image_paths_df.to_csv(get_processed_data_path("single_image_paths.csv"), index=False)

### Check for class balance: distribution of labels amongst the single image datapoints

1. Get the labels

In [61]:
chexpert_df = pd.read_csv(get_raw_data_path("mimic-cxr-2.0.0-chexpert.csv"))
chexpert_df

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


In [62]:
# get the labels by joining with chexpert_df
single_image_labels_df = single_image_paths_df.merge(chexpert_df, on="study_id", how="inner", suffixes=("", "_remove"))
single_image_labels_df.drop([i for i in single_image_labels_df.columns if "remove" in i], axis=1, inplace=True)
single_image_labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100382 entries, 0 to 100381
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   study_id                    100382 non-null  int64  
 1   size                        100382 non-null  int64  
 2   subject_id                  100382 non-null  int64  
 3   report_path                 100382 non-null  object 
 4   dicom_id                    100382 non-null  object 
 5   split                       100382 non-null  object 
 6   image_path                  100382 non-null  object 
 7   Atelectasis                 35486 non-null   float64
 8   Cardiomegaly                39503 non-null   float64
 9   Consolidation               12407 non-null   float64
 10  Edema                       38893 non-null   float64
 11  Enlarged Cardiomediastinum  13322 non-null   float64
 12  Fracture                    2067 non-null    float64
 13  Lung Lesion   

2. Find the percentage distribution of each label in each of the splits (train, validate, test)

In [63]:
num_train_rows = single_image_labels_df[single_image_labels_df["split"] == "train"].shape[0]
single_image_train_class_distribution_df = pd.DataFrame(single_image_labels_df[single_image_labels_df["split"] == "train"].count()).T
single_image_train_class_distribution_df = single_image_train_class_distribution_df.apply(lambda x: x/num_train_rows)
single_image_train_class_distribution_df

Unnamed: 0,study_id,size,subject_id,report_path,dicom_id,split,image_path,Atelectasis,Cardiomegaly,Consolidation,...,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.353796,0.393006,0.123623,...,0.132312,0.020779,0.029223,0.327289,0.182815,0.501292,0.009721,0.250054,0.358789,0.523613


In [64]:
num_validate_rows = single_image_labels_df[single_image_labels_df["split"] == "validate"].shape[0]
single_image_validate_class_distribution_df = pd.DataFrame(single_image_labels_df[single_image_labels_df["split"] == "validate"].count()).T
single_image_validate_class_distribution_df = single_image_validate_class_distribution_df.apply(lambda x: x/num_validate_rows)
single_image_validate_class_distribution_df

Unnamed: 0,study_id,size,subject_id,report_path,dicom_id,split,image_path,Atelectasis,Cardiomegaly,Consolidation,...,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.331707,0.404878,0.120732,...,0.132927,0.004878,0.054878,0.326829,0.158537,0.49878,0.012195,0.25,0.356098,0.540244


In [65]:
num_test_rows = single_image_labels_df[single_image_labels_df["split"] == "test"].shape[0]
single_image_test_class_distribution_df = pd.DataFrame(single_image_labels_df[single_image_labels_df["split"] == "test"].count()).T
single_image_test_class_distribution_df = single_image_test_class_distribution_df.apply(lambda x: x/num_test_rows)
single_image_test_class_distribution_df

Unnamed: 0,study_id,size,subject_id,report_path,dicom_id,split,image_path,Atelectasis,Cardiomegaly,Consolidation,...,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.347265,0.419176,0.12354,...,0.15673,0.01721,0.031961,0.392747,0.137677,0.534726,0.012907,0.245851,0.336202,0.50338


In [73]:
### concat the distribution dfs
single_image_combined_class_distribution_df = pd.concat(
    [
        single_image_train_class_distribution_df,
        single_image_validate_class_distribution_df,
        single_image_test_class_distribution_df,
    ],
    axis=0,
).drop(labels=["study_id", "size", "subject_id", "report_path", "dicom_id", "split", "image_path"], axis=1)

single_image_combined_class_distribution_df

Unnamed: 0,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,0.353796,0.393006,0.123623,0.386083,0.132312,0.020779,0.029223,0.327289,0.182815,0.501292,0.009721,0.250054,0.358789,0.523613
0,0.331707,0.404878,0.120732,0.419512,0.132927,0.004878,0.054878,0.326829,0.158537,0.49878,0.012195,0.25,0.356098,0.540244
0,0.347265,0.419176,0.12354,0.453596,0.15673,0.01721,0.031961,0.392747,0.137677,0.534726,0.012907,0.245851,0.336202,0.50338


In [74]:
### Find the % difference between the max and the min of each label
(single_image_combined_class_distribution_df.max() - single_image_combined_class_distribution_df.min())/ single_image_combined_class_distribution_df.min() * 100

Atelectasis                     6.659054
Cardiomegaly                    6.659151
Consolidation                   2.394652
Edema                          17.486664
Enlarged Cardiomediastinum     18.454777
Fracture                      325.971307
Lung Lesion                    87.787621
Lung Opacity                   20.168977
No Finding                     32.785813
Pleural Effusion                7.206778
Pleural Other                  32.780017
Pneumonia                       1.709305
Pneumothorax                    6.718408
Support Devices                 7.323178
dtype: float64

3. See if merging the train and valid split will reduce the class imbalance

In [75]:
num_train_val_rows = single_image_labels_df[single_image_labels_df["split"] != "test"].shape[0]
single_image_train_val_class_distribution_df = pd.DataFrame(single_image_labels_df[single_image_labels_df["split"] != "test"].count()).T
single_image_train_val_class_distribution_df = single_image_train_val_class_distribution_df.apply(lambda x: x/num_train_val_rows)


single_image_train_val_combined_class_distribution_df = pd.concat(
  [
    single_image_train_val_class_distribution_df,
    single_image_test_class_distribution_df,
  ],
  axis=0,
).drop(labels=["study_id", "size", "subject_id", "split"], axis=1)


single_image_train_val_combined_class_distribution_df

# single_image_labels_df[single_image_labels_df["split"] != "test"]["split"].unique()

Unnamed: 0,report_path,dicom_id,image_path,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,1.0,1.0,1.0,0.353612,0.393104,0.123599,0.38636,0.132317,0.020647,0.029436,0.327285,0.182614,0.501271,0.009741,0.250053,0.358767,0.523751
0,1.0,1.0,1.0,0.347265,0.419176,0.12354,0.453596,0.15673,0.01721,0.031961,0.392747,0.137677,0.534726,0.012907,0.245851,0.336202,0.50338


In [76]:
### Find the % difference between the max and the min of each label
(single_image_train_val_combined_class_distribution_df.max() - single_image_train_val_combined_class_distribution_df.min())/ single_image_train_val_combined_class_distribution_df.min() * 100

report_path                    0.000000
dicom_id                       0.000000
image_path                     0.000000
Atelectasis                    1.827876
Cardiomegaly                   6.632403
Consolidation                  0.047391
Edema                         17.402257
Enlarged Cardiomediastinum    18.450209
Fracture                      19.974142
Lung Lesion                    8.575004
Lung Opacity                  20.001758
No Finding                    32.639387
Pleural Effusion               6.674170
Pleural Other                 32.499965
Pneumonia                      1.709124
Pneumothorax                   6.711761
Support Devices                4.046689
dtype: float64

### Conclusion: 
- We will use single image datapoints first.
- The difference between the class distribution for each of the original split is quite large.
- To overcome this problem, there are a few ways to do so:
  - Try to merge train and valid split and use it as the `new_train` split, and the test split as the `new_valid` split, and the IU dataset as the `new_test` split.
  - Create my own train, valid, test split from all the data that I have. We can do so using sth like one-hot encoding, and then try to split the data points evenly, using sth like this (http://scikit.ml/stratification.html)

### Current Approach:
- Use the lazy method and just train the model, and see how it goes. Use the ground truth for the test split to check for accuracy, recall, precision, ROC, F1 score, etc.

### Additional Problems observed from exploring the reports

When it comes to report generation tasks, there are actually a few variation:
- Using a single image amd describing what you see from them
- Using multiple images and describing what you see from them: sometimes, one image is not enough
- Compare with images taken previously

If you each image as a single image input, then some of the reports might refer to the other views, which would not be good.
If you only use single image inputs, some of the reports might refer to previous images and reports

In [77]:
### maybe only pick patients with 1 study
grouped_df_2 = paths_df.groupby("subject_id")
grouped_df_2 = grouped_df_2.size().reset_index(name="count")
grouped_df_2[grouped_df_2["count"] == 1]

Unnamed: 0,subject_id,count
69,10010635,1
101,10014577,1
104,10014756,1
110,10015272,1
121,10016810,1
...,...,...
65333,19993842,1
65350,19995593,1
65362,19998198,1
65366,19998485,1
