In [1]:
import pandas as pd 
import numpy as np

import torch
import torch.nn as nn
import torch.multiprocessing as mp

from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, ToPILImage
import torchvision.transforms as transforms
import clip
from torchvision.transforms.functional import to_pil_image

from tqdm import tqdm
import os

In [2]:
def describe_cuda_devices():
    """Return the lines of a short report on the visible CUDA devices.

    The report always contains the number of visible devices.  When CUDA is
    usable it also contains the index and the name of the current device;
    otherwise a single explanatory line is emitted instead, because
    ``torch.cuda.current_device()`` raises on a machine without CUDA.

    Returns:
        list[str]: the report, one entry per printed line.
    """
    report = ["Available devices:", str(torch.cuda.device_count()), "Current device:"]
    if torch.cuda.is_available():
        current = torch.cuda.current_device()
        report.append(str(current))
        report.append(torch.cuda.get_device_name(current))
    else:
        report.append("CUDA is not available")
    return report


# print all available cuda devices
print("\n".join(describe_cuda_devices()))


Available devices:
4
Current device:
0
NVIDIA GeForce GTX 1080 Ti


# Preparation

In [4]:
# Read the study/JPG record table from disk and show it.
jpg_records_study = pd.read_csv(
    filepath_or_buffer="/data/csv/jpg_path_study_records.csv",
)
jpg_records_study

Unnamed: 0,subject_id_x,study_id,dicom_id,dicom_path,study_path,jpg_path
0,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,files/p10/p10000032/s50414267/02aa804e-bde0afd...,files/p10/p10000032/s50414267.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
1,10000032,50414267,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,files/p10/p10000032/s50414267/174413ec-4ec4c1f...,files/p10/p10000032/s50414267.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
2,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,files/p10/p10000032/s53189527/2a2277a9-b0ded15...,files/p10/p10000032/s53189527.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
3,10000032,53189527,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,files/p10/p10000032/s53189527/e084de3b-be89b11...,files/p10/p10000032/s53189527.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
4,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,files/p10/p10000032/s53911762/68b5c4b1-227d048...,files/p10/p10000032/s53911762.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
...,...,...,...,...,...,...
377105,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,files/p19/p19999733/s57132437/428e2c18-5721d8f...,files/p19/p19999733/s57132437.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377106,19999733,57132437,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,files/p19/p19999733/s57132437/58c403aa-35ff8bd...,files/p19/p19999733/s57132437.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377107,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,files/p19/p19999987/s55368167/58766883-376a15c...,files/p19/p19999987/s55368167.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377108,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,files/p19/p19999987/s58621812/7ba273af-3d290f8...,files/p19/p19999987/s58621812.txt,/system/user/publicdata/MIMIC_CXR/hageneder/JP...


In [5]:
# Keep everything except the two DICOM columns.
paths_df = jpg_records_study.drop(columns=['dicom_id', 'dicom_path'])
# Turn the relative study_path of every row into an absolute path by prefixing
# the MIMIC-CXR 2.0.0 root directory.
paths_df['study_path'] = '/data/MIMIC_CXR/MIMIC_CXR/physionet.org/files/mimic-cxr/2.0.0/' + paths_df['study_path']
# Normalise the key dtype before writing, so the CSV on disk matches the frame
# that is displayed and used by the following cells.
paths_df['study_id'] = paths_df['study_id'].astype(int)
# Save paths_df as csv to /data/csv
paths_df.to_csv('/data/csv/paths_df.csv', index=False)
paths_df

Unnamed: 0,subject_id_x,study_id,study_path,jpg_path
0,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
1,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
2,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
3,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
4,10000032,53911762,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
...,...,...,...,...
377105,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377106,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377107,19999987,55368167,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377108,19999987,58621812,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...


In [6]:
# Load only_findings_df.csv.
only_finding_df = pd.read_csv(
    filepath_or_buffer='/data/csv/only_findings_df.csv',
)



In [7]:
# Rename the `id` column to `study_id` and convert its values to integers.
only_finding_df = only_finding_df.rename(columns={'id': 'study_id'})
# Remove the leading "s" only where it is actually present: blindly cutting the
# first character would silently corrupt an id that has no prefix.  Going
# through `astype(str)` also keeps the cell re-runnable once the column is int.
only_finding_df['study_id'] = (
    only_finding_df['study_id']
    .astype(str)
    .str.removeprefix('s')
    .astype(int)
)

# Inner join on study_id: rows without a match on either side are discarded.
jpg_path_fingings = paths_df.merge(only_finding_df, on='study_id', how='inner')
jpg_path_fingings

Unnamed: 0,subject_id_x,study_id,study_path,jpg_path,No Finding
0,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
1,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
2,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
3,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
4,10000032,53911762,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
...,...,...,...,...,...
377090,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377091,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377092,19999987,55368167,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
377093,19999987,58621812,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0


In [8]:
# Reverse the 0 and 1 in the No Finding column.  A dict passed to `replace`
# is applied as one simultaneous mapping, so no temporary sentinel value is
# needed (the former 0 -> 2 -> 1 detour would also have turned a genuine 2
# into 1).  Every other value is left untouched.
inverted_flag = jpg_path_fingings['No Finding'].replace({0: 1, 1: 0})
# Write the inverted values back and rename the column to Finding; the column
# keeps its position in the frame.
jpg_path_fingings = (
    jpg_path_fingings
    .assign(**{'No Finding': inverted_flag})
    .rename(columns={'No Finding': 'Finding'})
)
jpg_path_fingings

Unnamed: 0,subject_id_x,study_id,study_path,jpg_path,Finding
0,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
1,10000032,50414267,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
2,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
3,10000032,53189527,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
4,10000032,53911762,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
...,...,...,...,...,...
377090,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
377091,19999733,57132437,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
377092,19999987,55368167,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377093,19999987,58621812,/system/user/publicdata/MIMIC_CXR/MIMIC_CXR/ph...,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1


In [9]:
# Remove the subject_id_x and study_path columns; the Finding column keeps its name.
jpg_path_fingings = jpg_path_fingings.drop(columns=['subject_id_x', 'study_path'])
jpg_path_fingings.head()

Unnamed: 0,study_id,jpg_path,Finding
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,0


In [10]:
# Write jpg_path_fingings to /data/csv without the index column.
jpg_path_fingings.to_csv(
    path_or_buf='/data/csv/jpg_path_fingings.csv',
    index=False,
)

In [11]:
# jpg_path_all_findings is a new frame: jpg_path_fingings without its Finding column.
jpg_path_all_findings = jpg_path_fingings.drop(columns='Finding')
jpg_path_all_findings

Unnamed: 0,study_id,jpg_path
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
...,...,...
377090,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377091,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377092,55368167,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377093,58621812,/system/user/publicdata/MIMIC_CXR/hageneder/JP...


In [2]:
# Read the per-study label table from "/data/csv/all_findings_df.csv".
all_findings_df = pd.read_csv(
    filepath_or_buffer='/data/csv/all_findings_df.csv',
)

In [3]:
# Rename the id column to study_id and convert its values to integers.
all_findings_df = all_findings_df.rename(columns={'id': 'study_id'})
# Strip the leading "s" only where it is present instead of always cutting the
# first character; `astype(str)` keeps the cell re-runnable after the column
# has already been converted to int.
all_findings_df['study_id'] = (
    all_findings_df['study_id']
    .astype(str)
    .str.removeprefix('s')
    .astype(int)
)
# Replace every -1 and every NaN in the frame with 0.
# NOTE(review): -1 presumably marks an uncertain label, so this treats
# "uncertain" as "absent" -- confirm that this is intended.
all_findings_df = all_findings_df.replace(-1, 0).fillna(0)



In [33]:
# Show the row at position 140301.
# NOTE(review): the recorded output lists only `study_id` and `Findings`, and
# this cell's execution count is higher than that of the cell which creates
# the `Findings` column, so it was evidently run after that cell.  On a
# top-to-bottom run it displays the frame's columns as they are before the
# `Findings` column exists.
all_findings_df.iloc[140301]

study_id    50317974
Findings           9
Name: 140301, dtype: int64

In [32]:
# Convert all columns to int.
all_findings_df = all_findings_df.astype(int)
# Findings = row-wise sum of every column except study_id.
# NOTE(review): the summed columns include 'No Finding' (and 'Support
# Devices'), so a study flagged only as 'No Finding' gets Findings == 1 --
# confirm that this is the intended meaning of the count.
findings_per_study = all_findings_df.drop(columns='study_id').sum(axis=1)
# Keep only study_id and Findings.  Selecting the two wanted columns (rather
# than dropping a hard-coded list of label names) keeps the cell re-runnable
# and does not let unexpected extra columns slip through.
all_findings_df = all_findings_df.assign(Findings=findings_per_study)[['study_id', 'Findings']]

In [36]:
# Merge all_findings_df into jpg_path_all_findings on the study_id column.
# Only the two input columns are taken from the left frame, so running this
# cell again re-creates the same result instead of stacking suffixed
# Findings_x / Findings_y columns on top of the previous merge.
jpg_path_all_findings = (
    jpg_path_all_findings[['study_id', 'jpg_path']]
    .merge(all_findings_df, on='study_id', how='inner')
)
jpg_path_all_findings

Unnamed: 0,study_id,jpg_path,Findings
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
...,...,...,...
377090,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377091,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377092,55368167,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,1
377093,58621812,/system/user/publicdata/MIMIC_CXR/hageneder/JP...,2


In [37]:
# Write jpg_path_all_findings to /data/csv without the index column.
jpg_path_all_findings.to_csv(
    path_or_buf='/data/csv/jpg_path_all_findings.csv',
    index=False,
)

In [41]:
# For the first 100000 rows, count how often each value of the Findings column occurs.
x = jpg_path_all_findings.iloc[:100000]
print(x.loc[:, 'Findings'].value_counts())

Findings
1    56355
2    20921
3    11016
4     5735
0     3280
5     2072
6      522
7       89
8       10
Name: count, dtype: int64


# Prepare the whole dataset


In [4]:
# Reload the label table from /data/csv/all_findings_df.csv.
all_findings_df = pd.read_csv(
    filepath_or_buffer='/data/csv/all_findings_df.csv',
)

In [5]:
# Missing values and -1 entries both become 0.  A single fillna/replace pair
# is enough: replacing 0.0 with 0 and 1.0 with 1 changes nothing, and the
# final astype(int) below takes care of the dtype.
all_findings_df = all_findings_df.fillna(0).replace(-1.0, 0)
# Convert the id column to integers, removing a leading "s" where present
# (vectorised; `astype(str)` keeps the cell re-runnable once id is int).
all_findings_df['id'] = (
    all_findings_df['id']
    .astype(str)
    .str.removeprefix('s')
    .astype(int)
)
# Every column is an integer column from here on.
all_findings_df = all_findings_df.astype(int)


In [6]:
# Use the same key name as the image-path table: id -> study_id.
all_findings_df = all_findings_df.rename({'id': 'study_id'}, axis='columns')
all_findings_df

Unnamed: 0,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,No Finding
0,50002405,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,50003651,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,50006246,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3,50008565,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,50008601,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,59992045,0,0,0,0,0,0,0,0,1,0,0,0,1,0
227823,59995675,1,1,0,0,0,0,0,0,1,0,0,0,1,0
227824,59995853,0,0,0,0,0,0,0,0,0,0,0,0,0,1
227825,59997822,0,1,0,1,0,0,0,0,1,0,0,0,1,0


In [17]:
# Read jpg_path_fingings.csv from /data/csv and discard its Finding column.
jpg_path = pd.read_csv('/data/csv/jpg_path_fingings.csv').drop(columns=['Finding'])
jpg_path

Unnamed: 0,study_id,jpg_path
0,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
1,50414267,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
2,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
3,53189527,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
4,53911762,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
...,...,...
377090,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377091,57132437,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377092,55368167,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377093,58621812,/system/user/publicdata/MIMIC_CXR/hageneder/JP...


In [18]:
# Attach the image paths to the per-study labels (inner join on study_id).
jpg_path_all_sym = pd.merge(all_findings_df, jpg_path, on='study_id', how='inner')


In [19]:
# Display the merged label/image-path table (the rendered view is truncated).
jpg_path_all_sym

Unnamed: 0,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,No Finding,jpg_path
0,50002405,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
1,50003651,0,0,0,0,1,0,0,0,0,0,0,0,0,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
2,50003651,0,0,0,0,1,0,0,0,0,0,0,0,0,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
3,50003651,0,0,0,0,1,0,0,0,0,0,0,0,0,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
4,50006246,1,0,0,0,0,0,0,0,1,0,0,0,1,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377090,59995853,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377091,59997822,0,1,0,1,0,0,0,0,1,0,0,0,1,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377092,59998558,0,0,0,0,0,0,0,1,0,0,0,0,0,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...
377093,59998558,0,0,0,0,0,0,0,1,0,0,0,0,0,0,/system/user/publicdata/MIMIC_CXR/hageneder/JP...


In [20]:
# Save the binary all-findings table to /data/csv without the index column.
jpg_path_all_sym.to_csv(
    path_or_buf='/data/csv/jpg_path_all_sym.csv',
    index=False,
)