In [59]:
import pandas as pd
import os
import shutil

### Initial Reshape - Filter for a selection of findings and cap the number of images kept for each finding class

In [54]:

# folders =['No Finding','Infiltration','Atelectasis','Effusion','Nodule','Pneumothorax','Pneumonia','Mass','Consolidation','Pleural_Thickening','Cardiomegaly','Fibrosis','Edema','Emphysema','Multiple']
# Single-label classes to keep. The directory-creation cell further down makes
# one sub-directory per entry of this list.
findings_of_interest = ['Fibrosis', 'Effusion', 'Mass', 'Infiltration', 'Cardiomegaly', 'No Finding']
# Upper bound on the number of rows kept for each class.
MAX_NUMBER_OF_EACH_FINDING = 5270

df = pd.read_csv('Data_Entry_2017_v2020.csv')
df = df[df['Patient Age'] < 100]  # drop rows with a recorded age of 100 or more
df = df[df['View Position'] == 'PA']  # filter just for PA view images (exclude AP)
print(f"Shape of full dataset: {df.shape}")

# Exact-match membership test: keeps rows whose label is exactly one of the
# classes of interest. Compared with a regex made by joining the labels with
# "|", this is immune to regex metacharacters in a label, and a missing label
# is simply not selected instead of producing a NaN entry in the mask.
filtered_df = df[df['Finding Labels'].isin(findings_of_interest)]
print(f"NUMBER OF PA STUDIES WITH FINDING\n{ filtered_df['Finding Labels'].value_counts()}")

# Keep at most MAX_NUMBER_OF_EACH_FINDING rows per class (the first ones in
# file order); classes with fewer rows are kept whole.
final_df = filtered_df.groupby('Finding Labels').head(MAX_NUMBER_OF_EACH_FINDING)
print(f"Shape of final dataset: {final_df.shape}")
final_df.to_csv("NIH_filtered_subset.csv")
print(f"FINAL NUMBER\n{ final_df['Finding Labels'].value_counts()}")

# Folder, relative to the working directory, used by the later cells as the
# root of the per-class image sub-directories.
destination_path = 'dicom/NIH_subset'



Shape of full dataset: (67310, 11)
NUMBER OF PA STUDIES WITH FINDING
No Finding      39302
Infiltration     5270
Effusion         2086
Mass             1367
Cardiomegaly      746
Fibrosis          648
Name: Finding Labels, dtype: int64
Shape of final dataset: (15387, 11)
FINAL NUMBER
No Finding      5270
Infiltration    5270
Effusion        2086
Mass            1367
Cardiomegaly     746
Fibrosis         648
Name: Finding Labels, dtype: int64


Second Reshape - Keep only PA images labelled Fibrosis or No Finding (Normal); No Finding rows are restricted to first studies (no follow-ups)

In [84]:

# folders =['No Finding','Infiltration','Atelectasis','Effusion','Nodule','Pneumothorax','Pneumonia','Mass','Consolidation','Pleural_Thickening','Cardiomegaly','Fibrosis','Edema','Emphysema','Multiple']
# Classes kept by this reshape. The directory-creation cell further down makes
# one sub-directory per entry of this list.
findings_of_interest = ['Fibrosis', 'No Finding']
MAX_NUMBER_OF_EACH_FINDING = 6370  # this is 10x the number of fibrosis images

df = pd.read_csv('Data_Entry_2017_v2020.csv')
# Keep patients aged 18 to 84 inclusive (85 and above are dropped by the `<`).
df = df[df['Patient Age'] < 85]
df = df[df['Patient Age'] >= 18]
df = df[df['View Position'] == 'PA']  # filter just for PA view images (exclude AP)

# The follow-up restriction is applied to 'No Finding' rows only, not to the
# whole frame: every Fibrosis row is kept, while a 'No Finding' row is kept
# only when it is the first study (Follow-up # == 0).
is_fibrosis = df['Finding Labels'] == 'Fibrosis'
is_first_study_normal = (df['Finding Labels'] == 'No Finding') & (df['Follow-up #'] == 0)
filtered_df = df[is_fibrosis | is_first_study_normal]

# Keep at most MAX_NUMBER_OF_EACH_FINDING rows per class (first ones in file order).
filtered_df = filtered_df.groupby('Finding Labels').head(MAX_NUMBER_OF_EACH_FINDING)

# The copy cell further down iterates `final_df`. Bind it here as well so that
# cell copies THIS subset into `destination_path`, rather than a stale
# `final_df` left over from the first reshape, whose labels have no matching
# sub-directory under this destination.
final_df = filtered_df

print(f"Shape of final dataset: {filtered_df.shape}")
filtered_df.to_csv("NIH_1stPA_Norm_Fib.csv")
print(f"FINAL NUMBER\n{ filtered_df['Finding Labels'].value_counts()}")

# Folder, relative to the working directory, used by the later cells as the
# root of the per-class image sub-directories.
destination_path = 'dicom/NIH_1stPA_Norm_Fib'

Shape of final dataset: (7007, 11)
FINAL NUMBER
No Finding    6370
Fibrosis       637
Name: Finding Labels, dtype: int64


In [87]:
# Create the destination tree: <cwd>/<destination_path>/<finding>/ for every
# finding of interest.
working_dir = os.getcwd()
folder_path = os.path.join(working_dir, destination_path)

# makedirs(..., exist_ok=True) keeps this cell re-runnable (plain os.mkdir
# raises FileExistsError on the second run) and also creates any missing
# parent folder of destination_path.
# NOTE: an already-existing folder is reused as-is; files from an earlier run
# are not removed.
os.makedirs(folder_path, exist_ok=True)
for name in findings_of_interest:
    os.makedirs(os.path.join(folder_path, name), exist_ok=True)


In [89]:

# Copy every image listed in final_df from <cwd>/dicom/NIH_images into
# <folder_path>/<Finding Labels>/, keeping the original file name.

# Fail fast when final_df and the destination folders are out of sync: the
# sub-directories are created from findings_of_interest, so a row whose label
# is not in that list has nowhere to go and shutil.copy would raise
# FileNotFoundError part-way through the copy.
labels_without_folder = set(final_df['Finding Labels']) - set(findings_of_interest)
if labels_without_folder:
    raise ValueError(
        "final_df contains labels that have no destination folder under "
        f"{folder_path}: {sorted(map(str, labels_without_folder))}. "
        "final_df must come from the same reshape that set "
        "findings_of_interest and destination_path."
    )

source_dir = os.path.join(working_dir, 'dicom', 'NIH_images')
# Only two columns are needed, so zip them instead of building a Series per
# row with iterrows().
for image_name, label in zip(final_df['Image Index'], final_df['Finding Labels']):
    source = os.path.join(source_dir, image_name)
    dest = os.path.join(folder_path, label, image_name)
    shutil.copy(source, dest)

In [None]:
# import pathlib
# list(pathlib.Path(f"{os.getcwd()}/dicom/NIH_images").glob('*.png'))