# Imports

In [41]:
import pandas as pd
import os
import shutil
import mapply

mapply.init(
    n_workers=-1,
    chunk_size=100,
    max_chunks_per_worker=8,
    progressbar=False
)

Purpose of this script is to arrange any dataset into imagenet format. Imagenet format is commonly used for CV benchmarks

This file will assume you have a .csv file with a column "PATH"

TODO: Bake in assumption that user has "Class", "Label" columns

In [30]:
path_to_ds_file = "/zfs/wficai/chexpert/chexpertchestxrays-u20210408/train_cheXbert.csv"
destination_directory = "/fastscratch/jplineb/chexpert_imagenet"

# Read DF

In [7]:
df = pd.read_csv(path_to_ds_file)
df.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,,,,,,,,,0.0,,,,1.0,1.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,,,1.0,,,-1.0,,,,,,1.0,,
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,,,1.0,,,-1.0,,,,,,1.0,,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223414 entries, 0 to 223413
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Path                        223414 non-null  object 
 1   Sex                         223414 non-null  object 
 2   Age                         223414 non-null  int64  
 3   Frontal/Lateral             223414 non-null  object 
 4   AP/PA                       191027 non-null  object 
 5   Enlarged Cardiomediastinum  45191 non-null   float64
 6   Cardiomegaly                50638 non-null   float64
 7   Lung Opacity                108503 non-null  float64
 8   Lung Lesion                 12268 non-null   float64
 9   Edema                       86512 non-null   float64
 10  Consolidation               71894 non-null   float64
 11  Pneumonia                   27556 non-null   float64
 12  Atelectasis                 69008 non-null   float64
 13  Pneumothorax  

# Create utility functions

In [76]:
def generate_destination_directory(row, class_name, dest_dir, split="train"):
    class_label = int(row[class_name])
    dest_path = f"{dest_dir}/{split}/{class_name}/{class_label}"
    return dest_path    

def move_files_from_df(row, origin_col, dest_col):
    # Grab dirs
    origin_path = "/zfs/wficai/chexpert/chexpertchestxrays-u20210408/" + row[origin_col]
    origin_file = os.path.basename(origin_path)
    dest_path = row[dest_col] + origin_file
    # Move files
    ## Make directory if not exists
    dest_dir = os.path.dirname(dest_path)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        
    shutil.copy(origin_path, dest_path)
    
    return None   

# Lets just try one diagnosis

## Lung Edema

In [14]:
## Lung Edema
df.Edema.value_counts()

 1.0    53058
 0.0    21243
-1.0    12211
Name: Edema, dtype: int64

In [23]:
df_edema  = df[df.Edema.notna()]
df_edema

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,
16,CheXpert-v1.0/train/patient00009/study1/view1_...,Male,76,Frontal,PA,,1.0,,,0.0,,,1.0,,,,,,
17,CheXpert-v1.0/train/patient00009/study1/view2_...,Male,76,Lateral,,,1.0,,,0.0,,,1.0,,,,,,
24,CheXpert-v1.0/train/patient00011/study7/view1_...,Female,19,Frontal,AP,,,1.0,,0.0,,,,,,,,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223402,CheXpert-v1.0/train/patient64532/study1/view1_...,Female,52,Frontal,AP,,,,,0.0,0.0,,,,0.0,,,1.0,1.0
223403,CheXpert-v1.0/train/patient64533/study1/view1_...,Male,75,Frontal,AP,,1.0,,,1.0,,,,,1.0,,,1.0,
223407,CheXpert-v1.0/train/patient64536/study2/view1_...,Female,61,Frontal,AP,,,,,1.0,,,,,1.0,,,,
223408,CheXpert-v1.0/train/patient64536/study1/view1_...,Female,61,Frontal,AP,,,,,1.0,,,1.0,,,,,1.0,


In [34]:
# Generate Destination Directories
df_edema["destination_path"] = df_edema.apply(
    generate_destination_directory,
    class_name="Edema",
    dest_dir = destination_directory,
    split="train",
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_edema["destination_path"] = df_edema.apply(


In [35]:
df_edema

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding,destination_path
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,,1.0,1.0,,-1.0,-1.0,,-1.0,,-1.0,,1.0,,,/fastscratch/jplineb/chexpert_imagenet/train/E...
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,,,,,1.0,,,,0.0,,,,,,/fastscratch/jplineb/chexpert_imagenet/train/E...
16,CheXpert-v1.0/train/patient00009/study1/view1_...,Male,76,Frontal,PA,,1.0,,,0.0,,,1.0,,,,,,,/fastscratch/jplineb/chexpert_imagenet/train/E...
17,CheXpert-v1.0/train/patient00009/study1/view2_...,Male,76,Lateral,,,1.0,,,0.0,,,1.0,,,,,,,/fastscratch/jplineb/chexpert_imagenet/train/E...
24,CheXpert-v1.0/train/patient00011/study7/view1_...,Female,19,Frontal,AP,,,1.0,,0.0,,,,,,,,1.0,,/fastscratch/jplineb/chexpert_imagenet/train/E...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223402,CheXpert-v1.0/train/patient64532/study1/view1_...,Female,52,Frontal,AP,,,,,0.0,0.0,,,,0.0,,,1.0,1.0,/fastscratch/jplineb/chexpert_imagenet/train/E...
223403,CheXpert-v1.0/train/patient64533/study1/view1_...,Male,75,Frontal,AP,,1.0,,,1.0,,,,,1.0,,,1.0,,/fastscratch/jplineb/chexpert_imagenet/train/E...
223407,CheXpert-v1.0/train/patient64536/study2/view1_...,Female,61,Frontal,AP,,,,,1.0,,,,,1.0,,,,,/fastscratch/jplineb/chexpert_imagenet/train/E...
223408,CheXpert-v1.0/train/patient64536/study1/view1_...,Female,61,Frontal,AP,,,,,1.0,,,1.0,,,,,1.0,,/fastscratch/jplineb/chexpert_imagenet/train/E...


In [77]:
# Move data over to fast_scratch
df_edema.mapply(
    move_files_from_df,
    origin_col = "Path",
    dest_col = "destination_path",
    axis = 1
)

  0%|          | 0/448 [00:00<?, ?it/s]

1         None
4         None
16        None
17        None
24        None
          ... 
223402    None
223403    None
223407    None
223408    None
223411    None
Length: 86512, dtype: object