In [1]:
import csv
import gzip
import os
import shutil
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
import pylab
import skimage.io as io
from skimage.external import tifffile
from skimage.transform import downscale_local_mean, resize
from tqdm import tqdm_notebook as tqdm

%matplotlib inline

In [2]:
def downscale_image_padded(img, new_shape):
    """Resize a 2-D image to fit inside ``new_shape`` and zero-pad the rest.

    The image is scaled uniformly so that the axis which needs the strongest
    shrink exactly fills its target size; the other axis is scaled by the same
    factor (truncated to an integer, but never below 1 pixel).  The resized
    image is placed in the top-left corner of a zero-filled canvas of shape
    ``new_shape``.

    Parameters
    ----------
    img : ndarray
        2-D image array (only ``img.shape[0]`` and ``img.shape[1]`` are used).
    new_shape : sequence of two ints
        Target ``(rows, cols)`` of the returned array.

    Returns
    -------
    ndarray
        Array of shape ``new_shape`` and dtype ``float32``.  The value range is
        whatever ``resize`` produces for the input dtype.
    """
    # Per-axis shrink factors; the axis with the larger factor is the limiting
    # one and will span its full target dimension.
    row_ratio = img.shape[0] * 1.0 / new_shape[0]
    col_ratio = img.shape[1] * 1.0 / new_shape[1]
    if row_ratio >= col_ratio:
        limiting_dim, other_dim = 0, 1
    else:
        limiting_dim, other_dim = 1, 0

    interm_shape = list(new_shape)
    # Scale the other axis by the same factor as the limiting axis, i.e.
    # new_shape[limiting_dim] / img.shape[limiting_dim].  Using the limiting
    # axis' target size (not the other axis') keeps the aspect ratio for
    # non-square targets; for square targets the result is unchanged.
    scaled = int(img.shape[other_dim] * 1.0 / img.shape[limiting_dim]
                 * new_shape[limiting_dim])
    # Guard against a 0-sized axis for extremely elongated inputs.
    interm_shape[other_dim] = max(1, scaled)

    interm_img = resize(img, interm_shape)

    # Zero canvas of the requested size; the resized image goes top-left.
    new_img = np.zeros(new_shape, dtype=interm_img.dtype)
    new_img[:interm_shape[0], :interm_shape[1]] = interm_img

    return new_img.astype('float32')
def clean_df(df):
    """Return a new DataFrame equal to ``df`` with every NaN replaced by 0.

    Only missing values are touched: negative entries (such as -1.0) are
    passed through unchanged, and ``df`` itself is not modified.
    """
    # A binarisation step (value > 0 on the columns from the third onward)
    # existed in an earlier variant and is intentionally not applied here.
    return df.fillna(0)

## Preprocessing even more data
This time, preprocess every study from the next batch of patients until we get to 80,000 more images.

## Look at the data

In [3]:
# Load the per-image metadata table (columns: dicom_id, study_id, subject_id).
metadata = pd.read_csv('metadata.csv')
# Use the fully-qualified option names: the short forms 'max_rows' and
# 'max_columns' are resolved by pattern matching and become ambiguous (raising
# OptionError) on pandas versions that also define styler.render.max_rows /
# styler.render.max_columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
metadata.head(100)

Unnamed: 0,dicom_id,study_id,subject_id
0,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,50414267,10000032
1,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,50414267,10000032
2,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,53189527,10000032
3,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,53189527,10000032
4,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,53911762,10000032
5,fffabebf-74fd3a1f-673b6b41-96ec0ac9-2ab69818,53911762,10000032
6,ab988de4-f0b01276-2890a173-49b72df5-2c4202c1,56699142,10000032
7,ea030e7a-2e3b1346-bc518786-7a8fd698-f673b44c,56699142,10000032
8,096052b7-d256dc40-453a102b-fa7d01c6-1b22c6b4,57375967,10000764
9,b79e55c3-735ce5ac-64412506-cdc9ea79-f1af521f,57375967,10000764


In [4]:
# Load the array of patient ids from disk (presumably already shuffled, going
# by the file name -- TODO confirm).
patient_ids = np.load('patient_ids_shuffled.npy')

In [6]:
# Make the new list of patient_ids: this run handles the slice of the loaded
# array at positions 5000 through 9999.
final_patient_ids = patient_ids[5000:10000]

In [7]:
# Read the training and validation label tables from CSV.
train = pd.read_csv('train.csv')
valid = pd.read_csv('valid.csv')

In [8]:
# Replace NaNs with 0 in both tables, then stack them row-wise into one frame.
train = clean_df(train)
valid = clean_df(valid)
# NOTE(review): concat keeps each frame's own index, so index labels may repeat
# between the train and valid rows of total_df -- confirm no label-based lookup
# spans both splits.
total_df = pd.concat((train,valid),axis=0)
total_df.tail(10)

Unnamed: 0,path,view,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Airspace Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices
2722,valid/p19941474/s29/view2_lateral.jpg,lateral,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2723,valid/p19941474/s29/view3_lateral.jpg,lateral,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2724,valid/p19941474/s30/view1_frontal.jpg,frontal,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2725,valid/p19941474/s31/view1_frontal.jpg,frontal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2726,valid/p19941474/s32/view1_frontal.jpg,frontal,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2727,valid/p19941474/s32/view2_lateral.jpg,lateral,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2728,valid/p19941474/s33/view1_frontal.jpg,frontal,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2729,valid/p19941474/s34/view1_frontal.jpg,frontal,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2730,valid/p19941474/s34/view2_lateral.jpg,lateral,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2731,valid/p19941474/s34/view3_lateral.jpg,lateral,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Download every patient's folder from the MIMIC-CXR bucket, keep one image per
# study (the first PA view found, otherwise the last file read) as a 224x224
# TIFF, move the study's report next to it, and accumulate one label row per
# saved image.  The patient's folder is deleted once it has been processed.
restart = False  # if True, resume from restart_index and keep the i set below
restart_index = 3009  # the position in final_patient_ids that we ended on
i = 10318  # running image counter used in output file names when restarting

# Parameter names
save_img_path = 'data/tiffs_7'
labels_path = 'data/labels_7.npy'
reports_path = 'data/reports_7'

# makedirs (rather than mkdir) so a missing parent 'data' folder is created too.
for out_dir in (save_img_path, reports_path):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

pids = []
sids = []

if not restart:
    i = 0  # counter
    final_patient_ids_array = final_patient_ids.copy()
    labels = np.zeros((0,14))
else:
    # NOTE(review): labels are only checkpointed when i % 500 == 0, so the
    # array loaded here can hold fewer rows than the hard-coded i implies --
    # confirm both agree before resuming.
    final_patient_ids_array = final_patient_ids[restart_index:]
    labels = np.load(labels_path)

for patient_id in tqdm(final_patient_ids_array):
    patient_dir = 'data/p' + str(patient_id)
    try:
        subprocess.check_call('gsutil -u seraphic-cocoa-239517 -m cp -r gs://physionet-data-mimic-cxr/files/p'+str(patient_id) + ' data/', shell=True)
    except (subprocess.CalledProcessError, OSError):
        # Download failed: skip this patient.  Only these errors are caught so
        # that an interrupt still stops the run.
        continue

    # All studies of this patient according to the metadata table.
    study_ids = np.unique(np.array(metadata[metadata['subject_id']==patient_id]['study_id'])).astype('int')
    # Label rows whose path mentions this patient.
    a = total_df[total_df['path'].str.contains('p'+str(patient_id))]

    # Study number of each label row, parsed from '<split>/p<id>/s<n>/<file>'.
    studs = [int(z.split('/')[2][1:]) for z in a['path'].tolist()]
    unique_studs = np.unique(studs)
    # NOTE(review): labels are matched to studies purely by position -- the k-th
    # processed study gets the k-th distinct study number of the label table.
    # Skipped studies do not advance the position; confirm this pairing.
    indexe = 0
    for study_id in study_ids:
        study_dir = os.path.join(patient_dir, 's' + str(study_id))
        try:
            filenames = os.listdir(study_dir)
        except OSError:
            # Study folder was not downloaded.
            continue

        # Read files until a PA view is found; otherwise keep the last one read.
        imgg = None
        for fnom in filenames:
            with gzip.open(os.path.join(study_dir, fnom)) as f:
                imgg = pydicom.dcmread(f)
            if getattr(imgg, 'ViewPosition', None) == 'PA':
                break
        if imgg is None:
            # Empty study folder: never fall back to the previous study's image.
            continue

        # Look the label up before writing anything, so images, reports and
        # label rows always stay aligned.
        if indexe >= len(unique_studs):
            print('No label row for patient %s, study %s; skipping' % (patient_id, study_id))
            continue
        label_row = studs.index(unique_studs[indexe])
        # Explicit float64 keeps `labels` a plain numeric array; a row taken
        # from the mixed-type frame would otherwise come out as object dtype.
        target = np.asarray(a.iloc[label_row][a.columns[2:]], dtype='float64')

        img = np.asarray(imgg.pixel_array).astype('uint16')
        d_img = downscale_image_padded(img, (224,224))

        tifffile.imsave(save_img_path+'/'+str(i)+'_'+str(study_id)+'.tiff',d_img)
        # shutil instead of shelling out to `mv`: portable, and a command string
        # passed without shell=True cannot be executed on POSIX.
        shutil.move(patient_dir+'/s'+str(study_id)+'.txt',
                    reports_path + '/' +str(i)+'_'+str(study_id)+'.txt')

        labels = np.concatenate((labels, np.expand_dims(target,0)),axis=0)
        indexe += 1

        # Periodic checkpoint of the labels collected so far.
        if i % 500 == 0:
            np.save(labels_path, labels)
        i += 1

    # Finally delete the downloaded patient folder.
    shutil.rmtree(patient_dir)
    
    

HBox(children=(IntProgress(value=0, max=5000), HTML(value=u'')))

  warn("The default mode, 'constant', will be changed to 'reflect' in "
  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "





In [10]:
# Sanity check: the number of saved images should equal the number of reports.
n_images = len(os.listdir(save_img_path))
n_reports = len(os.listdir(reports_path))
print(n_images, n_reports)

(17784, 17784)


In [11]:
# Write the complete label array to disk (the image array is no longer kept in
# memory, hence the disabled line below).
# np.save(numpy_path,imgs)
np.save(labels_path,labels)

## Convert to a tar.gz file 

In [12]:
import tarfile

# Pack every file of the image folder into '<folder>.tar.gz', one entry per file.
final_dir = save_img_path
fnames = os.listdir(final_dir)
archive_path = final_dir+'.tar.gz'
with tarfile.open(archive_path, "w:gz") as tar:
    for fname in tqdm(fnames):
        tar.add(os.path.join(final_dir, fname))

HBox(children=(IntProgress(value=0, max=17784), HTML(value=u'')))


