In [1]:
from helper_tools .tciaclient import TCIAClient
from helper_tools.dicom_tools import dcmread_image

from config import config

import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
import json
import sys
import math
import errno
import pydicom as dicom
import urllib.request, urllib.error, urllib.parse, urllib.request, urllib.parse, urllib.error,sys






In [2]:
#directories for data and other files

PACKAGE_ROOT = config.PACKAGE_ROOT
TRAIN_DIR = config.TRAIN_DIR
TEST_DIR = config.TEST_DIR


In [3]:
#initializing tcia api client for easy queries
tcia_client = TCIAClient(baseUrl="https://services.cancerimagingarchive.net/services/v4",resource = "TCIA")

In [4]:
#reading in metadata including file paths
df_paths = pd.read_csv(TRAIN_DIR / 'file-paths-train.csv')

In [5]:
df_paths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596 entries, 0 to 2595
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PatientID         2596 non-null   object
 1   StudyUID          2596 non-null   object
 2   View              2596 non-null   object
 3   descriptive_path  2596 non-null   object
 4   classic_path      2596 non-null   object
dtypes: object(5)
memory usage: 101.5+ KB


In [6]:
df_paths.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...


In [7]:
df_paths['id'] = df_paths['classic_path'].apply(lambda x: x.split('/')[3])

In [8]:
df_paths.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,id
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,1.2.826.0.1.3680043.8.498.97979602815077649368...
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.90045035130681803298...
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.10822555886306795549...
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.29938515490857039234...


In [9]:
# funciton to download DICOM images from 
def image_download(df, start, end):
    for i in range(start, end):
        tcia_client.get_image(
            seriesInstanceUid = df.iloc[i,5], 
            file_path = TRAIN_DIR / str(df.iloc[i,3])
        )

In [10]:
#testing download of first two images
#image_download(df_paths,0,2)

In [11]:
def view_image(df, row):

    view_series = df.iloc[row]
    view = view_series["View"]
    image_path = TRAIN_DIR / str(view_series["descriptive_path"])
    
    assert os.path.exists(image_path), 'image does not exist'
    
    
    image = dcmread_image(fp=image_path, view=view)
    return image

def path_exists(df,row):
    view_series = df.iloc[row]
    view = view_series["View"]
    image_path = TRAIN_DIR / str(view_series["descriptive_path"])
    return os.path.exists(image_path)

In [12]:
#image = view_image(df_paths,10)

In [13]:
# response = urllib.request.Request('https://services.cancerimagingarchive.net/services/v4/TCIA/query/getCollectionValues')
# resp = urllib.request.urlopen(response)
# resp_json = json.loads(resp.read())
# df_collections = pd.json_normalize(resp_json)
# df_collections
# mask = ['Breast' in x for x in df_collections['Collection']]
# df_collections[mask]
# response2 = urllib.request.Request('https://services.cancerimagingarchive.net/services/v4/TCIA/query/getPatientStudy?PatientID=DBT-P00029')
# resp2 = urllib.request.urlopen(response)


In [14]:
df_target_train = pd.read_csv(TRAIN_DIR / 'BCS-DBT labels-train.csv')

In [15]:
df_target_train.head()

Unnamed: 0,PatientID,StudyUID,View,Normal,Actionable,Benign,Cancer
0,DBT-P00013,DBT-S00163,rmlo,0,0,1,0
1,DBT-P00023,DBT-S04378,lcc,0,1,0,0
2,DBT-P00023,DBT-S04378,lmlo,0,1,0,0
3,DBT-P00023,DBT-S04378,rcc,0,1,0,0
4,DBT-P00023,DBT-S04378,rmlo,0,1,0,0


In [16]:
df_paths = df_paths.merge(df_target_train.drop(['StudyUID','View'],axis=1), on='PatientID')

In [17]:
df_paths.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,id,Normal,Actionable,Benign,Cancer
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,1.2.826.0.1.3680043.8.498.97979602815077649368...,0,0,1,0
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
2,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
3,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
4,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0


In [18]:
images = [TRAIN_DIR / str(r[3]) if os.path.isfile(TRAIN_DIR / str(r[3])) else np.nan for i,r in df_paths.iterrows()]

In [19]:
images = pd.Series(images)
mask = ~images.isnull()
df_subset = df_paths[mask]
df_subset.head()


Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,id,Normal,Actionable,Benign,Cancer
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,1.2.826.0.1.3680043.8.498.97979602815077649368...,0,0,1,0
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
2,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
3,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0
4,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...,0,1,0,0


In [20]:
df_sub_target = df_subset.iloc[:,6:]
df_sub_target.head()

Unnamed: 0,Normal,Actionable,Benign,Cancer
0,0,0,1,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0


In [21]:
for i in range(0,df_sub_target.shape[1]):
    print(df_sub_target.iloc[:,i].unique())

[0 1]
[0 1]
[1 0]
[0 1]


In [22]:
from skimage.transform import resize

def dicom_to_numpy(view, path, file_num):
    
    im_path = config.TRAIN_DIR / path
    assert os.path.exists(im_path), 'image does not exist'
                    
    image = dcmread_image(fp=im_path, view=view)
    image = image.astype(float)
    image = image/65535.0
    
    scaled_image = resize(image, output_shape=(6,123,95), preserve_range=True,anti_aliasing=True)
    
    save_path = config.LOCAL_TRAIN_IMAGE_DIR / str(file_num)
    np.save(save_path, scaled_image)
    
    return save_path

In [None]:
numpy_paths = []
for i, r in df_subset.iterrows():
    numpy_paths.append(dicom_to_numpy(r[2],r[3],i))
    percent = float(i) / df_subset.shape[0]
    print(percent)
    
numpy_paths = pd.Series(numpy_paths)



0.0
0.0009337068160597573
0.0018674136321195146
0.0028011204481792717
0.003734827264239029
0.004668534080298786
0.0056022408963585435
0.006535947712418301
0.007469654528478058
0.008403361344537815
0.009337068160597572
0.01027077497665733
0.011204481792717087
0.012138188608776844
0.013071895424836602
0.014005602240896359
0.014939309056956116
0.015873015873015872
0.01680672268907563
0.017740429505135387
0.018674136321195144
0.0196078431372549
0.02054154995331466
0.021475256769374416
0.022408963585434174
0.02334267040149393
0.02427637721755369
0.025210084033613446
0.026143790849673203
0.02707749766573296
0.028011204481792718
0.028944911297852476
0.029878618113912233
0.03081232492997199
0.031746031746031744
0.032679738562091505
0.03361344537815126
0.03454715219421102
0.035480859010270774
0.036414565826330535
0.03734827264239029
0.03828197945845005
0.0392156862745098
0.040149393090569564
0.04108309990662932
0.04201680672268908
0.04295051353874883
0.04388422035480859
0.04481792717086835
0.04

In [None]:
numpy_paths.to_pickle(config.LOCAL_TRAIN_SUBSET)
df_sub_target.to_pickle(config.LOCAL_TRAIN_TARGET)