In [13]:
from helper_tools .tciaclient import TCIAClient
from helper_tools.dicom_tools import dcmread_image

from config import config

import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
import json
import sys
import math
import errno
import pydicom as dicom
import urllib.request, urllib.error, urllib.parse, urllib.request, urllib.parse, urllib.error,sys





In [14]:
#directories for data and other files

PACKAGE_ROOT = config.PACKAGE_ROOT
TRAIN_DIR = config.TRAIN_DIR
TEST_DIR = config.TEST_DIR


In [15]:
#initializing tcia api client for easy queries
tcia_client = TCIAClient(baseUrl="https://services.cancerimagingarchive.net/services/v4",resource = "TCIA")

In [16]:
#reading in metadata including file paths
df_paths = pd.read_csv(TRAIN_DIR / 'file-paths-train.csv')

In [17]:
df_paths.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596 entries, 0 to 2595
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   PatientID         2596 non-null   object
 1   StudyUID          2596 non-null   object
 2   View              2596 non-null   object
 3   descriptive_path  2596 non-null   object
 4   classic_path      2596 non-null   object
dtypes: object(5)
memory usage: 101.5+ KB


In [18]:
df_paths.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...


In [19]:
df_paths['id'] = df_paths['classic_path'].apply(lambda x: x.split('/')[3])

In [20]:
df_paths.head()

Unnamed: 0,PatientID,StudyUID,View,descriptive_path,classic_path,id
0,DBT-P00013,DBT-S00163,rmlo,Breast-Cancer-Screening-DBT/DBT-P00013/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00013/1.2.826...,1.2.826.0.1.3680043.8.498.97979602815077649368...
1,DBT-P00023,DBT-S04378,lcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.12136582480949936067...
2,DBT-P00023,DBT-S04378,lmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.90045035130681803298...
3,DBT-P00023,DBT-S04378,rcc,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.10822555886306795549...
4,DBT-P00023,DBT-S04378,rmlo,Breast-Cancer-Screening-DBT/DBT-P00023/01-01-2...,Breast-Cancer-Screening-DBT/DBT-P00023/1.2.826...,1.2.826.0.1.3680043.8.498.29938515490857039234...


In [21]:
# funciton to download DICOM images from 
def image_download(df, start, end):
    for i in range(start, end):
        tcia_client.get_image(
            seriesInstanceUid = df.iloc[i,5], 
            file_path = TRAIN_DIR / str(df.iloc[i,3])
        )

In [22]:
#testing download of first two images
image_download(df_paths,0,2)

In [25]:
def view_image(df, row):
    
    view_series = df.iloc[row]
    view = view_series["View"]
    image_path = TRAIN_DIR / str(view_series["descriptive_path"])
    
    assert os.path.exists(image_path), 'image does not exist'
                    
    image = dcmread_image(fp=image_path, view=view)
    plt.imshow(image[0], cmap=plt.cm.gray);

In [26]:
view_image(df_paths,10)

AssertionError: image does not exist

In [15]:
response = urllib.request.Request('https://services.cancerimagingarchive.net/services/v4/TCIA/query/getCollectionValues')
resp = urllib.request.urlopen(response)
resp_json = json.loads(resp.read())
df_collections = pd.json_normalize(resp_json)

In [29]:
df_collections

Unnamed: 0,Collection
0,TCGA-GBM
1,LIDC-IDRI
2,BREAST-DIAGNOSIS
3,PROSTATE-MRI
4,PROSTATE-DIAGNOSIS
...,...
101,PDMR-292921-168-R
102,Prostate-MRI-US-Biopsy
103,DRO-Toolkit
104,COVID-19


In [28]:
mask = ['Breast' in x for x in df_collections['Collection']]
df_collections[mask]

Unnamed: 0,Collection
8,RIDER Breast MRI
39,QIN Breast DCE-MRI
57,Breast-MRI-NACT-Pilot
67,ACRIN-FLT-Breast


In [None]:
Breast-Cancer-Screening-DBT	DBT-P00029	1.2.826.0.1.3680043.8.498.10501549244263206170119288752475799160	1.2.826.0.1.3680043.8.498.63435394700083659941067595830930930080	59.8 MB	1	0.0	Not Started

In [34]:
response2 = urllib.request.Request('https://services.cancerimagingarchive.net/services/v4/TCIA/query/getPatientStudy?PatientID=DBT-P00029')
resp2 = urllib.request.urlopen(response)


In [32]:
resp2.getcode()

200

In [35]:
resp2.read()

b'[]'