# Setup

In [None]:
# for CTDC
!pip install gql

# for TCIA
!pip install tcia_utils

In [18]:
import pandas as pd

# for TCIA
from tcia_utils import nbia
from tcia_utils import wordpress

# for GDC
import json
import requests
import io

# for CTDC
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# Get inventory of Radiology and Pathology data from TCIA

Find list of all Biobank, CPTAC and TCGA collections from TCIA collection manager.

In [3]:
fields = ['id', 'link', 'collection_short_title']

collections = wordpress.getCollections(format = 'df', fields = fields)

In [4]:
nci_programs = wordpress.searchDf(['TCGA', 'CPTAC', 'CMB'], collections, 'collection_short_title')

nci_programs = nci_programs.sort_values(by='collection_short_title').reset_index(drop=True)

nci_programs


Unnamed: 0,id,link,collection_short_title
0,41649,https://www.cancerimagingarchive.net/collectio...,CMB-AML
1,49063,https://www.cancerimagingarchive.net/collectio...,CMB-BRCA
2,41671,https://www.cancerimagingarchive.net/collectio...,CMB-CRC
3,41681,https://www.cancerimagingarchive.net/collectio...,CMB-GEC
4,41703,https://www.cancerimagingarchive.net/collectio...,CMB-LCA
5,41725,https://www.cancerimagingarchive.net/collectio...,CMB-MEL
6,41741,https://www.cancerimagingarchive.net/collectio...,CMB-MML
7,49073,https://www.cancerimagingarchive.net/collectio...,CMB-OV
8,41763,https://www.cancerimagingarchive.net/collectio...,CMB-PCA
9,44417,https://www.cancerimagingarchive.net/collectio...,CPTAC-AML


Get list of subjects that have radiology images.

In [5]:
nbia.getToken()

Enter User: 


 kirbyju
Enter Password:  ········


200

In [6]:
radiology = nbia.getPatient(format = 'df')

In [7]:
radiology

Unnamed: 0,PatientId,PatientName,PatientSex,Collection,Phantom,SpeciesCode,SpeciesDescription,EthnicGroup,PatientBirthDate
0,TCGA-08-0244,,M,TCGA-GBM,NO,337915000,Homo sapiens,,
1,TCGA-08-0246,,F,TCGA-GBM,NO,337915000,Homo sapiens,,
2,TCGA-08-0348,,M,TCGA-GBM,NO,337915000,Homo sapiens,,
3,TCGA-08-0349,,M,TCGA-GBM,NO,337915000,Homo sapiens,,
4,TCGA-08-0350,,M,TCGA-GBM,NO,337915000,Homo sapiens,,
...,...,...,...,...,...,...,...,...,...
48711,AP-6M60,AP-6M60,F,APOLLO-5-LSCC,NO,337915000,Homo sapiens,,
48712,AP-6H6G,AP-6H6G,M,APOLLO-5-LSCC,NO,337915000,Homo sapiens,,
48713,AP-95DK,AP-95DK,M,APOLLO-5-LSCC,NO,337915000,Homo sapiens,,
48714,AP-9GTR,AP-9GTR,M,APOLLO-5-LSCC,NO,337915000,Homo sapiens,,


Get list of subjects that have pathology images.

In [8]:
# Define the URL of the CSV file
url = "https://pathdb.cancerimagingarchive.net/system/files/collectionmetadata/202401/cohort_builder_v1_01-16-2024.csv"

try:
  # Download and read the CSV data
  pathology = pd.read_csv(url)

  # Drop unwanted columns
 # df = df[['collection', 'collection_doi', 'patient_id', 'slide_id']]

  # Create a new column with the base URL and slide ID
  pathology['viewer_url'] = "https://pathdb.cancerimagingarchive.net/caMicroscope/apps/mini/viewer.html?mode=pathdb&slideid=" + pathology['slide_id'].astype(str)

  # Prefix collection_doi with https://doi.org/
  pathology['collection_doi'] = "https://doi.org/" + pathology['collection_doi'].astype(str)

  print("Successfully processed data into DataFrame.")
except Exception as e:
  print(f"An error occurred: {e}")

Successfully processed data into DataFrame.


  pathology = pd.read_csv(url)


In [9]:
pathology

Unnamed: 0,collection,collection_doi,patient_id,slide_id,view,camic_id,has_radiology,has_genomics,has_proteomics,species,...,cancer_location,data_format,supporting_data_type,modality,protocol,par,magnification,access,update,viewer_url
0,CMB-AML,https://doi.org/10.7937/PCTE-6M66,MSB-01723,MSB-01723-10-02,View,226264,yes,no,no,Human,...,Blood,SVS,Clinical,Whole slide image,Hematoxylin and eosin,"(0.252,0.252)mpp",40x,Public,2024-08-30,https://pathdb.cancerimagingarchive.net/caMicr...
1,CMB-AML,https://doi.org/10.7937/PCTE-6M66,MSB-01723,MSB-01723-04-02,View,226265,yes,no,no,Human,...,Blood,SVS,Clinical,Whole slide image,Hematoxylin and eosin,"(0.252,0.252)mpp",40x,Public,2024-08-30,https://pathdb.cancerimagingarchive.net/caMicr...
2,CMB-AML,https://doi.org/10.7937/PCTE-6M66,MSB-01723,MSB-01723-09-06,View,226266,yes,no,no,Human,...,Blood,SVS,Clinical,Whole slide image,Hematoxylin and eosin,"(0.252,0.252)mpp",40x,Public,2024-08-30,https://pathdb.cancerimagingarchive.net/caMicr...
3,CMB-AML,https://doi.org/10.7937/PCTE-6M66,MSB-02960,MSB-02960-04-02,View,311781,no,no,no,Human,...,Blood,SVS,Clinical,Whole slide image,Hematoxylin and eosin,"(0.252,0.252)mpp",40x,Public,2024-08-30,https://pathdb.cancerimagingarchive.net/caMicr...
4,CMB-AML,https://doi.org/10.7937/PCTE-6M66,MSB-02960,MSB-02960-08-02,View,311782,no,no,no,Human,...,Blood,SVS,Clinical,Whole slide image,Hematoxylin and eosin,"(0.252,0.252)mpp",40x,Public,2024-08-30,https://pathdb.cancerimagingarchive.net/caMicr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305767,PROSTATE-MRI,https://doi.org/10.7937/K9/TCIA.2016.6046GUDv,MIP-PROSTATE-01-0022,MIP-PROSTATE-01-0022,View,2081,yes,no,no,Human,...,Prostate,JPG,,,,,,Public,2011-06-30,https://pathdb.cancerimagingarchive.net/caMicr...
305768,PROSTATE-MRI,https://doi.org/10.7937/K9/TCIA.2016.6046GUDv,MIP-PROSTATE-01-0023,MIP-PROSTATE-01-0023,View,2082,yes,no,no,Human,...,Prostate,JPG,,,,,,Public,2011-06-30,https://pathdb.cancerimagingarchive.net/caMicr...
305769,PROSTATE-MRI,https://doi.org/10.7937/K9/TCIA.2016.6046GUDv,MIP-PROSTATE-01-0024,MIP-PROSTATE-01-0024,View,2083,yes,no,no,Human,...,Prostate,JPG,,,,,,Public,2011-06-30,https://pathdb.cancerimagingarchive.net/caMicr...
305770,PROSTATE-MRI,https://doi.org/10.7937/K9/TCIA.2016.6046GUDv,MIP-PROSTATE-01-0025,MIP-PROSTATE-01-0025,View,2084,yes,no,no,Human,...,Prostate,JPG,,,,,,Public,2011-06-30,https://pathdb.cancerimagingarchive.net/caMicr...


Create function to add status of available images to clinical dataframes.

In [10]:
def add_image_availability(df, pathology_patients, radiology_patients):
  """Adds a new column 'available_images' to the DataFrame indicating image availability.

  Args:
    df: The DataFrame to modify.
    pathology_patients: A list of unique patient IDs from the pathology data.
    radiology_patients: A list of unique patient IDs from the radiology data.

  Returns:
    The modified DataFrame.
  """

  df['available_images'] = df['Case ID'].apply(lambda x: 'Both' if x in pathology_patients and x in radiology_patients 
                                                              else 'Pathology' if x in pathology_patients 
                                                              else 'Radiology' if x in radiology_patients 
                                                              else 'Neither')
  return df

# Create lists of rad/path patient IDs
pathology_patients = pathology['patient_id'].unique().tolist()
radiology_patients = radiology['PatientId'].unique().tolist()

# Get Biobank clinical data from Clinical Trials Data Commons

In [49]:
# Define the transport
transport = RequestsHTTPTransport(
    url="https://clinical.datacommons.cancer.gov/v1/graphql/",
    verify=True,
    retries=3,
)

# Create a client instance, disabling schema fetching
client = Client(transport=transport, fetch_schema_from_transport=False)

# Define the query and variables
query = gql("""
  query participantOverview($subject_id: [String], $ctep_disease_term: [String], $stage_of_disease: [String], $tumor_grade: [String], $sex: [String], $reported_gender: [String], $race: [String], $ethnicity: [String], $carcinogen_exposure: [String], $targeted_therapy: [String], $anatomical_collection_site: [String], $tissue_category: [String], $assessment_timepoint: [String], $data_file_type: [String], $data_file_format: [String], $first: Int, $offset: Int, $order_by: String, $sort_direction: String) {
    participantOverview(subject_id: $subject_id, ctep_disease_term: $ctep_disease_term, stage_of_disease: $stage_of_disease, tumor_grade: $tumor_grade, sex: $sex, reported_gender: $reported_gender, race: $race, ethnicity: $ethnicity, carcinogen_exposure: $carcinogen_exposure, targeted_therapy: $targeted_therapy, anatomical_collection_site: $anatomical_collection_site, tissue_category: $tissue_category, assessment_timepoint: $assessment_timepoint, data_file_type: $data_file_type, data_file_format: $data_file_format, first: $first, offset: $offset, order_by: $order_by, sort_direction: $sort_direction) {
      subject_id
      ctep_disease_term
      stage_of_disease
      tumor_grade
      age_at_enrollment
      sex
      reported_gender
      race
      ethnicity
      carcinogen_exposure
      targeted_therapy
      data_file_uuid
      __typename
    }
  }
""")

variables = {
    "first": 10000
}

# Execute the query
result = client.execute(query, variable_values=variables)

# Execute the query
result = client.execute(query, variable_values=variables)

# Extract the data from the result
data = result.get("participantOverview", [])

# Convert the data into a pandas DataFrame
biobank = pd.DataFrame(data)

biobank = biobank.rename(columns={'subject_id': 'Case ID'})

#biobank.to_excel('ctdc.xlsx')
biobank

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview
...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview


In [50]:
def add_collection_from_other_dfs(df, pathology, radiology):
  """Adds a 'Collection' column to the DataFrame based on matches in pathology and radiology DataFrames.

  Args:
    df: The input DataFrame.
    pathology: The pathology DataFrame.
    radiology: The radiology DataFrame.

  Returns:
    The modified DataFrame.
  """

  # Create a dictionary mapping patient IDs to collections from pathology
  pathology_map = pathology.set_index('patient_id')['collection'].to_dict()

  # Create a dictionary mapping patient IDs to collections from radiology
  radiology_map = radiology.set_index('PatientId')['Collection'].to_dict()

  # Add the 'Collection' column to the biobank DataFrame
  df['Project Short Name'] = df['Case ID'].map(pathology_map).fillna(df['Case ID'].map(radiology_map))

  return df

# Add the 'Collection' column to the biobank DataFrame
biobank_with_project_name = add_collection_from_other_dfs(biobank, pathology, radiology)

biobank_with_project_name

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename,Project Short Name
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview,CMB-MML
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview,CMB-MML
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview,CMB-MEL
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview,CMB-CRC
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview,CMB-MEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview,CMB-LCA
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview,CMB-CRC


In [21]:
value_counts = biobank_with_project_name['Project Short Name'].value_counts(dropna=False)

print(value_counts)

Project Short Name
CMB-MML    64
CMB-LCA    61
CMB-CRC    49
CMB-MEL    43
CMB-PCA    12
CMB-AML     8
CMB-GEC     7
NaN         4
Name: count, dtype: int64


In [22]:
null_values = biobank_with_project_name[biobank_with_project_name['Project Short Name'].isnull()]['subject_id'].tolist()

null_values

['MSB-01363', 'MSB-02746', 'MSB-03272', 'MSB-07370']

In [23]:
# rename subject_id to Case ID
biobank_with_project_name = biobank_with_project_name.rename(columns={'subject_id': 'Case ID'})

# Add the 'available_images' column to the 'pathology' DataFrame
biobank_with_project_and_image_availability = add_image_availability(biobank_with_project_name, pathology_patients, radiology_patients)

biobank_with_project_and_image_availability.to_excel('ctdc-biobank.xlsx')

biobank_with_project_and_image_availability

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename,Project Short Name,available_images
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview,CMB-MML,Pathology
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview,CMB-MML,Both
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview,CMB-MEL,Pathology
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview,CMB-CRC,Pathology
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview,CMB-MEL,Both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC,Both
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC,Pathology
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview,CMB-LCA,Both
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview,CMB-CRC,Both


# in https://tcia-clinical-validator.streamlit.app/

* Create a new dataframe called 'master_clinical' from 'biobank_with_project_and_image_availability' dataframe
* Include and rename 'ctep_disease_term' -> 'Primary Diagnosis' column. 
* Include and rename 'age_at_enrollment' -> 'Age at Enrollment' column.
* Include 'Race' and 'Ethnicity' columns, but capitalize them (they're called 'race' and 'ethnicity' in the biobank dataframe).  
* Include and rename 'sex' -> 'Sex at Birth' column.
* Create 'Age UOM' column and populate all rows with 'Year' as the value
* Convert the word 'or' in all values that are found in the Race and Ethnicity columns to be lower case
* Include 'Project Short Name' and 'available_images' 


In [62]:
biobank_clinical = biobank_with_project_and_image_availability[['Project Short Name', 'Case ID', 'available_images', 'ctep_disease_term', 'race', 'ethnicity', 'sex', 'age_at_enrollment']].copy()

# Rename columns
biobank_clinical.rename(columns={'ctep_disease_term': 'Primary Diagnosis',
                               'age_at_enrollment': 'Age at Enrollment',
                               'sex': 'Sex at Birth',
                               'race': 'Race',
                               'ethnicity':'Ethnicity'}, inplace=True)


# Convert 'or' to lowercase in Race and Ethnicity
biobank_clinical['Race'] = master_clinical['Race'].str.replace('Or', 'or', case=False)
biobank_clinical['Ethnicity'] = master_clinical['Ethnicity'].str.replace('Or', 'or', case=False)

# Create 'Age UOM' column
biobank_clinical['Age UOM'] = 'Year'

In [63]:
biobank_clinical

Unnamed: 0,Project Short Name,Case ID,available_images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year
...,...,...,...,...,...,...,...,...,...
243,CMB-CRC,MSB-09886,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,67.0,Year
244,CMB-CRC,MSB-09897,Pathology,Colorectal Carcinoma,White,Hispanic or Latino,Male,59.0,Year
245,CMB-LCA,MSB-09977,Both,Small Cell Lung Carcinoma,White,Not Hispanic or Latino,Male,79.0,Year
246,CMB-CRC,MSB-09991,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,66.0,Year


# Get CPTAC and TCGA clinical data from GDC

In [37]:
gdc_collection_list = nci_programs['collection_short_title'].tolist()

gdc_collection_list = [item for item in gdc_collection_list if 'cptac' not in item.lower()]
gdc_collection_list.extend(['CPTAC-2', 'CPTAC-3'])

In [40]:
cases_endpt = 'https://api.gdc.cancer.gov/cases'

filters = {
    "op": "in",
    "content":{
        "field": "project.project_id",
        "value": gdc_collection_list
        }
    }

fields = [
    "project.project_id",
    "submitter_id",
    ]

fields = ','.join(fields)

expand = [ ## For the allowable values for this list, look under "mapping" at https://api.gdc.cancer.gov/cases/_mapping
    "demographic",
    "diagnoses"
    #"diagnoses.treatments",
    #"exposures",
    #"family_histories"
    ]

expand = ','.join(expand)

params = {
    "filters": json.dumps(filters),
    "expand": expand,
    "fields": fields,
    "format": "TSV", ## This can be "JSON" too
    "size": "20000", ## If you are re-using this for other projects, you may need to modify this and the "from" number.
    "from":"0"
    }

response = requests.get(cases_endpt, params = params)

output = response.content.decode('UTF-8')
clinicalDf = pd.read_csv(io.StringIO(output), sep='\t')

#clinicalDf.to_excel('gdc.xlsx')

  clinicalDf = pd.read_csv(io.StringIO(output), sep='\t')


In [41]:
# rename subject_id to Case ID
gdc = clinicalDf.rename(columns={'submitter_id': 'Case ID'})

# Add the 'available_images' column to the 'pathology' DataFrame
gdc_with_project_and_image_availability = add_image_availability(gdc, pathology_patients, radiology_patients)

#gdc_with_project_and_image_availability.to_excel('gdc.xlsx')

gdc_with_project_and_image_availability

Unnamed: 0,demographic.age_at_index,demographic.age_is_obfuscated,demographic.cause_of_death,demographic.cause_of_death_source,demographic.country_of_residence_at_enrollment,demographic.created_datetime,demographic.days_to_birth,demographic.days_to_death,demographic.demographic_id,demographic.ethnicity,...,diagnoses.0.tumor_grade,diagnoses.0.tumor_regression_grade,diagnoses.0.updated_datetime,diagnoses.0.weiss_assessment_score,diagnoses.0.wilms_tumor_histologic_subtype,diagnoses.0.year_of_diagnosis,id,project.project_id,Case ID,available_images
0,65.0,,,,,,-24064.0,1484.0,4d9e60db-29bf-55a5-b771-45b205a0536b,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2007.0,85a85a11-7200-4e96-97af-6ba26d680d59,TCGA-OV,TCGA-13-0920,Radiology
1,49.0,,,,,,-17961.0,1447.0,bf1408e4-38fe-591d-b84c-033112b7d068,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2009.0,7922df77-f09a-488c-a1be-58646ceb9b3e,TCGA-OV,TCGA-42-2582,Neither
2,80.0,,,,,,-29501.0,563.0,6a95eab4-efb0-5637-8480-f4a864f3c478,not reported,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2006.0,8727855e-120a-4216-a803-8cc6cd1159be,TCGA-OV,TCGA-04-1342,Neither
3,59.0,,,,,,-21635.0,2025.0,55b47e7c-07df-5c84-a4c5-11752b4a80e1,not reported,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2004.0,82e96c6c-a88c-4e52-be56-7f24f6c7b835,TCGA-OV,TCGA-13-1819,Neither
4,60.0,,,,,,-21963.0,55.0,e6acce8e-ae80-5f1b-846a-24a6d203fe5a,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2005.0,ab3dbbbe-eed6-4a35-a505-1815225e86c9,TCGA-OV,TCGA-04-1335,Neither
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11264,53.0,,,,,,-19547.0,,eb111509-466e-5511-a767-c881fb9375e6,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2009.0,a798a8cc-4e72-4a8c-9e20-74a14deafd12,TCGA-UCEC,TCGA-BG-A0MS,Neither
11265,80.0,,,,,,-29260.0,,4bed5f1e-e85b-5763-bb8b-ea0029cf902d,hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2011.0,fe2e89f7-8f4d-420a-a551-4877cf0fd1d3,TCGA-UCEC,TCGA-EO-A3KX,Neither
11266,83.0,,,,,,-30517.0,456.0,76d2f9cb-ddcb-59c4-9316-1df7c9f3062d,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2008.0,db3a4986-55d5-4ecc-be73-59725dce3c33,TCGA-UCEC,TCGA-EY-A1G8,Neither
11267,73.0,,,,,,-26992.0,,16d35689-c5a4-50b8-b9ee-af408c742373,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2013.0,ff6b5fc8-0572-4b58-b3a5-bcda41badbc8,TCGA-UCEC,TCGA-PG-A914,Neither


In [42]:
gdc_with_project_and_image_availability = gdc_with_project_and_image_availability[gdc_with_project_and_image_availability['available_images'] != 'Neither']

#gdc_with_project_and_image_availability.to_excel('gdc-clinical.xlsx')

# in https://tcia-clinical-validator.streamlit.app/
Please create a copy of 'gdc_with_project_and_image_availability' dataframe called 'gdc_tmp' that only contains:
* demographic.ethnicity -> but change header to be Ethnicity
* demographic.gender -> rename to Sex at Birth
* demographic.race -> rename to Race
* diagnoses.0.age_at_diagnosis -> rename to Age at Diagnosis
* diagnoses.0.primary_diagnosis -> Primary Diagnosis
* diagnoses.0.tissue_or_organ_of_origin -> Tissue or Organ of Origin
* project.project_id -> Project Short Name
* create an 'Age UOM' column and populate all values with 'Day'
* Convert the word 'or' in all values that are found in the Tissue or Organ of Origin, Race, Ethnicity columns to be lower case
* set 'Nan' and 'nan' values to be null/NaN values (not strings that say 'Nan' or 'nan')

Side note: there are 40 diagnosis values that don't fall into the GDC CDE?

In [59]:
gdc_tmp = gdc_with_project_and_image_availability[['project.project_id', 'Case ID', 'available_images', 'demographic.ethnicity', 'demographic.gender', 'demographic.race', 'diagnoses.0.primary_diagnosis', 'diagnoses.0.tissue_or_organ_of_origin', 'diagnoses.0.age_at_diagnosis']].copy()

# Rename columns
gdc_tmp.rename(columns={'demographic.ethnicity': 'Ethnicity',
                         'demographic.gender': 'Sex at Birth',
                         'demographic.race': 'Race',
                         'diagnoses.0.age_at_diagnosis': 'Age at Diagnosis',
                         'diagnoses.0.primary_diagnosis': 'Primary Diagnosis',
                         'diagnoses.0.tissue_or_organ_of_origin': 'Tissue or Organ of Origin',
                         'project.project_id': 'Project Short Name'}, inplace=True)

# Convert 'or' to lowercase in specified columns
gdc_tmp['Tissue or Organ of Origin'] = gdc_tmp['Tissue or Organ of Origin'].str.replace('Or', 'or', case=False)
gdc_tmp['Race'] = gdc_tmp['Race'].str.replace('Or', 'or', case=False)
gdc_tmp['Ethnicity'] = gdc_tmp['Ethnicity'].str.replace('Or', 'or', case=False)

# Create 'Age UOM' column and populate with 'Day'
gdc_tmp['Age UOM'] = 'Day'

# Convert 'Nan' and 'nan' to NaN
gdc_tmp = gdc_tmp.replace(['Nan', 'nan'], pd.NA)

# Add the 'Collection' column to the DataFrame
gdc_tmp = add_collection_from_other_dfs(gdc_tmp, pathology, radiology)

In [53]:
# Add the 'Collection' column to the biobank DataFrame
master_clinical = add_collection_from_other_dfs(master_clinical, pathology, radiology)

master_clinical

Unnamed: 0,Project Short Name,Case ID,available_images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM,Tissue or Organ of Origin,Age at Diagnosis
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year,,
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year,,
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year,,
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year,,
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year,,
...,...,...,...,...,...,...,...,...,...,...,...
2077,TCGA-UCEC,TCGA-D1-A15Z,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,26638.0
2078,TCGA-UCEC,TCGA-D1-A175,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,17802.0
2079,TCGA-UCEC,TCGA-D1-A15V,Radiology,"Serous cystadenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,24886.0
2080,TCGA-UCEC,TCGA-FI-A2CX,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,30224.0


In [60]:
gdc_tmp

Unnamed: 0,Project Short Name,Case ID,available_images,Ethnicity,Sex at Birth,Race,Primary Diagnosis,Tissue or Organ of Origin,Age at Diagnosis,Age UOM
0,TCGA-OV,TCGA-13-0920,Radiology,not hispanic or latino,female,white,"Serous cystadenocarcinoma, NOS",Ovary,24064.0,Day
9,TCGA-OV,TCGA-13-1408,Radiology,not hispanic or latino,female,white,"Serous cystadenocarcinoma, NOS",Ovary,21635.0,Day
15,TCGA-OV,TCGA-09-1675,Radiology,not hispanic or latino,female,not reported,"Serous cystadenocarcinoma, NOS",Ovary,18596.0,Day
30,TCGA-OV,TCGA-10-0937,Radiology,not reported,female,white,"Serous cystadenocarcinoma, NOS",Ovary,16222.0,Day
32,TCGA-OV,TCGA-13-1488,Radiology,not hispanic or latino,female,white,"Serous cystadenocarcinoma, NOS",Ovary,21726.0,Day
...,...,...,...,...,...,...,...,...,...,...
11249,TCGA-UCEC,TCGA-D1-A15Z,Radiology,not hispanic or latino,female,white,"Endometrioid adenocarcinoma, NOS",Endometrium,26638.0,Day
11252,TCGA-UCEC,TCGA-D1-A175,Radiology,not hispanic or latino,female,white,"Endometrioid adenocarcinoma, NOS",Endometrium,17802.0,Day
11256,TCGA-UCEC,TCGA-D1-A15V,Radiology,not hispanic or latino,female,white,"Serous cystadenocarcinoma, NOS",Endometrium,24886.0,Day
11261,TCGA-UCEC,TCGA-FI-A2CX,Radiology,not hispanic or latino,female,white,"Endometrioid adenocarcinoma, NOS",Endometrium,30224.0,Day


In [64]:
master_clinical = pd.concat([biobank_clinical, gdc_tmp], ignore_index=True)

In [65]:
master_clinical

Unnamed: 0,Project Short Name,Case ID,available_images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM,Tissue or Organ of Origin,Age at Diagnosis
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year,,
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year,,
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year,,
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year,,
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year,,
...,...,...,...,...,...,...,...,...,...,...,...
3688,TCGA-UCEC,TCGA-D1-A15Z,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,26638.0
3689,TCGA-UCEC,TCGA-D1-A175,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,17802.0
3690,TCGA-UCEC,TCGA-D1-A15V,Radiology,"Serous cystadenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,24886.0
3691,TCGA-UCEC,TCGA-FI-A2CX,Radiology,"Endometrioid adenocarcinoma, NOS",white,not hispanic or latino,female,,Day,Endometrium,30224.0


In [68]:
nbia.searchDf('cmb', dataframe=master_clinical)

Unnamed: 0,Project Short Name,Case ID,available_images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM,Tissue or Organ of Origin,Age at Diagnosis
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year,,
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year,,
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year,,
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year,,
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year,,
...,...,...,...,...,...,...,...,...,...,...,...
243,CMB-CRC,MSB-09886,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,67.0,Year,,
244,CMB-CRC,MSB-09897,Pathology,Colorectal Carcinoma,White,Hispanic or Latino,Male,59.0,Year,,
245,CMB-LCA,MSB-09977,Both,Small Cell Lung Carcinoma,White,Not Hispanic or Latino,Male,79.0,Year,,
246,CMB-CRC,MSB-09991,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,66.0,Year,,


In [69]:
master_clinical.to_excel('nci_program_clinical.xlsx')