# Summary

This notebook outlines the steps used to extract clinical data from the NCI Clinical Trial Data Commons and Genomic Data commons related to the Biobank, CPTAC and TCGA projects which have imaging data in The Cancer Imaging Archive (TCIA).  The resulting data will be merged with TCIA "Community" proposals that have clinical data to allow cohort building across as many TCIA datasets as possible.

# Setup

In [None]:
# for CTDC
!pip install gql

# for TCIA
!pip install tcia_utils

In [2]:
import pandas as pd

# for TCIA
from tcia_utils import nbia
from tcia_utils import wordpress

# for GDC
import json
import requests
import io

# for CTDC
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# Get inventory of Radiology and Pathology data from TCIA

Get list of subjects that have radiology images.

In [3]:
nbia.getToken()

Enter User: 


 kirbyju
Enter Password:  ········


200

In [4]:
radiology = nbia.getPatient(format = 'df')

Get list of subjects that have pathology images.

In [6]:
# Define the URL of the CSV file feeding Eaglescope
url = "https://pathdb.cancerimagingarchive.net/system/files/collectionmetadata/202401/cohort_builder_v1_01-16-2024.csv"

try:
  # Download and read the CSV data
  pathology = pd.read_csv(url)

  # Drop unwanted columns
 # df = df[['collection', 'collection_doi', 'patient_id', 'slide_id']]

  # Create a new column with the base URL and slide ID
  pathology['viewer_url'] = "https://pathdb.cancerimagingarchive.net/caMicroscope/apps/mini/viewer.html?mode=pathdb&slideid=" + pathology['slide_id'].astype(str)

  # Prefix collection_doi with https://doi.org/ and make it lowercase
  pathology['collection_doi'] = "https://doi.org/" + pathology['collection_doi'].astype(str).str.lower()

  print("Successfully processed data into DataFrame.")
except Exception as e:
  print(f"An error occurred: {e}")

Successfully processed data into DataFrame.


  pathology = pd.read_csv(url)


Create function to add status of available images to clinical dataframes and generate variables for rad/path patient lists.

In [8]:
def add_image_availability(df, pathology_patients, radiology_patients):
  """Adds a new column 'available_images' to the DataFrame indicating image availability.

  Args:
    df: The DataFrame to modify.
    pathology_patients: A list of unique patient IDs from the pathology data.
    radiology_patients: A list of unique patient IDs from the radiology data.

  Returns:
    The modified DataFrame.
  """

  df['Available Images'] = df['Case ID'].apply(lambda x: 'Both' if x in pathology_patients and x in radiology_patients 
                                                              else 'Pathology' if x in pathology_patients 
                                                              else 'Radiology' if x in radiology_patients 
                                                              else 'Neither')
  return df

# Create lists of rad/path patient IDs
pathology_patients = pathology['patient_id'].unique().tolist()
radiology_patients = radiology['PatientId'].unique().tolist()

# Get Biobank clinical data from Clinical Trials Data Commons

In [9]:
# Define the transport
transport = RequestsHTTPTransport(
    url="https://clinical.datacommons.cancer.gov/v1/graphql/",
    verify=True,
    retries=3,
)

# Create a client instance, disabling schema fetching
client = Client(transport=transport, fetch_schema_from_transport=False)

# Define the query and variables
query = gql("""
  query participantOverview($subject_id: [String], $ctep_disease_term: [String], $stage_of_disease: [String], $tumor_grade: [String], $sex: [String], $reported_gender: [String], $race: [String], $ethnicity: [String], $carcinogen_exposure: [String], $targeted_therapy: [String], $anatomical_collection_site: [String], $tissue_category: [String], $assessment_timepoint: [String], $data_file_type: [String], $data_file_format: [String], $first: Int, $offset: Int, $order_by: String, $sort_direction: String) {
    participantOverview(subject_id: $subject_id, ctep_disease_term: $ctep_disease_term, stage_of_disease: $stage_of_disease, tumor_grade: $tumor_grade, sex: $sex, reported_gender: $reported_gender, race: $race, ethnicity: $ethnicity, carcinogen_exposure: $carcinogen_exposure, targeted_therapy: $targeted_therapy, anatomical_collection_site: $anatomical_collection_site, tissue_category: $tissue_category, assessment_timepoint: $assessment_timepoint, data_file_type: $data_file_type, data_file_format: $data_file_format, first: $first, offset: $offset, order_by: $order_by, sort_direction: $sort_direction) {
      subject_id
      ctep_disease_term
      stage_of_disease
      tumor_grade
      age_at_enrollment
      sex
      reported_gender
      race
      ethnicity
      carcinogen_exposure
      targeted_therapy
      data_file_uuid
      __typename
    }
  }
""")

variables = {
    "first": 10000
}

# Execute the query
result = client.execute(query, variable_values=variables)

# Execute the query
result = client.execute(query, variable_values=variables)

# Extract the data from the result
data = result.get("participantOverview", [])

# Convert the data into a pandas DataFrame
biobank = pd.DataFrame(data)

biobank = biobank.rename(columns={'subject_id': 'Case ID'})

#biobank.to_excel('ctdc.xlsx')
biobank

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview
...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview


In [10]:
def add_collection_from_other_dfs(df, pathology, radiology):
  """Adds a 'Collection' column to the DataFrame based on matches in pathology and radiology DataFrames.

  Args:
    df: The input DataFrame.
    pathology: The pathology DataFrame.
    radiology: The radiology DataFrame.

  Returns:
    The modified DataFrame.
  """

  # Create a dictionary mapping patient IDs to collections from pathology
  pathology_map = pathology.set_index('patient_id')['collection'].to_dict()

  # Create a dictionary mapping patient IDs to collections from radiology
  radiology_map = radiology.set_index('PatientId')['Collection'].to_dict()

  # Add the 'Collection' column to the biobank DataFrame
  df['Project Short Name'] = df['Case ID'].map(pathology_map).fillna(df['Case ID'].map(radiology_map))

  return df



In [11]:
# Add the 'Project Short Name' column to the biobank DataFrame
biobank_with_project_name = add_collection_from_other_dfs(biobank, pathology, radiology)

biobank_with_project_name

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename,Project Short Name
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview,CMB-MML
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview,CMB-MML
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview,CMB-MEL
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview,CMB-CRC
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview,CMB-MEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview,CMB-LCA
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview,CMB-CRC


In [13]:
# Add the 'available_images' column 
biobank_with_project_and_image_availability = add_image_availability(biobank_with_project_name, pathology_patients, radiology_patients)

biobank_with_project_and_image_availability

Unnamed: 0,Case ID,ctep_disease_term,stage_of_disease,tumor_grade,age_at_enrollment,sex,reported_gender,race,ethnicity,carcinogen_exposure,targeted_therapy,data_file_uuid,__typename,Project Short Name,Available Images
0,MSB-00089,Plasma Cell Myeloma,,,67.0,Female,,Black or African American,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]",[],ParticipantOverview,CMB-MML,Pathology
1,MSB-00140,Plasma Cell Myeloma,,,65.0,Female,,White,Not Hispanic or Latino,No,"[Bortezomib, Lenalidomide]","[dg.4DFC/CC00EEF6-1730-40B3-9FA6-3D8FCD39EB83,...",ParticipantOverview,CMB-MML,Both
2,MSB-00205,Melanoma,,Grade cannot be assessed,63.0,Male,Male,White,Not Hispanic or Latino,No,[],[],ParticipantOverview,CMB-MEL,Pathology
3,MSB-00241,Colorectal Carcinoma,,,59.0,Female,,White,Not Hispanic or Latino,Unknown,[],[dg.4DFC/7E4EE0AD-DFCC-4AEA-BA47-9C94E6A6E116],ParticipantOverview,CMB-CRC,Pathology
4,MSB-00263,Melanoma,,Grade cannot be assessed,56.0,Male,Male,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/640C5DFC-B8A1-44AD-B950-467ABC123F45,...",ParticipantOverview,CMB-MEL,Both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,MSB-09886,Colorectal Carcinoma,Stage IVB,Moderately Differentiated,67.0,Male,,White,Not Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC,Both
244,MSB-09897,Colorectal Carcinoma,Stage IVA,Moderately Differentiated,59.0,Male,,White,Hispanic or Latino,Unknown,[Bevacizumab],[],ParticipantOverview,CMB-CRC,Pathology
245,MSB-09977,Small Cell Lung Carcinoma,Stage IV,Grade cannot be assessed,79.0,Male,,White,Not Hispanic or Latino,Unknown,[],"[dg.4DFC/A94E8653-C843-4D2F-9860-67E1BAFAD3F0,...",ParticipantOverview,CMB-LCA,Both
246,MSB-09991,Colorectal Carcinoma,Stage IV,Moderately Differentiated,66.0,Male,,White,Not Hispanic or Latino,No,"[Bevacizumab, Cetuximab]","[dg.4DFC/D69D7DA7-D19F-4A6A-9523-3026F6B03A22,...",ParticipantOverview,CMB-CRC,Both


# apply data standardization steps to align with output from https://tcia-clinical-validator.streamlit.app/

* Create a new dataframe called 'master_clinical' from 'biobank_with_project_and_image_availability' dataframe
* Include and rename 'ctep_disease_term' -> 'Primary Diagnosis' column. 
* Include and rename 'age_at_enrollment' -> 'Age at Enrollment' column.
* Include 'Race' and 'Ethnicity' columns, but capitalize them (they're called 'race' and 'ethnicity' in the biobank dataframe).  
* Include and rename 'sex' -> 'Sex at Birth' column.
* Create 'Age UOM' column and populate all rows with 'Year' as the value
* Convert the word 'or' in all values that are found in the Race and Ethnicity columns to be lower case
* Include 'Project Short Name' and 'available_images' 


In [12]:
# check for Case ID values that we didn't find a corresponding imaging Project Short Name
null_values = biobank_with_project_name[biobank_with_project_name['Project Short Name'].isnull()]['Case ID'].tolist()

null_values

['MSB-01363', 'MSB-02746', 'MSB-03272', 'MSB-07370']

In [56]:
# Extract unique values from the specified columns to ensure data formatting
unique_race = biobank_clinical['Race'].unique().tolist()
unique_ethnicity = biobank_clinical['Ethnicity'].unique().tolist()
unique_sex_at_birth = biobank_clinical['Sex at Birth'].unique().tolist()

print("Unique Race values:", unique_race)
print("Unique Ethnicity values:", unique_ethnicity)
print("Unique Sex at Birth values:", unique_sex_at_birth)

Unique Race values: ['Black or African American', 'White', 'Not Reported', 'Asian', 'Unknown', 'American Indian or Alaska Native', 'Native Hawaiian or other Pacific Islander']
Unique Ethnicity values: ['Not Hispanic or Latino', 'Hispanic or Latino', 'Not Reported', 'Unknown']
Unique Sex at Birth values: ['Female', 'Male']


In [15]:
biobank_clinical = biobank_with_project_and_image_availability[['Project Short Name', 'Case ID', 'Available Images', 'ctep_disease_term', 'race', 'ethnicity', 'sex', 'age_at_enrollment']].copy()

# drop rows where 'subject_id' is null (not clear why 4 subjects show up in CTDC but not in TCIA)
biobank_clinical = biobank_clinical.dropna(subset=['Project Short Name'])

# Rename columns
biobank_clinical.rename(columns={'ctep_disease_term': 'Primary Diagnosis',
                               'age_at_enrollment': 'Age at Enrollment',
                               'sex': 'Sex at Birth',
                               'race': 'Race',
                               'ethnicity':'Ethnicity'}, inplace=True)


# Convert 'or' to lowercase in Race and Ethnicity
biobank_clinical['Race'] = biobank_clinical['Race'].str.replace('Or', 'or', case=False)
biobank_clinical['Ethnicity'] = biobank_clinical['Ethnicity'].str.replace('Or', 'or', case=False)

# Create 'Age UOM' column
biobank_clinical['Age UOM'] = 'Year'

In [103]:
biobank_clinical

Unnamed: 0,Project Short Name,Case ID,Available Images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year
...,...,...,...,...,...,...,...,...,...
243,CMB-CRC,MSB-09886,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,67.0,Year
244,CMB-CRC,MSB-09897,Pathology,Colorectal Carcinoma,White,Hispanic or Latino,Male,59.0,Year
245,CMB-LCA,MSB-09977,Both,Small Cell Lung Carcinoma,White,Not Hispanic or Latino,Male,79.0,Year
246,CMB-CRC,MSB-09991,Both,Colorectal Carcinoma,White,Not Hispanic or Latino,Male,66.0,Year


In [16]:
# Double check for subjects without imaging using 'Available Images' (i.e. 'Neither')
neither_images = biobank_clinical[biobank_clinical['Available Images'] == 'Neither']

# Display the filtered rows
print(neither_images)


Empty DataFrame
Columns: [Project Short Name, Case ID, Available Images, Primary Diagnosis, Race, Ethnicity, Sex at Birth, Age at Enrollment, Age UOM]
Index: []


# Get CPTAC and TCGA clinical data from GDC

In [23]:
# Check for collections related to TCGA radiology images
# Note: TCIA doesn't host any pathology for TCGA
gdc_collection_list = radiology[radiology['Collection'].str.contains('tcga', case=False, na=False)]['Collection'].unique().tolist()

# add CPTAC datasets (Note: these are referred to as programs in GDC rather than being specific to cancer type like TCIA. We'll resolve this later.)
gdc_collection_list.extend(['CPTAC-2', 'CPTAC-3'])

print(len(gdc_collection_list))
print(gdc_collection_list)

23
['TCGA-GBM', 'TCGA-BRCA', 'TCGA-LGG', 'TCGA-KIRC', 'TCGA-LUAD', 'TCGA-PRAD', 'TCGA-KIRP', 'TCGA-OV', 'TCGA-LIHC', 'TCGA-HNSC', 'TCGA-KICH', 'TCGA-BLCA', 'TCGA-LUSC', 'TCGA-COAD', 'TCGA-THCA', 'TCGA-READ', 'TCGA-UCEC', 'TCGA-ESCA', 'TCGA-STAD', 'TCGA-CESC', 'TCGA-SARC', 'CPTAC-2', 'CPTAC-3']


In [24]:
cases_endpt = 'https://api.gdc.cancer.gov/cases'

filters = {
    "op": "in",
    "content":{
        "field": "project.project_id",
        "value": gdc_collection_list
        }
    }

fields = [
    "project.project_id",
    "submitter_id",
    ]

fields = ','.join(fields)

expand = [ ## For the allowable values for this list, look under "mapping" at https://api.gdc.cancer.gov/cases/_mapping
    "demographic",
    "diagnoses"
    #"diagnoses.treatments",
    #"exposures",
    #"family_histories"
    ]

expand = ','.join(expand)

params = {
    "filters": json.dumps(filters),
    "expand": expand,
    "fields": fields,
    "format": "TSV", ## This can be "JSON" too
    "size": "20000", ## If you are re-using this for other projects, you may need to modify this and the "from" number.
    "from":"0"
    }

response = requests.get(cases_endpt, params = params)

output = response.content.decode('UTF-8')
clinicalDf = pd.read_csv(io.StringIO(output), sep='\t')

#clinicalDf.to_excel('gdc.xlsx')

  clinicalDf = pd.read_csv(io.StringIO(output), sep='\t')


In [25]:
# rename subject_id to Case ID
gdc = clinicalDf.rename(columns={'submitter_id': 'Case ID'})

# Add the 'available_images' column to the 'pathology' DataFrame
gdc_with_project_and_image_availability = add_image_availability(gdc, pathology_patients, radiology_patients)

#gdc_with_project_and_image_availability.to_excel('gdc.xlsx')

gdc_with_project_and_image_availability

Unnamed: 0,demographic.age_at_index,demographic.age_is_obfuscated,demographic.cause_of_death,demographic.cause_of_death_source,demographic.country_of_residence_at_enrollment,demographic.created_datetime,demographic.days_to_birth,demographic.days_to_death,demographic.demographic_id,demographic.ethnicity,...,diagnoses.0.tumor_grade,diagnoses.0.tumor_regression_grade,diagnoses.0.updated_datetime,diagnoses.0.weiss_assessment_score,diagnoses.0.wilms_tumor_histologic_subtype,diagnoses.0.year_of_diagnosis,id,project.project_id,Case ID,Available Images
0,65.0,,,,,,-24064.0,1484.0,4d9e60db-29bf-55a5-b771-45b205a0536b,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2007.0,85a85a11-7200-4e96-97af-6ba26d680d59,TCGA-OV,TCGA-13-0920,Radiology
1,49.0,,,,,,-17961.0,1447.0,bf1408e4-38fe-591d-b84c-033112b7d068,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2009.0,7922df77-f09a-488c-a1be-58646ceb9b3e,TCGA-OV,TCGA-42-2582,Neither
2,80.0,,,,,,-29501.0,563.0,6a95eab4-efb0-5637-8480-f4a864f3c478,not reported,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2006.0,8727855e-120a-4216-a803-8cc6cd1159be,TCGA-OV,TCGA-04-1342,Neither
3,59.0,,,,,,-21635.0,2025.0,55b47e7c-07df-5c84-a4c5-11752b4a80e1,not reported,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2004.0,82e96c6c-a88c-4e52-be56-7f24f6c7b835,TCGA-OV,TCGA-13-1819,Neither
4,60.0,,,,,,-21963.0,55.0,e6acce8e-ae80-5f1b-846a-24a6d203fe5a,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2005.0,ab3dbbbe-eed6-4a35-a505-1815225e86c9,TCGA-OV,TCGA-04-1335,Neither
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11264,53.0,,,,,,-19547.0,,eb111509-466e-5511-a767-c881fb9375e6,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2009.0,a798a8cc-4e72-4a8c-9e20-74a14deafd12,TCGA-UCEC,TCGA-BG-A0MS,Neither
11265,80.0,,,,,,-29260.0,,4bed5f1e-e85b-5763-bb8b-ea0029cf902d,hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2011.0,fe2e89f7-8f4d-420a-a551-4877cf0fd1d3,TCGA-UCEC,TCGA-EO-A3KX,Neither
11266,83.0,,,,,,-30517.0,456.0,76d2f9cb-ddcb-59c4-9316-1df7c9f3062d,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2008.0,db3a4986-55d5-4ecc-be73-59725dce3c33,TCGA-UCEC,TCGA-EY-A1G8,Neither
11267,73.0,,,,,,-26992.0,,16d35689-c5a4-50b8-b9ee-af408c742373,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2013.0,ff6b5fc8-0572-4b58-b3a5-bcda41badbc8,TCGA-UCEC,TCGA-PG-A914,Neither


In [28]:
gdc_with_project_and_image_availability = gdc_with_project_and_image_availability[gdc_with_project_and_image_availability['Available Images'] != 'Neither']

#gdc_with_project_and_image_availability.to_excel('gdc-clinical.xlsx')

gdc_with_project_and_image_availability

Unnamed: 0,demographic.age_at_index,demographic.age_is_obfuscated,demographic.cause_of_death,demographic.cause_of_death_source,demographic.country_of_residence_at_enrollment,demographic.created_datetime,demographic.days_to_birth,demographic.days_to_death,demographic.demographic_id,demographic.ethnicity,...,diagnoses.0.tumor_grade,diagnoses.0.tumor_regression_grade,diagnoses.0.updated_datetime,diagnoses.0.weiss_assessment_score,diagnoses.0.wilms_tumor_histologic_subtype,diagnoses.0.year_of_diagnosis,id,project.project_id,Case ID,Available Images
0,65.0,,,,,,-24064.0,1484.0,4d9e60db-29bf-55a5-b771-45b205a0536b,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2007.0,85a85a11-7200-4e96-97af-6ba26d680d59,TCGA-OV,TCGA-13-0920,Radiology
9,59.0,,,,,,-21635.0,1680.0,45f8a427-8d1d-5aae-b1f3-7f698c460b17,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2008.0,8fac8e40-beac-4059-9d54-7ee530598cfd,TCGA-OV,TCGA-13-1408,Radiology
15,50.0,,,,,,-18596.0,,c46f9687-5a61-5a44-b927-5cdc22391434,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2001.0,0fc8777c-8f12-472f-a0ea-085139b35d1f,TCGA-OV,TCGA-09-1675,Radiology
30,44.0,,,,,,-16222.0,608.0,ec4793f1-4af3-5052-bc55-d865c8d8a000,not reported,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2005.0,a5030259-cf9c-4a58-8710-b9da8ee59320,TCGA-OV,TCGA-10-0937,Radiology
32,59.0,,,,,,-21726.0,2154.0,b6bd947a-dae0-5ef3-905e-02630b777a39,not hispanic or latino,...,Not Reported,,2023-10-06T12:23:52.066938-05:00,,,2002.0,a85f6f9c-1e1d-44fc-85eb-3b2d96cfbc61,TCGA-OV,TCGA-13-1488,Radiology
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11249,72.0,,,,,,-26638.0,58.0,69387b62-2c68-5b85-a959-7200561ef196,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2008.0,c5df9142-27f6-4d97-933e-c79ea9199362,TCGA-UCEC,TCGA-D1-A15Z,Radiology
11252,48.0,,,,,,-17802.0,,5adae4ab-9d4c-5380-ba44-2fc49810993d,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2009.0,f0b3ab10-43f8-48a8-b460-66ceaaf637db,TCGA-UCEC,TCGA-D1-A175,Radiology
11256,68.0,,,,,,-24886.0,,5e0b5d34-f9df-502e-94e6-7bfeeb92a539,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2008.0,f7010b87-a559-4501-99f9-0fcca1e986e1,TCGA-UCEC,TCGA-D1-A15V,Radiology
11261,82.0,,,,,,-30224.0,,8bd8505e-e3b3-5167-adc2-d6abce781dc6,not hispanic or latino,...,Not Reported,,2023-10-06T12:22:33.388084-05:00,,,2008.0,dffc7091-bc27-452c-ad94-1ee214107593,TCGA-UCEC,TCGA-FI-A2CX,Radiology


# in order to align with output from https://tcia-clinical-validator.streamlit.app/
Please create a copy of 'gdc_with_project_and_image_availability' dataframe called 'gdc_tmp' that only contains:
* demographic.ethnicity -> but change header to be Ethnicity
* demographic.gender -> rename to Sex at Birth
* demographic.race -> rename to Race
* diagnoses.0.age_at_diagnosis -> rename to Age at Diagnosis
* diagnoses.0.primary_diagnosis -> Primary Diagnosis
* diagnoses.0.tissue_or_organ_of_origin -> Tissue or Organ of Origin
* project.project_id -> Project Short Name
* create an 'Age UOM' column and populate all values with 'Day'
* Convert the word 'or' in all values that are found in the Tissue or Organ of Origin, Race, Ethnicity columns to be lower case
* set 'Nan' and 'nan' values to be null/NaN values (not strings that say 'Nan' or 'nan')

Side note: there are 40 diagnosis values that don't fall into the GDC CDE?

In [47]:
gdc_tmp = gdc_with_project_and_image_availability[['project.project_id', 'Case ID', 'Available Images', 'diagnoses.0.primary_diagnosis', 'diagnoses.0.tissue_or_organ_of_origin', 'demographic.race', 'demographic.ethnicity', 'demographic.gender', 'diagnoses.0.age_at_diagnosis']].copy()

# Rename columns
gdc_tmp.rename(columns={'demographic.ethnicity': 'Ethnicity',
                         'demographic.gender': 'Sex at Birth',
                         'demographic.race': 'Race',
                         'diagnoses.0.age_at_diagnosis': 'Age at Diagnosis',
                         'diagnoses.0.primary_diagnosis': 'Primary Diagnosis',
                         'diagnoses.0.tissue_or_organ_of_origin': 'Primary Site',
                         'project.project_id': 'Project Short Name'}, inplace=True)

# Convert 'or' to lowercase in specified columns
gdc_tmp['Primary Site'] = gdc_tmp['Primary Site'].str.replace('Or', 'or', case=False)
gdc_tmp['Race'] = gdc_tmp['Race'].str.replace('Or', 'or', case=False)
gdc_tmp['Ethnicity'] = gdc_tmp['Ethnicity'].str.replace('Or', 'or', case=False)

# Create 'Age UOM' column and populate with 'Day'
gdc_tmp['Age UOM'] = 'Day'

# Convert 'Nan' and 'nan' to NaN
gdc_tmp = gdc_tmp.replace(['Nan', 'nan'], pd.NA)

# drop probable GDC data entry issues for 2 subjects in CPTAC-BRCA -- they have no images in TCIA,
#   but the patient ID for 604 aligns with a subject from a different collection
gdc_tmp = gdc_tmp[~gdc_tmp['Case ID'].isin(['604', '1488'])]

# Add the 'Collection' column to the DataFrame to fix CPTAC collections (not just CPTAC-2 and CPTAC-3)
gdc_clinical = add_collection_from_other_dfs(gdc_tmp, pathology, radiology)

In [48]:
gdc_clinical

Unnamed: 0,Project Short Name,Case ID,Available Images,Primary Diagnosis,Primary Site,Race,Ethnicity,Sex at Birth,Age at Diagnosis,Age UOM
0,TCGA-OV,TCGA-13-0920,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,white,not hispanic or latino,female,24064.0,Day
9,TCGA-OV,TCGA-13-1408,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,white,not hispanic or latino,female,21635.0,Day
15,TCGA-OV,TCGA-09-1675,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,not reported,not hispanic or latino,female,18596.0,Day
30,TCGA-OV,TCGA-10-0937,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,white,not reported,female,16222.0,Day
32,TCGA-OV,TCGA-13-1488,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,white,not hispanic or latino,female,21726.0,Day
...,...,...,...,...,...,...,...,...,...,...
11249,TCGA-UCEC,TCGA-D1-A15Z,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,white,not hispanic or latino,female,26638.0,Day
11252,TCGA-UCEC,TCGA-D1-A175,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,white,not hispanic or latino,female,17802.0,Day
11256,TCGA-UCEC,TCGA-D1-A15V,Radiology,"Serous cystadenocarcinoma, NOS",Endometrium,white,not hispanic or latino,female,24886.0,Day
11261,TCGA-UCEC,TCGA-FI-A2CX,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,white,not hispanic or latino,female,30224.0,Day


In [42]:
# Get the unique values for 'project.project_id' column
unique_project_ids = gdc_clinical['Project Short Name'].unique()

# Display the unique project IDs
print(unique_project_ids)

['TCGA-OV' 'TCGA-BLCA' 'TCGA-COAD' 'TCGA-CESC' 'TCGA-PRAD' 'TCGA-READ'
 'TCGA-SARC' 'TCGA-LUSC' 'TCGA-LIHC' 'TCGA-LUAD' 'TCGA-KIRC' 'TCGA-KIRP'
 'CPTAC-PDA' 'CPTAC-GBM' 'CPTAC-CCRCC' 'UPENN-GBM' 'CPTAC-non-CCRCC'
 'CPTAC-UCEC' 'CPTAC-HNSCC' 'CPTAC-LUAD' 'CPTAC-LSCC' 'TCGA-LGG'
 'TCGA-HNSC' 'TCGA-KICH' 'TCGA-BRCA' 'CPTAC-BRCA' 'CPTAC-COAD' 'CPTAC-OV'
 'TCGA-GBM' 'TCGA-ESCA' 'TCGA-STAD' 'TCGA-THCA' 'TCGA-UCEC']


### Note: the UPENN-GBM overlap is valid
Some cases that are marked as CPTAC-GBM in GDC were purposefully put in the UPENN-GBM collection on TCIA per the submitters' request to reduce potential confusion about previously existing radiology images that had been published under UPENN-GBM prior to CPTAC deciding to include these subjects in their genomics/proteomics data.

In [54]:
allowable_race = {
    'white': 'White',
    'not reported': 'Not Reported',
    'black or african american': 'Black or African American',
    'american indian or alaska native': 'American Indian or Alaska Native',
    'asian': 'Asian',
    'other': 'Unknown',  # Mapping 'other' to 'Unknown'
    'Unknown': 'Unknown',
    'native hawaiian or other pacific islander': 'Native Hawaiian or Other Pacific Islander'
}

allowable_ethnicity = {
    'not hispanic or latino': 'Not Hispanic or Latino',
    'not reported': 'Not Reported',
    'hispanic or latino': 'Hispanic or Latino',
    'Unknown': 'Unknown'
}

allowable_sex_at_birth = {
    'female': 'Female',
    'male': 'Male',
    'not reported': 'Unknown'
}

# Apply the mappings to the dataframe
gdc_clinical['Race'] = gdc_clinical['Race'].apply(lambda x: race_mapping.get(x, x) if x not in allowable_race else x).fillna('Unknown')
gdc_clinical['Ethnicity'] = gdc_clinical['Ethnicity'].apply(lambda x: ethnicity_mapping.get(x, x) if x not in allowable_ethnicity else x).fillna('Unknown')
gdc_clinical['Sex at Birth'] = gdc_clinical['Sex at Birth'].apply(lambda x: sex_at_birth_mapping.get(x, x) if x not in allowable_sex_at_birth else x).fillna('Unknown')

gdc_clinical


Unnamed: 0,Project Short Name,Case ID,Available Images,Primary Diagnosis,Primary Site,Race,Ethnicity,Sex at Birth,Age at Diagnosis,Age UOM
0,TCGA-OV,TCGA-13-0920,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,White,Not Hispanic or Latino,Female,24064.0,Day
9,TCGA-OV,TCGA-13-1408,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,White,Not Hispanic or Latino,Female,21635.0,Day
15,TCGA-OV,TCGA-09-1675,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,Not Reported,Not Hispanic or Latino,Female,18596.0,Day
30,TCGA-OV,TCGA-10-0937,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,White,Not Reported,Female,16222.0,Day
32,TCGA-OV,TCGA-13-1488,Radiology,"Serous cystadenocarcinoma, NOS",Ovary,White,Not Hispanic or Latino,Female,21726.0,Day
...,...,...,...,...,...,...,...,...,...,...
11249,TCGA-UCEC,TCGA-D1-A15Z,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,White,Not Hispanic or Latino,Female,26638.0,Day
11252,TCGA-UCEC,TCGA-D1-A175,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,White,Not Hispanic or Latino,Female,17802.0,Day
11256,TCGA-UCEC,TCGA-D1-A15V,Radiology,"Serous cystadenocarcinoma, NOS",Endometrium,White,Not Hispanic or Latino,Female,24886.0,Day
11261,TCGA-UCEC,TCGA-FI-A2CX,Radiology,"Endometrioid adenocarcinoma, NOS",Endometrium,White,Not Hispanic or Latino,Female,30224.0,Day


## create master clinical df to combine CTDC Biobank and GDC (TCGA/CPTAC)

In [57]:
master_clinical = pd.concat([biobank_clinical, gdc_tmp], ignore_index=True)

In [58]:
# Extract unique values from the specified columns
unique_race = master_clinical['Race'].unique().tolist()
unique_ethnicity = master_clinical['Ethnicity'].unique().tolist()
unique_sex_at_birth = master_clinical['Sex at Birth'].unique().tolist()

print("Unique Race values:", unique_race)
print("Unique Ethnicity values:", unique_ethnicity)
print("Unique Sex at Birth values:", unique_sex_at_birth)

Unique Race values: ['Black or African American', 'White', 'Not Reported', 'Asian', 'Unknown', 'American Indian or Alaska Native', 'Native Hawaiian or other Pacific Islander', 'Native Hawaiian or Other Pacific Islander']
Unique Ethnicity values: ['Not Hispanic or Latino', 'Hispanic or Latino', 'Not Reported', 'Unknown']
Unique Sex at Birth values: ['Female', 'Male', 'Unknown']


In [59]:
master_clinical

Unnamed: 0,Project Short Name,Case ID,Available Images,Primary Diagnosis,Race,Ethnicity,Sex at Birth,Age at Enrollment,Age UOM,Primary Site,Age at Diagnosis
0,CMB-MML,MSB-00089,Pathology,Plasma Cell Myeloma,Black or African American,Not Hispanic or Latino,Female,67.0,Year,,
1,CMB-MML,MSB-00140,Both,Plasma Cell Myeloma,White,Not Hispanic or Latino,Female,65.0,Year,,
2,CMB-MEL,MSB-00205,Pathology,Melanoma,White,Not Hispanic or Latino,Male,63.0,Year,,
3,CMB-CRC,MSB-00241,Pathology,Colorectal Carcinoma,White,Not Hispanic or Latino,Female,59.0,Year,,
4,CMB-MEL,MSB-00263,Both,Melanoma,White,Not Hispanic or Latino,Male,56.0,Year,,
...,...,...,...,...,...,...,...,...,...,...,...
3683,TCGA-UCEC,TCGA-D1-A15Z,Radiology,"Endometrioid adenocarcinoma, NOS",White,Not Hispanic or Latino,Female,,Day,Endometrium,26638.0
3684,TCGA-UCEC,TCGA-D1-A175,Radiology,"Endometrioid adenocarcinoma, NOS",White,Not Hispanic or Latino,Female,,Day,Endometrium,17802.0
3685,TCGA-UCEC,TCGA-D1-A15V,Radiology,"Serous cystadenocarcinoma, NOS",White,Not Hispanic or Latino,Female,,Day,Endometrium,24886.0
3686,TCGA-UCEC,TCGA-FI-A2CX,Radiology,"Endometrioid adenocarcinoma, NOS",White,Not Hispanic or Latino,Female,,Day,Endometrium,30224.0


In [61]:
master_clinical.to_excel('nci_program_clinical.xlsx')