In [77]:
import requests
import json

In [78]:
# Ping the cBioPortal health endpoint, then show the decoded body and the HTTP status code
r = requests.get(url="https://www.cbioportal.org/api/health")
print(r.json(), r.status_code, sep="\n")

{'status': 'UP'}
200


Part 1: Retrieve and display patient data. The user must provide a valid patient ID and study ID.
Example uses
patient: P-0000004 
study: msk_impact_2017

In [79]:
# Patient to look up and the study that patient belongs to
patientId = "P-0000004"
studyId = "msk_impact_2017"
# Request every clinical-data record stored for this patient in this study
clinicalDataUrl = f"https://www.cbioportal.org/api/studies/{studyId}/patients/{patientId}/clinical-data"
clinicalDataRequest = requests.get(url=clinicalDataUrl)
clinicalData = clinicalDataRequest.json()
# Show one record per line, leaving two blank lines after each record
for record in clinicalData:
    print(record, end="\n\n\n")

{'uniquePatientKey': 'UC0wMDAwMDA0Om1za19pbXBhY3RfMjAxNw', 'patientId': 'P-0000004', 'studyId': 'msk_impact_2017', 'clinicalAttributeId': 'OS_STATUS', 'value': '0:LIVING'}


{'uniquePatientKey': 'UC0wMDAwMDA0Om1za19pbXBhY3RfMjAxNw', 'patientId': 'P-0000004', 'studyId': 'msk_impact_2017', 'clinicalAttributeId': 'SAMPLE_COUNT', 'value': '1'}


{'uniquePatientKey': 'UC0wMDAwMDA0Om1za19pbXBhY3RfMjAxNw', 'patientId': 'P-0000004', 'studyId': 'msk_impact_2017', 'clinicalAttributeId': 'SEX', 'value': 'Female'}


{'uniquePatientKey': 'UC0wMDAwMDA0Om1za19pbXBhY3RfMjAxNw', 'patientId': 'P-0000004', 'studyId': 'msk_impact_2017', 'clinicalAttributeId': 'SMOKING_HISTORY', 'value': 'Unknown'}


{'uniquePatientKey': 'UC0wMDAwMDA0Om1za19pbXBhY3RfMjAxNw', 'patientId': 'P-0000004', 'studyId': 'msk_impact_2017', 'clinicalAttributeId': 'VITAL_STATUS', 'value': 'ALIVE'}




Part 2: Find patients similar to a specified patient. Searches within the same study using a single criterion.
Example uses
patient: P-0000024 
study: msk_impact_2017
attribute (not recorded across all studies): SMOKING_HISTORY

In [80]:
# Patient and Study variables
patientId = "P-0000024"
studyId = "msk_impact_2017"
clinicalAttributeId = "SMOKING_HISTORY"
# get patient attribute
patientAttributeRequest = requests.get(url=f"https://www.cbioportal.org/api/studies/{studyId}/patients/{patientId}/clinical-data?attributeId={clinicalAttributeId}&projection=SUMMARY&pageSize=10000000&pageNumber=0&direction=ASC")
# fail loudly on an HTTP error instead of indexing into an error body
patientAttributeRequest.raise_for_status()
# parse for correct item (attribute)
patientAttributeItem = patientAttributeRequest.json()
# an empty list means the patient has no value recorded for this attribute;
# report that clearly instead of failing with a bare IndexError on [0]
if not patientAttributeItem:
    raise ValueError(f"patient {patientId} has no {clinicalAttributeId} value in study {studyId}")
y = patientAttributeItem[0]
patientValue = y['value']
print (y['clinicalAttributeId'])
print (patientValue)
# initialize counter
patientCounter = 0
# Get list of patients from study
print ("matching patients")
patientListAndAttributesRequest = requests.get(url=f"https://www.cbioportal.org/api/studies/{studyId}/clinical-data?attributeId={clinicalAttributeId}&clinicalDataType=PATIENT&projection=SUMMARY&pageSize=20000&pageNumber=0&direction=ASC")
patientListAndAttributesRequest.raise_for_status()
compareList = patientListAndAttributesRequest.json()
# print every patient whose attribute value equals the reference patient's value
# (the reference patient is part of the study, so it appears in the list as well)
for candidate in compareList:
    if candidate['value'] == patientValue:
        print (candidate['patientId'])
        patientCounter += 1
print(f"number of matching patients: {patientCounter}")

SMOKING_HISTORY
Prev/Curr Smoker
matching patients
P-0000024
P-0000034
P-0000039
P-0000042
P-0000043
P-0000047
P-0000053
P-0000056
P-0000057
P-0000058
P-0000059
P-0000065
P-0000066
P-0000068
P-0000071
P-0000079
P-0000081
P-0000082
P-0000083
P-0000084
P-0000085
P-0000086
P-0000093
P-0000096
P-0000098
P-0000102
P-0000104
P-0000105
P-0000110
P-0000113
P-0000115
P-0000120
P-0000123
P-0000127
P-0000129
P-0000130
P-0000131
P-0000132
P-0000136
P-0000142
P-0000144
P-0000149
P-0000152
P-0000157
P-0000158
P-0000161
P-0000163
P-0000167
P-0000168
P-0000174
P-0000181
P-0000182
P-0000184
P-0000185
P-0000186
P-0000189
P-0000190
P-0000195
P-0000197
P-0000201
P-0000202
P-0000208
P-0000211
P-0000212
P-0000214
P-0000216
P-0000219
P-0000220
P-0000221
P-0000222
P-0000232
P-0000235
P-0000236
P-0000239
P-0000242
P-0000245
P-0000246
P-0000252
P-0000257
P-0000258
P-0000267
P-0000268
P-0000273
P-0000275
P-0000283
P-0000284
P-0000286
P-0000287
P-0000293
P-0000295
P-0000299
P-0000301
P-0000302
P-0000303
P-0000313

Part 3: Find patients similar to a specified patient across multiple studies. Uses one criterion, and must first check whether that criterion is available in each of the other studies. If a criterion is not available in all studies, other studies may record it under an attribute with a similar name or purpose (e.g. SMOKING_HISTORY vs SMOKE STATUS vs SMOKING_STATUS), and those attributes will not be picked up.
https://www.cbioportal.org/study/summary?id=nsclc_tcga_broad_2016 and https://www.cbioportal.org/study/summary?id=msk_impact_2017 both have SMOKING_HISTORY, but the response types are different. This requires parsing sample data instead of patient data.
Example uses
patient:  P-0000057
studies: tmb_mskcc_2018, metastatic_solid_tumors_mich_2017
attribute: CANCER_TYPE

First draft: 10836 samples, approx 52 minutes
- need to break after first sample match found 
- reduce number of database queries? 

Second draft: 2161 samples, approx 12 minutes


In [88]:
# initialize values
# reference patient whose attribute value the other patients are compared against
patient = "P-0000057"
# study the reference patient's samples are requested from
patientOriginalStudy = "tmb_mskcc_2018"
# studies whose patients are scanned by the search loop in this cell
studyList = ["tmb_mskcc_2018", "metastatic_solid_tumors_mich_2017"]
# clinical attribute id used for the comparison; also read as a global by fetchSampleClinicalDataAttribute
compareAttribute = "CANCER_TYPE"
# running tally of matched patients, updated by the search loop
counter = 0

# methods
# fetchPatientSamples asks the API for the samples recorded for one patient in one study
def fetchPatientSamples(patientId, studyId):
    """Return the HTTP response listing the samples of `patientId` within `studyId`.

    The query string requests page 0 with pageSize=10 in ascending order.
    The response is returned undecoded; callers use `.json()` on it.
    """
    samplesUrl = f"https://www.cbioportal.org/api/studies/{studyId}/patients/{patientId}/samples?projection=SUMMARY&pageSize=10&pageNumber=0&direction=ASC"
    return requests.get(url=samplesUrl)

# fetchSampleClinicalDataAttribute requests a single clinical attribute of a single sample
def fetchSampleClinicalDataAttribute(studyId, sampleId, attribute=None):
    """Return the HTTP response holding one clinical attribute of one sample.

    studyId   -- study the sample belongs to
    sampleId  -- sample whose clinical data is requested
    attribute -- clinical attribute id to request. When omitted, the
                 module-level `compareAttribute` is used, which keeps existing
                 two-argument calls working while allowing callers to pass the
                 attribute explicitly instead of relying on a global.

    The response is returned undecoded; callers use `.json()` on it.
    """
    if attribute is None:
        # backward-compatible fallback to the notebook-level setting
        attribute = compareAttribute
    return requests.get(url=f"https://www.cbioportal.org/api/studies/{studyId}/samples/{sampleId}/clinical-data?attributeId={attribute}&projection=SUMMARY&pageSize=20000&pageNumber=0&direction=ASC")

# fetchClinicalAttributesStudy asks the API for the clinical attributes defined in a study
def fetchClinicalAttributesStudy(studyId):
    """Return the HTTP response listing the clinical attributes of `studyId`.

    The response is returned undecoded; callers use `.json()` on it.
    """
    attributesUrl = f"https://www.cbioportal.org/api/studies/{studyId}/clinical-attributes?projection=SUMMARY&pageSize=20000&pageNumber=0&direction=ASC"
    return requests.get(url=attributesUrl)

# fetchPatientList asks the API for the patients of a study
def fetchPatientList(studyId):
    """Return the HTTP response listing the patients of `studyId`.

    The response is returned undecoded; callers use `.json()` on it.
    """
    patientsUrl = f"https://www.cbioportal.org/api/studies/{studyId}/patients?projection=SUMMARY&pageSize=200000&pageNumber=0&direction=ASC"
    return requests.get(url=patientsUrl)

# Request list of patient's samples. Take first sampleId
patientSamples = fetchPatientSamples(patient, patientOriginalStudy)
pPatientData = patientSamples.json()
# report a missing reference patient clearly instead of failing with IndexError on [0]
if not pPatientData:
    raise ValueError(f"no samples found for patient {patient} in study {patientOriginalStudy}")
pSample = pPatientData[0]
pSampleId = pSample['sampleId']
print (pPatientData)
print(pSampleId)
# Get clinical data from the first sample
patientClinicalDataAttribute = fetchSampleClinicalDataAttribute(patientOriginalStudy, pSampleId)
pAttributeRecords = patientClinicalDataAttribute.json()
if not pAttributeRecords:
    raise ValueError(f"sample {pSampleId} has no {compareAttribute} value in study {patientOriginalStudy}")
pAttribute = pAttributeRecords[0]
pAttributeValue = pAttribute['value']
print(f"{compareAttribute}: {pAttributeValue}")
print("matched patients:")
# NOTE(review): the loop below issues one request per patient plus one per sample.
# The study-level clinical-data endpoint used in Part 2 may be able to return the
# attribute for every sample of a study in a single call (clinicalDataType=SAMPLE)
# -- confirm against the cBioPortal API documentation before switching.
# loop through studies
for study in studyList:
    # skip the study when it does not define the attribute at all;
    # any() stops at the first hit, so the study is scanned at most once
    listOfAttributesRequest = fetchClinicalAttributesStudy(study)
    listOfAttributes = listOfAttributesRequest.json()
    if not any(attribute['clinicalAttributeId'] == compareAttribute for attribute in listOfAttributes):
        continue
    # get list of patients
    patientListRequest = fetchPatientList(study)
    patientIdList = patientListRequest.json()
    # loop through patients, requesting their samples and then using the Ids to find the attribute (see first patient)
    for Id in patientIdList:
        currentPatientId = Id['patientId']
        currentPatientSamplesRequest = fetchPatientSamples(currentPatientId, study)
        currentSamples = currentPatientSamplesRequest.json()
        # following loop requests each sample and checks for the attribute, prints if match
        for sample in currentSamples:
            currentSampleId = sample['sampleId']
            currentClinicalDataRequest = fetchSampleClinicalDataAttribute(study, currentSampleId)
            currentSampleRecords = currentClinicalDataRequest.json()
            # a sample without a value for the attribute cannot match; skip it
            # rather than crash on [0]
            if not currentSampleRecords:
                continue
            currentSampleValue = currentSampleRecords[0]['value']
            if currentSampleValue == pAttributeValue:
                print(currentPatientId)
                # the reference patient is listed but not counted; checking for it
                # explicitly keeps the total right even when patientOriginalStudy
                # is not part of studyList
                if not (currentPatientId == patient and study == patientOriginalStudy):
                    counter += 1
                # break when one correct sample is found
                break
print(f"total matched patients: {counter}")
                    




[{'uniqueSampleKey': 'UC0wMDAwMDU3LVQwMS1JTTM6dG1iX21za2NjXzIwMTg', 'uniquePatientKey': 'UC0wMDAwMDU3OnRtYl9tc2tjY18yMDE4', 'sampleType': 'Primary Solid Tumor', 'sampleId': 'P-0000057-T01-IM3', 'patientId': 'P-0000057', 'studyId': 'tmb_mskcc_2018'}]
P-0000057-T01-IM3
CANCER_TYPE: Breast Cancer
matched patients:
P-0000057
P-0000247
P-0000392
P-0000422
P-0000447
P-0000547
P-0000638
P-0000704
P-0001312
P-0001785
P-0002041
P-0002626
P-0002789
P-0003224
P-0003241
P-0003265
P-0003273
P-0005131
P-0005176
P-0005274
P-0005712
P-0005855
P-0006227
P-0006842
P-0007014
P-0007127
P-0007349
P-0008571
P-0009297
P-0009364
P-0009498
P-0009727
P-0010252
P-0010669
P-0010800
P-0010994
P-0011552
P-0012803
P-0012825
P-0013210
P-0015445
P-0015713
P-0015905
P-0016226
MO_1030
MO_1031
MO_1032
MO_1051
MO_1065
MO_1068
MO_1069
MO_1090
MO_1107
MO_1126
MO_1129
MO_1159
MO_1167
MO_1178
MO_1185
MO_1188
MO_1190
MO_1213
MO_1227
MO_1230
MO_1237
MO_1239
MO_1247
MO_1266
MO_1267
MO_1269
MO_1271
MO_1276
MO_1287
MO_1288
MO_1289

Part 4: Find patients across multiple studies that match more than one criterion of a specified patient. Each study must be checked for all of the criteria before it is searched.
- patient: P-0000057 from tmb_mskcc_2018 (First draft), P-0000422 (second draft)
- studies: tmb_mskcc_2018, metastatic_solid_tumors_mich_2017
- attributes: CANCER_TYPE and ONCOTREE_CODE

First Draft, first run: 2161 samples, approx 13 minutes
attribute values are Breast cancer and MLDC
one match

First Draft, second run: 2161 samples, approx 12.5 minutes
attribute values are Breast cancer and ILC
19 matches



In [87]:
# initialize values
# reference patient whose attribute values the other patients are compared against
patient = "P-0000422"
# study the reference patient's samples are requested from
patientOriginalStudy = "tmb_mskcc_2018"
# studies whose patients are scanned by the search loop in this cell
studyList = ["tmb_mskcc_2018", "metastatic_solid_tumors_mich_2017"]
# clinical attribute ids that must all match
compareAttributeList = ["CANCER_TYPE", "ONCOTREE_CODE"]
# reference values, appended in the same order as compareAttributeList from the reference patient's sample
compareValueList = []
# running tally of matched patients, updated by the search loop
counter = 0
# number of requested attributes found in the study currently being checked
attributeTrack = 0

# methods
# fetchPatientSamples uses requests to get a patient's samples from one study
def fetchPatientSamples(patientId, studyId):
    """Request the samples of `patientId` in `studyId` and hand back the response.

    The query string asks for page 0 with pageSize=10, ascending.
    Nothing is decoded here; the caller calls `.json()` on the result.
    """
    endpoint = f"https://www.cbioportal.org/api/studies/{studyId}/patients/{patientId}/samples?projection=SUMMARY&pageSize=10&pageNumber=0&direction=ASC"
    response = requests.get(url=endpoint)
    return response

# fetchSampleClinicalDataAttribute takes a sample and requests only the specified attribute
def fetchSampleClinicalDataAttribute(studyId, sampleId, attribute):
    """Request clinical attribute `attribute` of `sampleId` in `studyId`.

    Nothing is decoded here; the caller calls `.json()` on the result.
    """
    endpoint = f"https://www.cbioportal.org/api/studies/{studyId}/samples/{sampleId}/clinical-data?attributeId={attribute}&projection=SUMMARY&pageSize=20000&pageNumber=0&direction=ASC"
    response = requests.get(url=endpoint)
    return response

# fetchClinicalAttributesStudy requests the clinical attribute fields of a study
def fetchClinicalAttributesStudy(studyId):
    """Request the clinical attributes defined for `studyId`.

    Nothing is decoded here; the caller calls `.json()` on the result.
    """
    endpoint = f"https://www.cbioportal.org/api/studies/{studyId}/clinical-attributes?projection=SUMMARY&pageSize=20000&pageNumber=0&direction=ASC"
    response = requests.get(url=endpoint)
    return response

# fetchPatientList requests the patients of a study (SUMMARY projection, no detailed info)
def fetchPatientList(studyId):
    """Request the patients of `studyId`.

    Nothing is decoded here; the caller calls `.json()` on the result.
    """
    endpoint = f"https://www.cbioportal.org/api/studies/{studyId}/patients?projection=SUMMARY&pageSize=200000&pageNumber=0&direction=ASC"
    response = requests.get(url=endpoint)
    return response

# compare attribute
def compareAttributes(patientSampleList, num, studyId=None):
    """Check one attribute of a patient against the reference value.

    patientSampleList -- decoded sample records of one patient; each record
                         is a dict with a 'sampleId' key
    num               -- index into compareAttributeList / compareValueList
                         selecting the attribute to check
    studyId           -- study the samples belong to. When omitted, the
                         module-level loop variable `study` is used, which
                         keeps existing two-argument calls working.

    Returns 0 when every sample carries the reference value for the attribute,
    and 1 otherwise. A patient without samples, or a sample without a value
    for the attribute, counts as a mismatch.
    """
    # no samples means nothing was compared, so this must not count as a match
    if not patientSampleList:
        return 1
    if studyId is None:
        # backward-compatible fallback to the notebook's study loop variable
        studyId = study
    for sample in patientSampleList:
        currentSampleId = sample['sampleId']
        currentClinicalDataRequest = fetchSampleClinicalDataAttribute(studyId, currentSampleId, compareAttributeList[num])
        currentSampleRecords = currentClinicalDataRequest.json()
        # an empty response means the sample has no value for this attribute;
        # treat it as a mismatch instead of failing with IndexError on [0]
        if not currentSampleRecords:
            return 1
        if currentSampleRecords[0]['value'] != compareValueList[num]:
            return 1
    # every sample matched the reference value for this attribute
    return 0

# Request patient's sample and take values
originalPatientSampleRequest = fetchPatientSamples(patient, patientOriginalStudy)
originalPatientSampleList = originalPatientSampleRequest.json()
# report a missing reference patient clearly instead of failing with IndexError on [0]
if not originalPatientSampleList:
    raise ValueError(f"no samples found for patient {patient} in study {patientOriginalStudy}")
pOriginalSamples = originalPatientSampleList[0]
pSampleId = pOriginalSamples['sampleId']
print(pSampleId)
# Get original patient's clinical sample data attributes, loop to populate list of values
for attributeId in compareAttributeList:
    pClinicalDataAttribute = fetchSampleClinicalDataAttribute(patientOriginalStudy, pSampleId, attributeId)
    pAttributeRecords = pClinicalDataAttribute.json()
    if not pAttributeRecords:
        raise ValueError(f"sample {pSampleId} has no {attributeId} value in study {patientOriginalStudy}")
    pCurrentAttribute = pAttributeRecords[0]
    pcurrentValue = pCurrentAttribute['value']
    compareValueList.append(pcurrentValue)
    print (f"{attributeId} : {pcurrentValue}")
# loop through studies
# (the loop variable must stay named `study`: compareAttributes reads it as a global)
for study in studyList:
    # fetch attributes
    currentStudyAttributesRequest = fetchClinicalAttributesStudy(study)
    currentStudyAttributesList = currentStudyAttributesRequest.json()
    # check if all attributes are applicable; membership in a set of ids keeps
    # the tally correct even if the study lists an attribute id more than once
    availableAttributeIds = {listAttribute['clinicalAttributeId'] for listAttribute in currentStudyAttributesList}
    attributeTrack = sum(1 for attribute in compareAttributeList if attribute in availableAttributeIds)
    if attributeTrack != len(compareAttributeList):
        continue
    # get list of patients
    patientListRequest = fetchPatientList(study)
    patientList = patientListRequest.json()
    # loop through patients
    for patientInfo in patientList:
        # get patient samples
        currentPatientId = patientInfo['patientId']
        currentPatientSamplesRequest = fetchPatientSamples(currentPatientId, study)
        currentPatientSamples = currentPatientSamplesRequest.json()
        # a patient without samples has nothing to compare and must not be reported
        if not currentPatientSamples:
            continue
        # all() stops at the first attribute that does not match, like the
        # earlier break; the non-empty guard keeps "no attributes" from
        # matching every patient
        if compareAttributeList and all(
            compareAttributes(currentPatientSamples, x) != 1
            for x in range(len(compareAttributeList))
        ):
            print(currentPatientId)
            # the reference patient is listed but not counted; checking for it
            # explicitly keeps the total right even when the reference patient
            # does not match itself or patientOriginalStudy is not searched
            if not (currentPatientId == patient and study == patientOriginalStudy):
                counter += 1
print (f"total matched patients: {counter}")

P-0000422-T01-IM3
CANCER_TYPE : Breast Cancer
ONCOTREE_CODE : ILC
P-0000422
P-0000547
P-0006227
P-0009498
P-0010800
MO_1030
MO_1068
MO_1069
MO_1185
MO_1190
MO_1227
MO_1300
MO_1411
MO_1425
MO_1435
MO_1454
MO_1455
MO_1509
MO_1524
TP_2109
total matched patients: 19
