In [656]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression statement in a cell, not
# just the last one; later cells rely on this to print several results.
InteractiveShell.ast_node_interactivity = "all"
# possible values are: 'all', 'none', 'last' and 'last_expr'

In [657]:
from bravado.client import SwaggerClient
import pandas as pd
import re

# Build the API client from the published Swagger spec. Request and
# response validation are both switched off.
swaggerConfig = {
    "validate_requests": False,
    "validate_responses": False,
}
cbioportal = SwaggerClient.from_url(
    'https://www.cbioportal.org/api/api-docs',
    config=swaggerConfig,
)

In [714]:
# Fetch every molecular profile and keep the list of their IDs.
profilesRequest = cbioportal.Molecular_Profiles.getAllMolecularProfilesUsingGET()
molecular_profiles = profilesRequest.result()
print(f"There are {len(molecular_profiles)} molecular profiles.")
molProfIDs = [profile.molecularProfileId for profile in molecular_profiles]

There are 1196 molecular profiles.


In [702]:
#######################################################################
# Discrepancy # 1, 33 of unique samples
#######################################################################

# This profile is odd: its name ends with 'uncalled' instead of following
# the usual convention of ending with '_mutations'.
# The "hard way" (one molecular profile paired with one sample list Id)
# never extracts it, because only profiles ending with '_mutations' are
# used there. The "easy way" (multiple profiles in one request) does
# extract it.
# Open question: given the 'uncalled' in the name, are these mutations
# legitimate?
uncalledProfileID = 'glioma_msk_2018_mutations_uncalled'
uncalledProfileID in molProfIDs

True

In [660]:
# Entrez gene IDs of the test genes, listed in the order
# EGFR, TP53, HRAS, NRAS.
geneIDs = [
    1956,  # EGFR
    7157,  # TP53
    3265,  # HRAS
    4893,  # NRAS
]

In [703]:
# One-step extraction: request the mutations of the test genes across
# all molecular profiles in a single call.
multiProfileFilter = {
    "entrezGeneIds": geneIDs,
    "molecularProfileIds": molProfIDs,
}
mutations = (
    cbioportal.Mutations
    .fetchMutationsInMultipleMolecularProfilesUsingPOST(
        mutationMultipleStudyFilter=multiProfileFilter)
    .result()
)
len(mutations)

38175

In [704]:
# For every returned mutation, record the sample ID and the
# uniqueSampleKey reported with it.
sampleReturnedIDs = [hit.sampleId for hit in mutations]
sampleUniqueReturnedIDs = [hit.uniqueSampleKey for hit in mutations]

# Number of distinct values of each identifier.
len({*sampleReturnedIDs})
len({*sampleUniqueReturnedIDs})

20486

31919

In [705]:
#######################################################################
# Discrepancy # 2, one unique sample
#######################################################################

# The sample 'TCGA-13-0765-01' shows up in several 'TCGA' studies when the
# one-step method (the "easy way") is used, and its uniqueSampleKey is not
# the same from one study to the next.

# display() is needed because expressions inside a loop body are not
# echoed automatically, unlike top-level expressions.
from IPython.display import display

sampleOv1ID = 'TCGA-13-0765-01'
sampleOv1ID in sampleReturnedIDs

# One [sampleId, studyId, molecularProfileId, uniqueSampleKey] row per
# mutation returned for this sample.
mutationOne = [[m.sampleId, m.studyId, m.molecularProfileId, m.uniqueSampleKey]
               for m in mutations if m.sampleId == sampleOv1ID]
len(mutationOne)

# Show every hit rather than hard-coding indices 0, 1 and 2: the fixed
# indices raised IndexError with fewer than three hits and hid any hit
# beyond the third. The unpacked names are kept and hold the values of the
# last hit once the loop finishes.
for xSampleID, xStudyID, xMolProfID, uniqueSampleKey in mutationOne:
    display((xSampleID, xStudyID, xMolProfID, uniqueSampleKey))

True

3

('TCGA-13-0765-01',
 'ov_tcga_pub',
 'ov_tcga_pub_mutations',
 'VENHQS0xMy0wNzY1LTAxOm92X3RjZ2FfcHVi')

('TCGA-13-0765-01',
 'ov_tcga',
 'ov_tcga_mutations',
 'VENHQS0xMy0wNzY1LTAxOm92X3RjZ2E')

('TCGA-13-0765-01',
 'ov_tcga_pan_can_atlas_2018',
 'ov_tcga_pan_can_atlas_2018_mutations',
 'VENHQS0xMy0wNzY1LTAxOm92X3RjZ2FfcGFuX2Nhbl9hdGxhc18yMDE4')

In [715]:
# Query a single molecular profile restricted to one sample list Id.
# With this combination the sample examined above has no hit, although
# the one-step method reported one for the 'ov_tcga' study.

molProfID_t = 'ov_tcga_mutations'
sampleListID_t = 'ov_tcga_all'
filter_t = {
    'entrezGeneIds': geneIDs,
    'sampleListId': sampleListID_t,
}
ovMutationsRequest = cbioportal.Mutations.fetchMutationsInMolecularProfileUsingPOST(
    molecularProfileId=molProfID_t,
    mutationFilter=filter_t,
)
mutationsOv = ovMutationsRequest.result()
len(mutationsOv)

# Keep only the hits that belong to the sample under investigation.
sampleReturnedOvIDs = [
    hit.sampleId
    for hit in mutationsOv
    if hit.sampleId == sampleOv1ID
]
len(sampleReturnedOvIDs)

288

0

In [709]:
# Positive control: a different sample that the same query does return.
sampleOv2ID = 'TCGA-13-1408-01'
sampleReturnedOvIDs = [
    hit.sampleId
    for hit in mutationsOv
    if hit.sampleId == sampleOv2ID
]
len(sampleReturnedOvIDs)

1

In [711]:
# Explanation: the first sample is not a member of the sample list,
# whereas the positive-control sample is.
sampleIdsRequest = cbioportal.Sample_Lists.getAllSampleIdsInSampleListUsingGET(
    sampleListId=sampleListID_t,
)
sample_ov = sampleIdsRequest.result()

len(sample_ov)
sampleOv1ID in sample_ov
sampleOv2ID in sample_ov

606

False

True