In [1]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
from bs4 import BeautifulSoup

In [2]:
all_genes = requests.get('https://www.oncokb.org/api/v1/utils/cancerGeneList', 
                         headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()
all_genes = pd.DataFrame(all_genes)
print('All genes in OncoKB:', all_genes.shape[0])
all_genes = all_genes[(all_genes.oncokbAnnotated == True)]
print('Oncokb annotated genes:', all_genes.shape[0])
all_genes.head(1)

All genes in OncoKB: 1203
Oncokb annotated genes: 950


Unnamed: 0,hugoSymbol,entrezGeneId,grch37Isoform,grch37RefSeq,grch38Isoform,grch38RefSeq,oncokbAnnotated,occurrenceCount,mSKImpact,mSKHeme,foundation,foundationHeme,vogelstein,sangerCGC,geneType,geneAliases
0,ABL1,25,ENST00000318560,NM_005157.4,ENST00000318560,NM_005157.4,True,7,True,True,True,True,True,True,ONCOGENE,"[JTK7, c-ABL, ABL]"


In [3]:
#downloaded from the oncokb website - TODO: figure out a way to programatically download this
table = pd.read_csv('raw-data/oncokb_biomarker_drug_associations.tsv', sep = '\t')
table = table[table['Drugs (for therapeutic implications only)'].notna()]
def process_alterations(x):
    x = x.lower()
    if 'excluding' in x:
        x = x
    else:
        if ' and ' in x:
            x = x.split(' and ')
        else:
            x = x.split(',')
    return x
table['Alterations'] = table['Alterations'].apply(lambda x: process_alterations(x))
table = table.explode('Alterations')
table['Alterations'] = table['Alterations'].apply(lambda x: x.strip())
def classify_alteration(x):
    if any(i in x for i in ['fusion', 'translocation', 'rearrangement', 'insertion', 'inversion', 'duplication']):
        return 'structural_variants'
    if 'mutation' in x or 'insertion' in x:
        return 'mutation'
    if 'amplification' in x or 'deletion' in x or 'loss' in x or 'gain' in x:
        return 'copy_number_alteration'
    if 'microsatellite instability' in x:
        return 'msi'
    if len(x) == 4 or len(x) == 5 or len(x) == 6 or x[-3:] == 'del' or 'insfqea' in x or 'delinsd' in x: # e.g. E545K, V600E
        return 'protein change'
    else:
        return 'other'
table['alt_type'] = table['Alterations'].apply(classify_alteration)
print(table['Gene'].nunique(), set(table['Gene']) - set(all_genes['hugoSymbol']))
table[table.alt_type == 'other']

85 {'Other Biomarkers'}


Unnamed: 0,Level,Gene,Alterations,Cancer Types,Drugs (for therapeutic implications only),alt_type
132,1,KRAS,wildtype,Colorectal Cancer,Cetuximab,other
133,1,KRAS,wildtype,Colorectal Cancer,Cetuximab + Chemotherapy,other
134,1,KRAS,wildtype,Colorectal Cancer,Panitumumab,other
135,1,KRAS,wildtype,Colorectal Cancer,Panitumumab + Chemotherapy,other
143,1,NRAS,wildtype,Colorectal Cancer,Panitumumab,other
144,1,NRAS,wildtype,Colorectal Cancer,Panitumumab + Chemotherapy,other
260,2,BRAF,v600 (excluding v600e and v600k),Melanoma,Dabrafenib + Trametinib,other
261,2,BRAF,v600 (excluding v600e and v600k),Melanoma,Encorafenib + Binimetinib,other
262,2,BRAF,v600 (excluding v600e and v600k),Melanoma,Vemurafenib + Cobimetinib,other


---
### copy number alterations / mutations

In [4]:
cna = table[table.alt_type.isin(['copy_number_alteration', 'mutation'])][['Gene']].drop_duplicates()
print(cna.Gene.nunique(), cna.shape[0])

65 65


In [5]:
os.makedirs('raw-data/cna', exist_ok=True)
missing = []
for i, row in (cna.iterrows()):
    gene = row['Gene']
    for t in ['GAIN', 'LOSS', 'AMPLIFICATION', 'DELETION']:
        url = f'https://www.oncokb.org/api/v1/annotate/copyNumberAlterations?hugoSymbol={gene}&copyNameAlterationType={t}&referenceGenome=GRCh37'
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        if len(api_out) == 0:
            missing.append([gene, t])
            continue
        df = pd.DataFrame(api_out)
        df.to_csv(f'raw-data/cna/{gene}_{t}.csv', index=False)
from glob import glob
print(len(glob('raw-data/cna/*.csv')))
missing = pd.DataFrame(missing, columns = ['Gene', 'Type'])
missing = missing['Gene'].value_counts().reset_index(name = 'count')
missing[missing['count'] == 4]

82


Unnamed: 0,Gene,count
0,FGFR3,4
1,KIT,4
2,PIK3CA,4
3,PDGFRA,4
4,HRAS,4
5,Other Biomarkers,4
6,RAD54L,4
7,RET,4
8,MAP2K2,4
9,MAP2K1,4


In [6]:
table[table['Gene'].isin(missing[missing['count'] == 4]['Gene'])].alt_type.value_counts()

alt_type
protein change            187
mutation                  115
structural_variants        20
msi                        10
other                       9
copy_number_alteration      1
Name: count, dtype: int64

In [7]:
table[(table['alt_type'] == 'copy_number_alteration')&(table['Gene'].isin(missing[missing['count'] == 4]['Gene']))]

Unnamed: 0,Level,Gene,Alterations,Cancer Types,Drugs (for therapeutic implications only),alt_type
166,1,PDGFRA,exon 18 in-frame deletions,Gastrointestinal Stromal Tumor,Avapritinib,copy_number_alteration


---
### fusions

In [8]:
sv = table[table.alt_type.isin(['structural_variants'])][['Gene']].drop_duplicates()
print(sv.Gene.nunique(), sv.shape[0])

24 24


In [9]:
os.makedirs('raw-data/sv', exist_ok=True)
missing = []
for i, row in (sv.iterrows()):
    gene = row['Gene']
    df_all = pd.DataFrame()
    for t in ['DELETION','TRANSLOCATION','DUPLICATION','INSERTION','INVERSION','FUSION','UNKNOWN']:
        url = f'https://www.oncokb.org/api/v1/annotate/structuralVariants?hugoSymbolA={gene}&structuralVariantType={t}&referenceGenome=GRCh37&isFunctionalFusion=true'
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        df = pd.DataFrame(api_out)
        df_all = pd.concat([df_all, df], axis = 0)
        url = f'https://www.oncokb.org/api/v1/annotate/structuralVariants?hugoSymbolA={gene}&structuralVariantType={t}&referenceGenome=GRCh37&isFunctionalFusion=false'
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        df = pd.DataFrame(api_out)
        df_all = pd.concat([df_all, df], axis = 0)
        if df_all.shape[0] == 0:
            missing.append([gene, t])
    if df_all.shape[0] > 0:
        df_all.to_csv(f'raw-data/sv/{gene}.csv', index=False)
from glob import glob
print(len(glob('raw-data/sv/*.csv')))
missing = pd.DataFrame(missing, columns = ['Gene', 'Type'])
missing = missing['Gene'].value_counts().reset_index(name = 'count')
missing[missing['count'] == 7]

20


Unnamed: 0,Gene,count
0,ABL1,7
1,EGFR,7
2,ERBB2,7
3,FLT3,7
4,PDGFB,7
5,RARA,7
6,ESR1,7
7,FLI1,7


In [10]:
fusions = table[(table['Gene'].isin(missing[missing['count'] == 7]['Gene']))&
      (table.alt_type == 'structural_variants')&
      (table.Alterations.str.contains('fusion'))&
      (table.Alterations.str.contains('-'))][['Gene', 'Alterations']].drop_duplicates()
missing_fusions = []
print(fusions.Alterations.tolist())
for i, row in (fusions.iterrows()):
    gene1 = row['Alterations'].split('-')[0]
    gene2 = row['Alterations'].split('-')[1].split(' ')[0]
    df_all = pd.DataFrame()
    for t in ['DELETION','TRANSLOCATION','DUPLICATION','INSERTION','INVERSION','FUSION','UNKNOWN']:
        url = f'https://www.oncokb.org/api/v1/annotate/structuralVariants?hugoSymbolA={gene1}&hugoSymbolB={gene2}&structuralVariantType={t}&referenceGenome=GRCh37&isFunctionalFusion=true'
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        df = pd.DataFrame(api_out)
        df_all = pd.concat([df_all, df], axis = 0)
        url = f'https://www.oncokb.org/api/v1/annotate/structuralVariants?hugoSymbolA={gene1}&hugoSymbolB={gene2}&structuralVariantType={t}&referenceGenome=GRCh37&isFunctionalFusion=false'
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        df = pd.DataFrame(api_out)
        df_all = pd.concat([df_all, df], axis = 0)
    if df_all.shape[0] == 0:
        missing_fusions.append([gene1, gene2, t])
    if df_all.shape[0] > 0:
        df_all.to_csv(f'raw-data/sv/{gene1}-{gene2}.csv', index=False)
missing_fusions = pd.DataFrame(missing_fusions, columns = ['Gene1', 'Gene2', 'Type'])
missing_fusions = missing_fusions['Gene1'].value_counts().reset_index(name = 'count')
missing_fusions[missing_fusions['count'] == 7]

['bcr-abl1 fusion', 'col1a1-pdgfb fusion', 'pml-rara fusion', 'ewsr1-fli1 fusion']


Unnamed: 0,Gene1,count


In [11]:
table[(table['Gene'].isin(missing[missing['count'] == 7]['Gene']))&
      (table.alt_type == 'structural_variants')&
      (~table.Alterations.str.contains('fusion'))][['Gene', 'Alterations']].drop_duplicates()

Unnamed: 0,Gene,Alterations
75,EGFR,exon 20 in-frame insertions
83,ERBB2,exon 20 in-frame insertions
109,FLT3,internal tandem duplication
426,EGFR,exon 19 in-frame insertions
433,EGFR,kinase domain duplication


---
### protein change

In [125]:
protein_change = table[table.alt_type == 'protein change'][['Gene', 'Alterations']].drop_duplicates()
os.makedirs('raw-data/protein_change', exist_ok=True)
for i, row in tqdm(protein_change.iterrows()):
    gene = row['Gene']
    alteration = row['Alterations']
    url = f'https://www.oncokb.org/api/v1/annotate/mutations/byProteinChange?referenceGenome=GRCh37&hugoSymbol={gene}&alteration={alteration}'
    try:
        api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
        df = pd.DataFrame(api_out)
        df.to_csv(f'raw-data/protein_change/{gene}_{alteration}.csv', index=False)
    except:
        print(gene, alteration, 'error fetching, skipping')
        continue

239it [00:24,  9.62it/s]


----
### other

In [None]:
table = pd.read_csv('raw-data/oncokb_biomarker_drug_associations.tsv', sep = '\t')
table = table[table['Drugs (for therapeutic implications only)'].notna()]
table['Alterations'] = table['Alterations'].apply(lambda x: x.lower())
table[(table['Alterations'].str.contains('fusion') == False) & 
      (table['Alterations'].str.contains('amplification') == False) &
      (table['Alterations'].str.contains('oncogenic mutations') == False)].Alterations.value_counts()

Alterations
v600e                                16
g12c                                 14
t315i                                11
microsatellite instability-high      10
exon 19 in-frame deletions, l858r     9
                                     ..
a289v, r108k, t263p                   1
l747p                                 1
f1245y                                1
c1156y                                1
r1275q                                1
Name: count, Length: 112, dtype: int64

In [None]:
os.makedirs('api-data', exist_ok=True)
for i, row in tqdm(all_genes.iterrows()):
    gene = row['hugoSymbol']
    url = f'https://www.oncokb.org/api/v1/annotate/copyNumberAlterations?hugoSymbol={gene}&copyNameAlterationType=LOSS&referenceGenome=GRCh37'
    api_out = requests.get(url, headers = {'Authorization': 'Bearer 948ece93-2979-4428-bef1-755d967f1a7a'}).json()['treatments']
    if len(api_out) == 0:
        continue
    df = pd.DataFrame(api_out)
    # processing - we'll do this later
    # df['gene'] = gene
    # df['alterations'] = df['alterations'].apply(lambda x: ','.join(x))
    # df['drugs'] = df['drugs'].apply(lambda x: ','.join([i['drugName'] for i in x]))
    # df['approvedIndications'] = df['approvedIndications'].apply(lambda x: x[0] if len(x) > 0 else np.nan)
    # df['levelAssociatedCancerType'] = df['levelAssociatedCancerType'].apply(lambda x: x['name'] if x['name'] != '' else x['mainType']['name'])
    # df['levelExcludedCancerTypes'] = df['levelExcludedCancerTypes'].apply(lambda x: ','.join(x))
    # df['pmids'] = df['pmids'].apply(lambda x: ','.join(x))
    # df['abstracts'] = df['abstracts'].apply(lambda x: ','.join([i['link'] for i in x]))
    df.to_csv(f'api-data/{gene}.csv', index=False)
from glob import glob
len(glob('api-data/*.csv'))