In [1]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
import ast
from bs4 import BeautifulSoup
# Ensure the directory that receives the formatted CSV exists; exist_ok=True
# makes re-running this cell harmless when the directory is already there.
os.makedirs('formatted-data', exist_ok=True)

In [2]:
# Read the processed OncoKB table. The cancer-type column is stored in the CSV
# as the text form of a Python literal, so parse it back into an object that
# later cells can index by key.
oncokb_df = (
    pd.read_csv('processed-data/oncokb_all.csv')
    .assign(
        cancer_type_raw_text=lambda frame: frame['cancer_type_raw_text'].apply(ast.literal_eval)
    )
)
oncokb_df.head(1)

Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
0,T315I,Ponatinib,,LEVEL_1,LEVEL_Fda2,B-Lymphoblastic Leukemia/Lymphoma,[],241804942319022129567798,,Ponatinib is a small molecule kinase inhibitor...,ABL1,t315i,"{'id': 95, 'code': 'BLL', 'color': 'LimeGreen'..."


In [3]:
# Distinct (gene, alterations, change) combinations present in the table.
oncokb_df.loc[:, ['gene', 'alterations', 'change']].drop_duplicates()

Unnamed: 0,gene,alterations,change
0,ABL1,T315I,t315i
3,AKT1,E17K,e17k
4,BRAF,V600E,v600e
10,BRAF,"V600E,V600K",v600e
14,BRAF,V600,v600e
...,...,...,...
608,FGFR2,Amplification,
609,KDM6A,Oncogenic Mutations,
618,SMARCB1,Oncogenic Mutations,
619,STK11,Oncogenic Mutations,


In [4]:
# Rows whose alteration text contains "Oncogenic Mutations".
oncokb_df.loc[oncokb_df['alterations'].str.contains('Oncogenic Mutations')]

Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
22,Oncogenic Mutations,Datopotamab Deruxtecan,,LEVEL_1,LEVEL_Fda2,Non-Small Cell Lung Cancer,[],3976148339250535,,Datopotamab deruxtecan (Dato-DXd) is an intrav...,EGFR,c797g,"{'id': 876, 'code': '', 'color': 'Gainsboro', ..."
40,Oncogenic Mutations,Trastuzumab Deruxtecan,,LEVEL_1,LEVEL_Fda3,Non-Small Cell Lung Cancer,[],34534430,,Trastuzumab Deruxtecan is a HER2-directed anti...,ERBB2,l768s,"{'id': 876, 'code': '', 'color': 'Gainsboro', ..."
54,Oncogenic Mutations,Imatinib,,LEVEL_1,LEVEL_Fda2,Gastrointestinal Stromal Tumor,[],28196207189554511823512212181401,,Imatinib is a small molecule inhibitor of KIT ...,KIT,p577del,"{'id': 642, 'code': 'GIST', 'color': 'LightYel..."
55,Oncogenic Mutations,Sunitinib,,LEVEL_1,LEVEL_Fda2,Gastrointestinal Stromal Tumor,[],17046465,,Sunitinib is a small molecule inhibitor of KIT...,KIT,p577del,"{'id': 642, 'code': 'GIST', 'color': 'LightYel..."
56,Oncogenic Mutations,Regorafenib,,LEVEL_1,LEVEL_Fda2,Gastrointestinal Stromal Tumor,[],23177515,,Regorafenib is a small molecule inhibitor of K...,KIT,p577del,"{'id': 642, 'code': 'GIST', 'color': 'LightYel..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,Oncogenic Mutations,AZD8186,,LEVEL_4,LEVEL_Fda3,All Solid Tumors,,2864594134281912,http://ascopubs.org/doi/abs/10.1200/JCO.2017.3...,GSK2636771 and AZD8186 are ATP-competitive sma...,PTEN,,"{'id': 984, 'code': '', 'color': '', 'name': '..."
618,Oncogenic Mutations,Tazemetostat,,LEVEL_4,LEVEL_Fda3,All Liquid Tumors,,29650362236205152095194227391784,http://www.abstractsonline.com/pp8/#!/4557/pre...,Tazemetostat is a small molecule inhibitor of ...,SMARCB1,,"{'id': 983, 'code': '', 'color': '', 'name': '..."
619,Oncogenic Mutations,"Bemcentinib,Pembrolizumab",,LEVEL_4,LEVEL_Fda3,Non-Small Cell Lung Cancer,,,https://jitc.bmj.com/content/9/Suppl_2/A632.al...,Bemcentinib is an inhibitor of the AXL recepto...,STK11,,"{'id': 876, 'code': '', 'color': 'Gainsboro', ..."
620,Oncogenic Mutations,Ceralasertib,,LEVEL_4,LEVEL_Fda3,Chronic Myelomonocytic Leukemia,,,https://journals.lww.com/hemasphere/fulltext/2...,"Ceralasertib is an orally available, small mol...",ZRSR2,,"{'id': 224, 'code': 'CMML', 'color': 'LightSal..."


In [8]:
# Output columns built here: evidence level, OncoTree main type, raw cancer
# name and standardized cancer name. The biomarker, therapy and statement
# columns are appended after format_biomarker is defined.
formatted_df = pd.DataFrame({
    'level': oncokb_df['level'],
    # Name of the main type nested inside the parsed cancer-type record.
    'oncotree': oncokb_df['cancer_type_raw_text'].apply(
        lambda cancer: cancer['mainType']['name']
    ),
    # Use the record's own name, falling back to the main type name when the
    # record's name is an empty string.
    'raw_cancer': oncokb_df['cancer_type_raw_text'].apply(
        lambda cancer: cancer['mainType']['name'] if cancer['name'] == '' else cancer['name']
    ),
})
# Standardized name = everything before the first comma of the raw name.
formatted_df['standardized_cancer'] = formatted_df['raw_cancer'].apply(
    lambda name: name.partition(',')[0]
)
def format_biomarker(x):
    """Build the biomarker label for one OncoKB record.

    Parameters
    ----------
    x : mapping (for example a DataFrame row)
        Must provide the keys 'gene', 'alterations' and 'change'.
        'change' may be missing (NaN / None / pd.NA).

    Returns
    -------
    str
        * alterations == 'Fusions'               -> "<gene> Fusion"
        * alterations is 'Amplification' or
          'Deletion'                             -> "<gene> <change>", or
                                                    "<gene> <alterations>" when
                                                    'change' is missing
        * alterations contains
          'Oncogenic Mutations'                  -> "<gene> <change>", or just
                                                    "<gene>" when 'change' is
                                                    missing
        * alterations contains ' Fusion'         -> the alteration text as-is
        * anything else                          -> "<gene> <alterations>"
    """
    gene = x['gene']
    alteration = x['alterations']
    change = x['change']
    # pd.isna covers NaN, None and pd.NA; comparing str(value) with 'nan'
    # only caught float NaN.
    change_missing = pd.isna(change)

    if alteration == 'Fusions':
        return f"{gene} Fusion"
    if alteration in ['Amplification', 'Deletion']:
        # Copy-number rows can have no 'change' value; fall back to the
        # alteration name so the label never ends in the literal text "nan".
        if change_missing:
            return f"{gene} {alteration}"
        return f"{gene} {change}"
    if 'Oncogenic Mutations' in alteration:
        if change_missing:
            return gene
        return f"{gene} {change}"
    if ' Fusion' in alteration:
        # A specific fusion already reads as a complete label.
        return alteration
    return gene + ' ' + alteration
# Compute the biomarker label row by row, then carry the drug and description
# columns over under their output names.
formatted_df['biomarker'] = oncokb_df.apply(format_biomarker, axis=1)
formatted_df['therapy'] = oncokb_df.loc[:, 'drugs']
formatted_df['statement'] = oncokb_df.loc[:, 'description']

# Write the table without its index, report its size, and preview the top rows.
formatted_df.to_csv('formatted-data/oncokb_all.csv', index=False)
print(formatted_df.shape)
formatted_df.head()

(625, 7)


Unnamed: 0,level,oncotree,raw_cancer,standardized_cancer,biomarker,therapy,statement
0,LEVEL_1,B-Lymphoblastic Leukemia/Lymphoma,B-Lymphoblastic Leukemia/Lymphoma,B-Lymphoblastic Leukemia/Lymphoma,ABL1 T315I,Ponatinib,Ponatinib is a small molecule kinase inhibitor...
1,LEVEL_1,Myeloproliferative Neoplasms,Chronic Myelogenous Leukemia,Chronic Myelogenous Leukemia,ABL1 T315I,Ponatinib,Ponatinib is a small molecule kinase inhibitor...
2,LEVEL_1,Myeloproliferative Neoplasms,Chronic Myelogenous Leukemia,Chronic Myelogenous Leukemia,ABL1 T315I,Asciminib,Asciminib is a STAMP (Specifically Targeting t...
3,LEVEL_1,Breast Cancer,Breast Cancer,Breast Cancer,AKT1 E17K,"Capivasertib,Fulvestrant","Capivasertib is an orally available, ATP-compe..."
4,LEVEL_1,Melanoma,Melanoma,Melanoma,BRAF V600E,Dabrafenib,Dabrafenib is an orally bioavailable RAF inhib...
