In [1]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
import ast
from bs4 import BeautifulSoup
# Make sure the directory that the formatted CSV is later written into exists;
# exist_ok=True keeps this from raising when the directory is already there.
os.makedirs('formatted-data', exist_ok=True)

In [2]:
# Load the processed OncoKB table from disk.
oncokb_df = pd.read_csv('processed-data/oncokb_all.csv')
# The cancer-type column is stored as the text of a Python literal; parse it
# back into an object so later cells can index into it by key.
oncokb_df['cancer_type_raw_text'] = oncokb_df['cancer_type_raw_text'].apply(ast.literal_eval)
oncokb_df.head(1)

Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
0,Oncogenic Mutations,Olaparib,,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,32343890,,Olaparib is a PARP inhibitor that is FDA-appro...,ATM,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."


In [3]:
# Show each distinct (gene, alterations, change) combination once.
oncokb_df.loc[:, ['gene', 'alterations', 'change']].drop_duplicates()

Unnamed: 0,gene,alterations,change
0,ATM,Oncogenic Mutations,LOSS
4,ATR,Oncogenic Mutations,LOSS
6,BARD1,Oncogenic Mutations,LOSS
8,BRCA1,Oncogenic Mutations,LOSS
30,BRCA2,Oncogenic Mutations,DELETION
...,...,...,...
257,PDGFRA,Oncogenic Mutations,
278,FGFR1,Oncogenic Mutations,
285,MET,Fusions,
288,NTRK3,Fusions,


In [4]:
# Keep only the rows whose alteration text contains "Oncogenic Mutations".
oncokb_df.loc[lambda frame: frame['alterations'].str.contains('Oncogenic Mutations')]

Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text
0,Oncogenic Mutations,Olaparib,,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,32343890,,Olaparib is a PARP inhibitor that is FDA-appro...,ATM,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
1,Oncogenic Mutations,Olaparib,,LEVEL_1,LEVEL_Fda2,Prostate Cancer,,32343890,,Olaparib is a PARP inhibitor that is FDA-appro...,ATM,LOSS,"{'id': 936, 'code': '', 'color': 'Cyan', 'name..."
2,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATM,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
3,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,Prostate Cancer,,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATM,LOSS,"{'id': 936, 'code': '', 'color': 'Cyan', 'name..."
4,Oncogenic Mutations,"Talazoparib,Enzalutamide",,LEVEL_1,LEVEL_Fda2,"Prostate Cancer, NOS",,37285865,,"Talazoparib, a small molecule PARP inhibitor, ...",ATR,LOSS,"{'id': 944, 'code': '', 'color': 'Cyan', 'name..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,Oncogenic Mutations,Erdafitinib,,LEVEL_4,LEVEL_Fda3,All Solid Tumors,,3108883132463741,,Fexagratinib and erdafitinib are orally availa...,FGFR2,,"{'id': 984, 'code': '', 'color': '', 'name': '..."
281,Oncogenic Mutations,Lirafugratinib,,LEVEL_4,LEVEL_Fda3,All Solid Tumors,,37270847,https://ascopubs.org/doi/abs/10.1200/JCO.2023....,"Lirafugratinib is an orally available, small m...",FGFR2,,"{'id': 984, 'code': '', 'color': '', 'name': '..."
282,Oncogenic Mutations,Fexagratinib,,LEVEL_4,LEVEL_Fda3,All Solid Tumors,,3108883132463741,,Fexagratinib and erdafitinib are orally availa...,FGFR2,,"{'id': 984, 'code': '', 'color': '', 'name': '..."
283,Oncogenic Mutations,Erdafitinib,,LEVEL_4,LEVEL_Fda3,All Solid Tumors,,3108883132463741,https://ascopubs.org/doi/10.1200/JCO.2024.42.1...,Fexagratinib and erdafitinib are orally availa...,FGFR3,,"{'id': 984, 'code': '', 'color': '', 'name': '..."


In [5]:
# Output columns built in this cell: level, oncotree, raw_cancer, standardized_cancer.
# (biomarker, therapy and statement are added further down.)
formatted_df = pd.DataFrame({
    'level': oncokb_df['level'],
    # Name of the main type recorded inside the parsed cancer-type entry.
    'oncotree': oncokb_df['cancer_type_raw_text'].apply(
        lambda entry: entry['mainType']['name']
    ),
    # Use the entry's own name, falling back to the main type name when it is empty.
    'raw_cancer': oncokb_df['cancer_type_raw_text'].apply(
        lambda entry: entry['mainType']['name'] if entry['name'] == '' else entry['name']
    ),
})
# Standardized name = everything before the first comma of the raw name.
formatted_df['standardized_cancer'] = formatted_df['raw_cancer'].apply(
    lambda cancer_name: cancer_name.partition(',')[0]
)
def format_biomarker(x):
    """Build a readable biomarker label for one record.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'gene'``, ``'alterations'`` and ``'change'``.

    Returns
    -------
    The label, chosen by the first matching rule:

    * ``alterations == 'Fusions'``             -> ``"<gene> Fusion"``
    * ``'Amplification'`` / ``'Deletion'``     -> ``"<gene> <change>"``, or
      ``"<gene> <alterations>"`` when ``change`` is missing
    * contains ``'Oncogenic Mutations'``       -> ``"<gene> <change>"``, or just
      the gene when ``change`` is missing
    * contains ``' Fusion'``                   -> the alteration text unchanged
    * anything else (including a missing alteration) -> the gene
    """
    gene = x['gene']
    alteration = x['alterations']
    change = x['change']
    # pd.isna covers NaN, None and pd.NA, so a missing change never leaks into
    # the label as the literal text "nan"/"None".
    has_change = not pd.isna(change)

    # A missing alteration comes through as a non-string (float NaN); the
    # substring checks below would raise TypeError on it, so bail out early.
    if not isinstance(alteration, str):
        return gene

    if alteration == 'Fusions':
        return f"{gene} Fusion"
    if alteration in ('Amplification', 'Deletion'):
        # Without a recorded change, the alteration name itself is the best label.
        return f"{gene} {change}" if has_change else f"{gene} {alteration}"
    if 'Oncogenic Mutations' in alteration:
        return f"{gene} {change}" if has_change else gene
    if ' Fusion' in alteration:
        # Already a specific fusion description; keep it verbatim.
        return alteration
    return gene
# Compute the biomarker label for every source row (axis=1 hands each row to the function).
formatted_df['biomarker'] = oncokb_df.apply(format_biomarker, axis=1)
# Carry the drug list and the free-text description over under their output names.
formatted_df['therapy'] = oncokb_df['drugs']
formatted_df['statement'] = oncokb_df['description']
# Write the result without the index column, then preview the first rows.
formatted_df.to_csv('formatted-data/oncokb_all.csv', index=False)
formatted_df.head()

Unnamed: 0,level,oncotree,raw_cancer,standardized_cancer,biomarker,therapy,statement
0,LEVEL_1,"Prostate Cancer, NOS","Prostate Cancer, NOS",Prostate Cancer,ATM LOSS,Olaparib,Olaparib is a PARP inhibitor that is FDA-appro...
1,LEVEL_1,Prostate Cancer,Prostate Cancer,Prostate Cancer,ATM LOSS,Olaparib,Olaparib is a PARP inhibitor that is FDA-appro...
2,LEVEL_1,"Prostate Cancer, NOS","Prostate Cancer, NOS",Prostate Cancer,ATM LOSS,"Talazoparib,Enzalutamide","Talazoparib, a small molecule PARP inhibitor, ..."
3,LEVEL_1,Prostate Cancer,Prostate Cancer,Prostate Cancer,ATM LOSS,"Talazoparib,Enzalutamide","Talazoparib, a small molecule PARP inhibitor, ..."
4,LEVEL_1,"Prostate Cancer, NOS","Prostate Cancer, NOS",Prostate Cancer,ATR LOSS,"Talazoparib,Enzalutamide","Talazoparib, a small molecule PARP inhibitor, ..."
