In [1]:
import pandas as pd 
import numpy as np
import os
from tqdm import tqdm
from glob import glob
import requests
import ast
from bs4 import BeautifulSoup
os.makedirs('formatted-data', exist_ok=True)

In [2]:
oncokb_df = pd.read_csv('processed-data/oncokb_all.csv')
oncokb_df['statement_id'] = oncokb_df.index
oncokb_df.head(1)

Unnamed: 0,alterations,drugs,approvedIndications,level,fdaLevel,levelAssociatedCancerType,levelExcludedCancerTypes,pmids,abstracts,description,gene,change,cancer_type_raw_text,statement_id
0,T315I,Ponatinib,,LEVEL_1,LEVEL_Fda2,B-Lymphoblastic Leukemia/Lymphoma,[],241804942319022129567798,,Ponatinib is a small molecule kinase inhibitor...,ABL1,t315i,"{'id': 95, 'code': 'BLL', 'color': 'LimeGreen'...",0


In [3]:
oncokb_df['level'].value_counts()

level
LEVEL_1     221
LEVEL_2     140
LEVEL_3A    104
LEVEL_4      71
LEVEL_R2     61
LEVEL_R1     28
Name: count, dtype: int64

In [4]:
#save descriptions
statements = oncokb_df[['statement_id', 'description']]
print(statements.shape)
os.makedirs('oncokb-db', exist_ok=True)
statements.to_csv('oncokb-db/evidence_statements.csv', index=False)
statements['description'].iloc[0]

(625, 2)


'Ponatinib is a small molecule kinase inhibitor of the BCR-ABL1 fusion that is FDA-approved for adult patients with Philadelphia chromosome-positive acute lymphoblastic leukemia (Ph+ALL) that is resistant or intolerant to prior tyrosine kinase inhibitor therapy, as well as those harboring the BCR-ABL1 T315I mutation. FDA-approval was based on the results of the PACE study, a single-arm, open-label, international, multicenter trial (NCT01207440) of ponatinib in 32 eligible patients with T315I-positive Ph+ALL in which the major (MaHR) and complete hematologic responses achieved by six months were 36% and 32%, respectively (PMID: 24180494). In the five-year follow-up of the PACE study, major and complete cytogenetic responses were achieved by 41% and 32% of T315I-positive Ph+ALL patients, respectively (PMID: 29567798). (PMID: 23190221)'

----

In [5]:
formatted_df = pd.DataFrame()
formatted_df['statement_id'] = oncokb_df['statement_id']
formatted_df['approval_status'] = oncokb_df['level']
formatted_df['approval_org'] = 'OncoKB'
formatted_df['raw_cancer'] = oncokb_df['levelAssociatedCancerType']
formatted_df['evidence_statement'] = oncokb_df['description']
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,approval_org,raw_cancer,evidence_statement
0,0,LEVEL_1,OncoKB,B-Lymphoblastic Leukemia/Lymphoma,Ponatinib is a small molecule kinase inhibitor...


In [6]:
#format therapies
def format_drug(x):
    if '2,4' not in x:
        return [i.strip() for i in x.split(' + ')]
    else:
        return [x]
formatted_df['therapy'] = oncokb_df['drugs']
formatted_df['therapy'] = formatted_df['therapy'].apply(format_drug)
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,approval_org,raw_cancer,evidence_statement,therapy
0,0,LEVEL_1,OncoKB,B-Lymphoblastic Leukemia/Lymphoma,Ponatinib is a small molecule kinase inhibitor...,[Ponatinib]


In [7]:
# import requests
# import json
# oncotree = requests.get('https://oncotree.mskcc.org:443/api/tumorTypes').json()
# oncotree = pd.json_normalize(oncotree)
# oncotree['name'] = oncotree['name'].apply(lambda x: x.lower())
# oncotree_dict = dict(zip(oncotree['name'], oncotree['mainType']))
# #major cancer names
# cancer_dict = {'Lung Non-small Cell Carcinoma':'Non-Small Cell Lung Cancer', 'Chronic Myeloid Leukemia':'Chronic Myelogenous Leukemia',
#                'Low Grade Glioma':'Low-Grade Glioma, NOS', 'Skin Melanoma':'Melanoma',
#                }
# #map to standardized cancer
# def map_to_standardized_cancer(x):
#     if x in cancer_dict:
#         x = cancer_dict[x]
#         return x
#     elif x.lower() in oncotree_dict:
#         return x
#     elif x.lower() in ['cancer', 'solid tumor', 'sarcoma']:
#         return x
#     elif x in oncotree['mainType'].values:
#         return x
#     else:
#         try:
#             x = x.lower().replace('cancer','').replace('carcinoma','').replace('tumor','').replace('tumour','').replace('neoplasm','').split('With')[0].strip()
#             if x in oncotree_dict:
#                 return x
#             elif oncotree[oncotree['name'].str.contains(x)].shape[0] > 0:
#                 if oncotree[oncotree['name'].str.contains(x)]['mainType'].nunique() == 1:
#                     return x
#                 else:
#                     return ''
#             else:
#                 return ''
#         except:
#             return ''
formatted_df['standardized_cancer'] = oncokb_df['levelAssociatedCancerType']
formatted_df['standardized_cancer'].value_counts()

standardized_cancer
Non-Small Cell Lung Cancer                                         133
All Solid Tumors                                                    60
Colorectal Cancer                                                   34
Gastrointestinal Stromal Tumor                                      32
Prostate Cancer                                                     31
                                                                  ... 
Gastrointestinal Neuroendocrine Tumors of the Esophagus/Stomach      1
Acute Leukemias of Ambiguous Lineage                                 1
Dermatofibrosarcoma Protuberans                                      1
Pancreatic Cancer                                                    1
Chronic Myelomonocytic Leukemia                                      1
Name: count, Length: 78, dtype: int64

In [8]:
formatted_df[formatted_df['standardized_cancer'] == ''].raw_cancer.value_counts()

Series([], Name: count, dtype: int64)

In [9]:
formatted_df['biomarker'] = oncokb_df['alterations']

In [10]:
# def format_biomarker(x):
#     if 'AND' in x:
#         return x.split(' AND ')
#     else:
#         return [x]
formatted_df['biomarker'] = oncokb_df['alterations'].apply(lambda x: [x])#.apply(lambda x: x.split(' OR '))
formatted_df['approvedIndications'] = oncokb_df['approvedIndications']
# formatted_df = formatted_df.explode('biomarker')
# formatted_df['biomarker'] = formatted_df['biomarker'].apply(lambda x: format_biomarker(x) if type(x) == str else [x])
formatted_df['biomarker'].value_counts()

biomarker
[Oncogenic Mutations]                                                                   195
[Amplification]                                                                          63
[Fusions]                                                                                56
[V600E]                                                                                  19
[G12C]                                                                                   17
                                                                                       ... 
[G696A]                                                                                   1
[F617L]                                                                                   1
[G1269A]                                                                                  1
[C1156Y]                                                                                  1
[Oncogenic Mutations (excluding Y646S, Y646H, Y646C, Y646F, Y646N, A68

In [11]:
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker,approvedIndications
0,0,LEVEL_1,OncoKB,B-Lymphoblastic Leukemia/Lymphoma,Ponatinib is a small molecule kinase inhibitor...,[Ponatinib],B-Lymphoblastic Leukemia/Lymphoma,[T315I],


In [12]:
def format_context(x):
    summary = ''
    if str(x['approvedIndications']) != 'nan':
        summary += 'Approved indication: ' + x['approvedIndications'] + '\n'
    summary += 'Approval level: ' + x['approval_status'] + '\n'
    summary += 'Database: ' + x['approval_org'] + '\n'
    summary += 'Description: ' + x['evidence_statement'] + '\n'
    summary += 'Cancer type: ' + x['raw_cancer'] + '\n'
    summary += 'Biomarkers: ' + ', '.join(x['biomarker']) + '\n' if type(x['biomarker']) == list else x['biomarker'] + '\n'
    summary += 'Therapy: ' + ', '.join(x['therapy']) + '\n' if type(x['therapy']) == list else x['therapy'] + '\n'
    # summary += 'Approval url: https://civicdb.org/links/evidence_items/' + x['statement_id'] + '\n'
    # summary += 'Updated date: ' + x['publication_date']
    return summary
formatted_df['context'] = formatted_df.apply(format_context, axis=1)

In [13]:
df = formatted_df.copy()
df.to_csv('oncokb-db/oncokb-draft.dereferenced.unique.context_db.csv', index=False)
df

Unnamed: 0,statement_id,approval_status,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker,approvedIndications,context
0,0,LEVEL_1,OncoKB,B-Lymphoblastic Leukemia/Lymphoma,Ponatinib is a small molecule kinase inhibitor...,[Ponatinib],B-Lymphoblastic Leukemia/Lymphoma,[T315I],,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...
1,1,LEVEL_1,OncoKB,Chronic Myelogenous Leukemia,Ponatinib is a small molecule kinase inhibitor...,[Ponatinib],Chronic Myelogenous Leukemia,[T315I],,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...
2,2,LEVEL_1,OncoKB,Chronic Myelogenous Leukemia,Asciminib is a STAMP (Specifically Targeting t...,[Asciminib],Chronic Myelogenous Leukemia,[T315I],,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...
3,3,LEVEL_1,OncoKB,Breast Cancer,"Capivasertib is an orally available, ATP-compe...","[Capivasertib,Fulvestrant]",Breast Cancer,[E17K],,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...
4,4,LEVEL_1,OncoKB,Melanoma,Dabrafenib is an orally bioavailable RAF inhib...,[Dabrafenib],Melanoma,[V600E],Dabrafenib is FDA-approved for BRAF V600E muta...,Approved indication: Dabrafenib is FDA-approve...
...,...,...,...,...,...,...,...,...,...,...
620,620,LEVEL_4,OncoKB,Chronic Myelomonocytic Leukemia,"Ceralasertib is an orally available, small mol...",[Ceralasertib],Chronic Myelomonocytic Leukemia,[Oncogenic Mutations],,Approval level: LEVEL_4\nDatabase: OncoKB\nDes...
621,621,LEVEL_4,OncoKB,Myelodysplastic Syndromes,"Ceralasertib is an orally available, small mol...",[Ceralasertib],Myelodysplastic Syndromes,[Oncogenic Mutations],,Approval level: LEVEL_4\nDatabase: OncoKB\nDes...
622,622,LEVEL_R2,OncoKB,Non-Small Cell Lung Cancer,Erlotinib and gefitinib are first-generation s...,[Erlotinib],Non-Small Cell Lung Cancer,[Amplification],,Approval level: LEVEL_R2\nDatabase: OncoKB\nDe...
623,623,LEVEL_R2,OncoKB,Non-Small Cell Lung Cancer,Erlotinib and gefitinib are first-generation s...,[Gefitinib],Non-Small Cell Lung Cancer,[Amplification],,Approval level: LEVEL_R2\nDatabase: OncoKB\nDe...


In [14]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")

In [15]:
def calc_iqr(chunk_size_list):
    q1 = np.percentile(chunk_size_list, 25)
    q3 = np.percentile(chunk_size_list, 75)
    iqr = f'{q1}-{q3}'
    return(iqr)

struc_context_size = [len(encoding.encode(context)) for context in df['context']]
print("# chunks: "+str(len(struc_context_size)))
print("Min: "+str(np.min(struc_context_size)))
print("Max: "+str(np.max(struc_context_size)))
print("Mean: "+str(np.mean(struc_context_size)))
print("Median: "+str(np.median(struc_context_size)))
print("IQR: "+str(calc_iqr(struc_context_size)))


# chunks: 625
Min: 135
Max: 1711
Mean: 388.0192
Median: 344.0
IQR: 259.0-486.0


In [16]:
#for now, as we don't have the core dataframe, we save the formatted dataframe as df
df = pd.read_csv('oncokb-db/oncokb-draft.dereferenced.unique.context_db.csv')
df.head(1)

Unnamed: 0,statement_id,approval_status,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker,approvedIndications,context
0,0,LEVEL_1,OncoKB,B-Lymphoblastic Leukemia/Lymphoma,Ponatinib is a small molecule kinase inhibitor...,['Ponatinib'],B-Lymphoblastic Leukemia/Lymphoma,['T315I'],,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...


In [17]:
standardized_to_raw_mapping = df[['statement_id', 'standardized_cancer', 'raw_cancer', 'biomarker', 'therapy']]

standardized_to_raw_mapping['modified_standard_cancer'] = standardized_to_raw_mapping['standardized_cancer']
standardized_to_raw_mapping.to_csv(f"oncokb-db/oncokb_core__2025-09.csv", index=False)
standardized_to_raw_mapping.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  standardized_to_raw_mapping['modified_standard_cancer'] = standardized_to_raw_mapping['standardized_cancer']


Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standard_cancer
0,0,B-Lymphoblastic Leukemia/Lymphoma,B-Lymphoblastic Leukemia/Lymphoma,['T315I'],['Ponatinib'],B-Lymphoblastic Leukemia/Lymphoma
