In [None]:
import pandas as pd 
import numpy as np
import os,
data_folder = 'sep-2025-release'

In [105]:
clinical_evidence = os.path.join(data_folder, '01-Sep-2025-ClinicalEvidenceSummaries.tsv')
clinical_evidence = pd.read_csv(clinical_evidence, sep='\t')
clinical_evidence = clinical_evidence[(clinical_evidence.therapies.notna())].reset_index(drop = True)
os.makedirs('civic-db', exist_ok=True)
clinical_evidence[['evidence_statement', 'evidence_civic_url']].to_csv('civic-db/evidence_statements.csv', index=False)
clinical_evidence.columns

Index(['molecular_profile', 'molecular_profile_id', 'disease', 'doid',
       'phenotypes', 'therapies', 'therapy_interaction_type', 'evidence_type',
       'evidence_direction', 'evidence_level', 'significance',
       'evidence_statement', 'citation_id', 'source_type', 'asco_abstract_id',
       'citation', 'nct_ids', 'rating', 'evidence_status', 'evidence_id',
       'variant_origin', 'last_review_date', 'evidence_civic_url',
       'molecular_profile_civic_url', 'is_flagged'],
      dtype='object')

In [3]:
import requests
import json
oncotree = requests.get('https://oncotree.mskcc.org:443/api/tumorTypes').json()
oncotree = pd.json_normalize(oncotree)
oncotree['name'] = oncotree['name'].apply(lambda x: x.lower())
oncotree_dict = dict(zip(oncotree['name'], oncotree['mainType']))

In [24]:
formatted_df = pd.DataFrame()
formatted_df['level'] = clinical_evidence['evidence_level']

#major cancer names
cancer_dict = {'Lung Non-small Cell Carcinoma':'Non-Small Cell Lung Cancer', 'Chronic Myeloid Leukemia':'Chronic Myelogenous Leukemia',
               'Low Grade Glioma':'Low-Grade Glioma, NOS'}
formatted_df['raw_cancer'] = clinical_evidence['disease'].apply(lambda x: cancer_dict[x] if x in cancer_dict else x)

#map to oncotree
def oncotree_map(x):
    if x.lower() in oncotree_dict:
        return oncotree_dict[x.lower()]
    elif x in oncotree['mainType'].values:
        return x
    else:
        try:
            x = x.lower().replace('cancer','').replace('carcinoma','').replace('tumor','').replace('tumour','').replace('neoplasm','').split('With')[0].strip()
            if x in oncotree_dict:
                return oncotree_dict[x]
            elif oncotree[oncotree['name'].str.contains(x)].shape[0] > 0:
                if oncotree[oncotree['name'].str.contains(x)]['mainType'].nunique() == 1:
                    return oncotree[oncotree['name'].str.contains(x)]['mainType'].values[0]
                else:
                    return 'NA'
            else:
                return 'NA'
        except:
            return 'NA'
formatted_df['oncotree'] = formatted_df['raw_cancer'].apply(oncotree_map)

#map to standardized cancer
def map_to_standardized_cancer(x):
    # Implement your mapping logic here
    if x.lower() in oncotree_dict:
        return x
    elif x in oncotree['mainType'].values:
        return x
    else:
        try:
            x = x.lower().replace('cancer','').replace('carcinoma','').replace('tumor','').replace('tumour','').replace('neoplasm','').split('With')[0].strip()
            if x in oncotree_dict:
                return x
            elif oncotree[oncotree['name'].str.contains(x)].shape[0] > 0:
                if oncotree[oncotree['name'].str.contains(x)]['mainType'].nunique() == 1:
                    return x
                else:
                    return 'NA'
            else:
                return 'NA'
        except:
            return 'NA'
formatted_df['standardized_cancer'] = formatted_df['raw_cancer'].apply(map_to_standardized_cancer)
formatted_df['biomarker'] = clinical_evidence['molecular_profile']
formatted_df['therapy'] = clinical_evidence['therapies']
formatted_df['statement']  = clinical_evidence['evidence_statement']
formatted_df['source'] = clinical_evidence['evidence_civic_url']
formatted_df

Unnamed: 0,level,raw_cancer,oncotree,standardized_cancer,biomarker,therapy,statement,source
0,B,Acute Myeloid Leukemia,Leukemia,Acute Myeloid Leukemia,DNMT3A R882,Daunorubicin,Daunorubicin treatment resulted in similar ove...,https://civicdb.org/links/evidence_items/11
1,D,Melanoma,Melanoma,Melanoma,MAP2K1 P124S,Selumetinib,A375 cells expressing MAP2K1 P124S mutation co...,https://civicdb.org/links/evidence_items/12
2,D,Melanoma,Melanoma,Melanoma,MAP2K1 Q56P,Selumetinib,A375 cells expressing MAP2K1 Q56P mutation con...,https://civicdb.org/links/evidence_items/13
3,B,Melanoma,Melanoma,Melanoma,NRAS Q61,Vemurafenib,Vemurafenib resistance is associated with gain...,https://civicdb.org/links/evidence_items/14
4,B,Gastrointestinal Stromal Tumor,Gastrointestinal Stromal Tumor,Gastrointestinal Stromal Tumor,PDGFRA D842V,Imatinib,GIST cancer with D842V mutation is resistant t...,https://civicdb.org/links/evidence_items/15
...,...,...,...,...,...,...,...,...
2709,B,Prostate Cancer,Prostate Cancer,Prostate Cancer,AR AR-V7,"Abiraterone,Enzalutamide",A cross-sectional cohort study was performed i...,https://civicdb.org/links/evidence_items/12463
2710,B,Prostate Cancer,Prostate Cancer,Prostate Cancer,AR AR-V7,"Abiraterone,Enzalutamide","A blinded, multi-institutional cohort study wa...",https://civicdb.org/links/evidence_items/12464
2711,B,Breast Cancer,Breast Cancer,Breast Cancer,FGFR1 Amplification,Erdafitinib,Fibroblast Growth Factor Receptor 1 (FGFR1) am...,https://civicdb.org/links/evidence_items/12469
2712,B,Breast Cancer,Breast Cancer,Breast Cancer,FGFR1 Amplification,Dovitinib,"Dovitinib, a pan-FGFR small molecular inhibito...",https://civicdb.org/links/evidence_items/12478


In [89]:
def check_gene_name_exist(gene):
    url = f'https://clinicaltables.nlm.nih.gov/api/ncbi_genes/v3/search?terms="{gene}"&df=Symbol&count=500'
    response = requests.get(url).json()[3]
    for item in response:
        if item[0].upper() == gene.upper():
            return True
    return False

In [102]:
def format_gene(x):
    x = x.lower()
    #if the number of spaces is 1, it's standard so extract gene and alteration
    if ' and ' in x or ' or ' in x:
        return 'Combination'
    elif x.count(' ') == 1:
        gene = x.split(' ')[0]
        return gene
    elif ' and ' in x or ' or ' in x:
        return 'Combination'
    elif 'exon' in x:
        return x.split(' exon')[0]
    else:
        if check_gene_name_exist(x.split(' ')[0]):
            return x.split(' ')[0]
        else:
            return '-'
def format_alteration(x):
    x = x.lower()
    #if the number of spaces is 1, it's standard so extract gene and alteration
    if x.count(' ') == 1:
        alteration = x.split(' ')[1]
        return alteration
    elif ' and ' in x or ' or ' in x:
        return 'Combination'
    elif 'exon' in x:
        return 'exon '+ x.split(' exon')[1].strip()
    else:
        if check_gene_name_exist(x.split(' ')[0]):
            return x.split(' ')[1]
        else:
            return '-'
formatted_df['gene'] = formatted_df['biomarker'].apply(lambda x: pd.Series(format_gene(x)))
formatted_df['alteration'] = formatted_df['biomarker'].apply(lambda x: pd.Series(format_alteration(x)))
formatted_df['gene'].value_counts()

gene
Combination    463
egfr           221
braf           168
kras           162
erbb2          138
              ... 
tfg::ros1        1
calr             1
raf1             1
v::pdgfrb        1
etv6::runx1      1
Name: count, Length: 345, dtype: int64

In [104]:
formatted_df['alteration'].value_counts().reset_index()

Unnamed: 0,alteration,count
0,Combination,463
1,mutation,265
2,fusion,258
3,amplification,149
4,expression,105
...,...,...
526,i1145i,1
527,n822h,1
528,f506_f508dup,1
529,t417_d419delinsy,1


In [92]:
url = 'https://clinicaltables.nlm.nih.gov/api/ncbi_genes/v3/search?terms=CDKN2A&df=Symbol&count=500'
requests.get(url).json()

[11,
 ['HGNC:24325',
  'HGNC:23831',
  'HGNC:39854',
  'HGNC:34341',
  'HGNC:30545',
  'HGNC:39855',
  'HGNC:39853',
  'HGNC:978',
  'HGNC:1787',
  'HGNC:8591',
  'HGNC:25560'],
 None,
 [['CDKN2AIP'],
  ['CDKN2A-AS1'],
  ['CDKN2AIPNLP1'],
  ['CDKN2B-AS1'],
  ['CDKN2AIPNL'],
  ['CDKN2AIPNLP2'],
  ['CDKN2AIPNLP3'],
  ['BCCIP'],
  ['CDKN2A'],
  ['PAK2'],
  ['RPRD1A']],
 ['HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb',
  'HGNC-Symb']]

In [101]:
formatted_df[(formatted_df['biomarker'].str.contains('NOT KIT D816V'))]#['statement'].iloc[0]

Unnamed: 0,level,raw_cancer,oncotree,standardized_cancer,biomarker,therapy,statement,source,gene,alteration
2567,B,Systemic Mastocytosis,Mastocytosis,Systemic Mastocytosis,NOT KIT D816V,Imatinib Mesylate,"n this open-label phase II study, 20 patients ...",https://civicdb.org/links/evidence_items/11158,-,Other


In [94]:
formatted_df[formatted_df['gene'] == '-']['biomarker'].value_counts()

biomarker
NOT KIT D816V    1
Name: count, dtype: int64

In [95]:
formatted_df[formatted_df.oncotree == 'NA'].level.value_counts()

level
D    325
B    159
C    139
A     48
E     10
Name: count, dtype: int64

In [27]:
formatted_df[(formatted_df.level == 'B')&(formatted_df.oncotree == 'NA')].raw_cancer.value_counts()

raw_cancer
Her2-receptor Positive Breast Cancer                                                               35
Cancer                                                                                             32
Skin Melanoma                                                                                      14
Solid Tumor                                                                                         7
Multiple Myeloma                                                                                    6
Bladder Carcinoma                                                                                   5
Acute Promyelocytic Leukemia                                                                        4
Ovarian Carcinoma                                                                                   4
Stomach Carcinoma                                                                                   4
Triple-receptor Negative Breast Cancer                                 

In [6]:
formatted_df.level.value_counts()

level
D    1056
B     750
C     725
A     157
E      26
Name: count, dtype: int64

In [7]:
formatted_df[formatted_df.oncotree == 'NA'].raw_cancer.value_counts()

raw_cancer
Lung Non-small Cell Carcinoma                        394
Chronic Myeloid Leukemia                             308
Cancer                                               205
Her2-receptor Positive Breast Cancer                  85
Skin Melanoma                                         53
                                                    ... 
Rectum Cancer                                          1
Plexiform Neurofibroma                                 1
Cecum Adenocarcinoma                                   1
Endometrioid Ovary Carcinoma                           1
B-lymphoblastic Leukemia/lymphoma With ETV6-RUNX1      1
Name: count, Length: 95, dtype: int64

In [12]:
formatted_df[formatted_df.oncotree == 'NA'].raw_cancer.unique()

array(['Lung Non-small Cell Carcinoma', 'Cancer', 'Bladder Carcinoma',
       'Chronic Myeloid Leukemia',
       'T-cell Lymphoblastic Leukemia/lymphoma',
       'Estrogen-receptor Positive Breast Cancer',
       'Malignant Anus Melanoma', 'Acute Promyelocytic Leukemia',
       'Invasive Bladder Transitional Cell Carcinoma',
       'Acute Lymphoblastic Leukemia', 'Colon Carcinoma',
       'Esophagus Adenocarcinoma', 'Her2-receptor Positive Breast Cancer',
       'Ovarian Serous Carcinoma', 'T-cell Acute Lymphoblastic Leukemia',
       'Ovarian Carcinoma', 'Colon Cancer', 'Epithelial Ovarian Cancer',
       'Stomach Cancer', 'Esophagus Squamous Cell Carcinoma',
       'Malignant Pleural Mesothelioma', 'Ewing Sarcoma Of Bone',
       'Pancreatic Ductal Adenocarcinoma',
       'Uterine Corpus Endometrial Carcinoma', 'Sarcoma',
       'Malignant Mesothelioma', 'Bile Duct Adenocarcinoma',
       'Skin Melanoma', 'Brain Glioma', 'Renal Carcinoma',
       'Lung Small Cell Carcinoma', 'Skin Sq

In [8]:
clinical_evidence[['disease', 'therapies', 'evidence_statement', 'molecular_profile', 'evidence_civic_url', 'molecular_profile_civic_url']]

Unnamed: 0,disease,therapies,evidence_statement,molecular_profile,evidence_civic_url,molecular_profile_civic_url
0,Acute Myeloid Leukemia,Daunorubicin,Daunorubicin treatment resulted in similar ove...,DNMT3A R882,https://civicdb.org/links/evidence_items/11,https://civicdb.org/links/molecular_profiles/32
1,Melanoma,Selumetinib,A375 cells expressing MAP2K1 P124S mutation co...,MAP2K1 P124S,https://civicdb.org/links/evidence_items/12,https://civicdb.org/links/molecular_profiles/82
2,Melanoma,Selumetinib,A375 cells expressing MAP2K1 Q56P mutation con...,MAP2K1 Q56P,https://civicdb.org/links/evidence_items/13,https://civicdb.org/links/molecular_profiles/83
3,Melanoma,Vemurafenib,Vemurafenib resistance is associated with gain...,NRAS Q61,https://civicdb.org/links/evidence_items/14,https://civicdb.org/links/molecular_profiles/94
4,Gastrointestinal Stromal Tumor,Imatinib,GIST cancer with D842V mutation is resistant t...,PDGFRA D842V,https://civicdb.org/links/evidence_items/15,https://civicdb.org/links/molecular_profiles/99
...,...,...,...,...,...,...
2709,Prostate Cancer,"Abiraterone,Enzalutamide",A cross-sectional cohort study was performed i...,AR AR-V7,https://civicdb.org/links/evidence_items/12463,https://civicdb.org/links/molecular_profiles/358
2710,Prostate Cancer,"Abiraterone,Enzalutamide","A blinded, multi-institutional cohort study wa...",AR AR-V7,https://civicdb.org/links/evidence_items/12464,https://civicdb.org/links/molecular_profiles/358
2711,Breast Cancer,Erdafitinib,Fibroblast Growth Factor Receptor 1 (FGFR1) am...,FGFR1 Amplification,https://civicdb.org/links/evidence_items/12469,https://civicdb.org/links/molecular_profiles/263
2712,Breast Cancer,Dovitinib,"Dovitinib, a pan-FGFR small molecular inhibito...",FGFR1 Amplification,https://civicdb.org/links/evidence_items/12478,https://civicdb.org/links/molecular_profiles/263


In [10]:
clinical_evidence[clinical_evidence.evidence_level == 'A'].columns

Index(['molecular_profile', 'molecular_profile_id', 'disease', 'doid',
       'phenotypes', 'therapies', 'therapy_interaction_type', 'evidence_type',
       'evidence_direction', 'evidence_level', 'significance',
       'evidence_statement', 'citation_id', 'source_type', 'asco_abstract_id',
       'citation', 'nct_ids', 'rating', 'evidence_status', 'evidence_id',
       'variant_origin', 'last_review_date', 'evidence_civic_url',
       'molecular_profile_civic_url', 'is_flagged'],
      dtype='object')