In [1]:
import pandas as pd 
import numpy as np
import os
data_folder = 'sep-2025-release'

In [2]:
clinical_evidence = os.path.join(data_folder, '01-Sep-2025-ClinicalEvidenceSummaries.tsv')
clinical_evidence = pd.read_csv(clinical_evidence, sep='\t')
print(clinical_evidence.shape)
clinical_evidence = clinical_evidence[(clinical_evidence.therapies.notna())&
                                      (clinical_evidence.evidence_direction == 'Supports')&
                                      (clinical_evidence.significance == 'Sensitivity/Response')].reset_index(drop = True)
print(clinical_evidence.shape)
os.makedirs('civic-db', exist_ok=True)
clinical_evidence['statement_id'] = clinical_evidence['evidence_civic_url'].apply(lambda x: x.split('/')[-1])
clinical_evidence.columns

(4587, 25)
(1509, 25)


Index(['molecular_profile', 'molecular_profile_id', 'disease', 'doid',
       'phenotypes', 'therapies', 'therapy_interaction_type', 'evidence_type',
       'evidence_direction', 'evidence_level', 'significance',
       'evidence_statement', 'citation_id', 'source_type', 'asco_abstract_id',
       'citation', 'nct_ids', 'rating', 'evidence_status', 'evidence_id',
       'variant_origin', 'last_review_date', 'evidence_civic_url',
       'molecular_profile_civic_url', 'is_flagged', 'statement_id'],
      dtype='object')

In [3]:
clinical_evidence.iloc[0]

molecular_profile                                                     ARAF S214C
molecular_profile_id                                                          10
disease                                            Lung Non-small Cell Carcinoma
doid                                                                      3908.0
phenotypes                                                                   NaN
therapies                                                              Sorafenib
therapy_interaction_type                                                     NaN
evidence_type                                                         Predictive
evidence_direction                                                      Supports
evidence_level                                                                 C
significance                                                Sensitivity/Response
evidence_statement             In one patient with S214C mutation, the use of...
citation_id                 

In [6]:
clinical_evidence['evidence_level'].value_counts()

evidence_level
D    553
B    424
C    361
A    148
E     23
Name: count, dtype: int64

In [7]:
#save descriptions
statements = clinical_evidence[['statement_id', 'evidence_statement']]
print(statements.shape)
statements.to_csv('civic-db/evidence_statements.csv', index=False)
statements['evidence_statement'].iloc[0]

(1509, 2)


'In one patient with S214C mutation, the use of sorafenib has led to more than 5 years of survival and near remission.'

----

In [32]:
formatted_df = pd.DataFrame()
formatted_df['statement_id'] = clinical_evidence['statement_id']
formatted_df['approval_status'] = clinical_evidence['evidence_level']
formatted_df['publication_date'] = clinical_evidence['last_review_date']
formatted_df['approval_org'] = 'CIVIC'
formatted_df['raw_cancer'] = clinical_evidence['disease']
formatted_df['evidence_statement'] = clinical_evidence['evidence_statement']
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,publication_date,approval_org,raw_cancer,evidence_statement
0,17,C,2023-01-09 21:46:24 UTC,CIVIC,Lung Non-small Cell Carcinoma,"In one patient with S214C mutation, the use of..."


In [33]:
#format therapies
def format_drug(x):
    if '2,4' not in x:
        return [i.strip() for i in x.split(',')]
    else:
        return [x]
formatted_df['therapy'] = clinical_evidence['therapies']
formatted_df['therapy'] = formatted_df['therapy'].apply(format_drug)
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,publication_date,approval_org,raw_cancer,evidence_statement,therapy
0,17,C,2023-01-09 21:46:24 UTC,CIVIC,Lung Non-small Cell Carcinoma,"In one patient with S214C mutation, the use of...",[Sorafenib]


In [34]:
import requests
import json
oncotree = requests.get('https://oncotree.mskcc.org:443/api/tumorTypes').json()
oncotree = pd.json_normalize(oncotree)
oncotree['name'] = oncotree['name'].apply(lambda x: x.lower())
oncotree_dict = dict(zip(oncotree['name'], oncotree['mainType']))
#major cancer names
cancer_dict = {'Lung Non-small Cell Carcinoma':'Non-Small Cell Lung Cancer', 'Chronic Myeloid Leukemia':'Chronic Myelogenous Leukemia',
               'Low Grade Glioma':'Low-Grade Glioma, NOS', 'Skin Melanoma':'Melanoma',
               }
#map to standardized cancer
def map_to_standardized_cancer(x):
    if x in cancer_dict:
        x = cancer_dict[x]
        return x
    elif x.lower() in oncotree_dict:
        return x
    elif x.lower() in ['cancer', 'solid tumor', 'sarcoma']:
        return x
    elif x in oncotree['mainType'].values:
        return x
    else:
        try:
            x = x.lower().replace('cancer','').replace('carcinoma','').replace('tumor','').replace('tumour','').replace('neoplasm','').split('With')[0].strip()
            if x in oncotree_dict:
                return x
            elif oncotree[oncotree['name'].str.contains(x)].shape[0] > 0:
                if oncotree[oncotree['name'].str.contains(x)]['mainType'].nunique() == 1:
                    return x
                else:
                    return ''
            else:
                return ''
        except:
            return ''
formatted_df['standardized_cancer'] = formatted_df['raw_cancer'].apply(map_to_standardized_cancer)
formatted_df['standardized_cancer'].value_counts()

standardized_cancer
Non-Small Cell Lung Cancer       285
                                 262
Melanoma                         115
Cancer                            92
Colorectal Cancer                 89
                                ... 
Chordoma                           1
mucosal melanoma                   1
hematologic                        1
cervix                             1
Spindle Cell Rhabdomyosarcoma      1
Name: count, Length: 94, dtype: int64

In [35]:
formatted_df[formatted_df['standardized_cancer'] == ''].raw_cancer.value_counts()

raw_cancer
Her2-receptor Positive Breast Cancer                45
B-lymphoblastic Leukemia/lymphoma, BCR-ABL1–like    12
Castration-resistant Prostate Carcinoma             12
Transitional Cell Carcinoma                         12
High Grade Glioma                                   10
                                                    ..
Skin Squamous Cell Carcinoma                         1
Renal Carcinoma                                      1
Endometrioid Ovary Carcinoma                         1
Ureter Small Cell Carcinoma                          1
Brain Glioblastoma Multiforme                        1
Name: count, Length: 79, dtype: int64

In [36]:
def format_biomarker(x):
    if 'AND' in x:
        return x.split(' AND ')
    else:
        return [x]
formatted_df['biomarker'] = clinical_evidence['molecular_profile'].apply(lambda x: x.split(' OR '))
formatted_df = formatted_df.explode('biomarker')
formatted_df['biomarker'] = formatted_df['biomarker'].apply(lambda x: format_biomarker(x) if type(x) == str else [x])
formatted_df['biomarker'].value_counts()

biomarker
[BRAF V600E]                                   61
[ERBB2 Amplification]                          50
[EGFR L858R]                                   45
[v::ALK Fusion]                                37
[EGFR Exon 19 Deletion]                        27
                                               ..
[MTOR E2014K, MTOR E2419K]                      1
[SF3B1 K666N]                                   1
[EGFR T790M, EGFR Exon 19 Deletion]             1
[CDK4 R24C]                                     1
[FUS::TFCP2 Fusion, ALK Exon 2-18 Deletion]     1
Name: count, Length: 670, dtype: int64

In [37]:
formatted_df.head(1)

Unnamed: 0,statement_id,approval_status,publication_date,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker
0,17,C,2023-01-09 21:46:24 UTC,CIVIC,Lung Non-small Cell Carcinoma,"In one patient with S214C mutation, the use of...",[Sorafenib],Non-Small Cell Lung Cancer,[ARAF S214C]


In [45]:
def format_context(x):
    summary = ''
    summary += 'Approval level: ' + x['approval_status'] + '\n'
    summary += 'Database: ' + x['approval_org'] + '\n'
    summary += 'Description: ' + x['evidence_statement'] + '\n'
    summary += 'Cancer type: ' + x['raw_cancer'] + '\n'
    summary += 'Biomarkers: ' + ', '.join(x['biomarker']) if type(x['biomarker']) == list else x['biomarker'] + '\n'
    summary += 'Therapy: ' + ', '.join(x['therapy']) if type(x['therapy']) == list else x['therapy'] + '\n'
    summary += 'Approval url: https://civicdb.org/links/evidence_items/' + x['statement_id'] + '\n'
    summary += 'Updated date: ' + x['publication_date']
    return summary
formatted_df['context'] = formatted_df.apply(format_context, axis=1)

In [50]:
df = formatted_df.copy()
df.to_csv('civic-db/civic-draft.dereferenced.unique.context_db.csv', index=False)
df

Unnamed: 0,statement_id,approval_status,publication_date,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker,context
0,17,C,2023-01-09 21:46:24 UTC,CIVIC,Lung Non-small Cell Carcinoma,"In one patient with S214C mutation, the use of...",[Sorafenib],Non-Small Cell Lung Cancer,[ARAF S214C],Approval level: C\nDatabase: CIVIC\nDescriptio...
1,18,B,2023-01-09 21:46:25 UTC,CIVIC,Acute Myeloid Leukemia,Idarubicin increases the overall survival and ...,[Idarubicin],Acute Myeloid Leukemia,[DNMT3A R882],Approval level: B\nDatabase: CIVIC\nDescriptio...
2,19,B,2023-01-09 21:46:26 UTC,CIVIC,Polycythemia Vera,"In patients with JAK2 V617F, the use of pegyla...",[Peginterferon Alfa-2b],Polycythemia Vera,[JAK2 V617F],Approval level: B\nDatabase: CIVIC\nDescriptio...
3,20,D,2023-01-09 21:46:26 UTC,CIVIC,Polycythemia Vera,TG101348 effectively inhibits STAT5 signaling ...,[Fedratinib],Polycythemia Vera,[JAK2 V617F],Approval level: D\nDatabase: CIVIC\nDescriptio...
4,21,C,2023-01-09 21:46:27 UTC,CIVIC,Melanoma,Likely due to increased reliance of mutant NRA...,[Tanespimycin],Melanoma,[NRAS G13D],Approval level: C\nDatabase: CIVIC\nDescriptio...
...,...,...,...,...,...,...,...,...,...,...
1504,12371,C,2025-06-10 20:44:50 UTC,CIVIC,Lung Non-small Cell Carcinoma,A 37-year-old never smoker was diagnosed of pu...,[Gefitinib],Non-Small Cell Lung Cancer,[EGFR::RAD51 Fusion],Approval level: C\nDatabase: CIVIC\nDescriptio...
1505,12372,C,2025-06-10 21:01:24 UTC,CIVIC,Lung Adenocarcinoma,This is a case report of a 45-year-old male ne...,[Osimertinib],Lung Adenocarcinoma,[EGFR::RAD51 Fusion],Approval level: C\nDatabase: CIVIC\nDescriptio...
1506,12406,C,2025-05-20 21:42:45 UTC,CIVIC,Spindle Cell Rhabdomyosarcoma,A 31 year old male initially thought to have a...,[Alectinib],Spindle Cell Rhabdomyosarcoma,"[FUS::TFCP2 Fusion, ALK Exon 2-18 Deletion]",Approval level: C\nDatabase: CIVIC\nDescriptio...
1507,12469,B,2025-08-04 13:04:11 UTC,CIVIC,Breast Cancer,Fibroblast Growth Factor Receptor 1 (FGFR1) am...,[Erdafitinib],Breast Cancer,[FGFR1 Amplification],Approval level: B\nDatabase: CIVIC\nDescriptio...


In [47]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")

In [48]:
def calc_iqr(chunk_size_list):
    q1 = np.percentile(chunk_size_list, 25)
    q3 = np.percentile(chunk_size_list, 75)
    iqr = f'{q1}-{q3}'
    return(iqr)

struc_context_size = [len(encoding.encode(context)) for context in df['context']]
print("# chunks: "+str(len(struc_context_size)))
print("Min: "+str(np.min(struc_context_size)))
print("Max: "+str(np.max(struc_context_size)))
print("Mean: "+str(np.mean(struc_context_size)))
print("Median: "+str(np.median(struc_context_size)))
print("IQR: "+str(calc_iqr(struc_context_size)))


# chunks: 1569
Min: 83
Max: 579
Mean: 204.54811982154237
Median: 187.0
IQR: 149.0-245.0


In [52]:
#for now, as we don't have the core dataframe, we save the formatted dataframe as df
df = pd.read_csv('civic-db/civic-draft.dereferenced.unique.context_db.csv')
df.head(1)

Unnamed: 0,statement_id,approval_status,publication_date,approval_org,raw_cancer,evidence_statement,therapy,standardized_cancer,biomarker,context
0,17,C,2023-01-09 21:46:24 UTC,CIVIC,Lung Non-small Cell Carcinoma,"In one patient with S214C mutation, the use of...",['Sorafenib'],Non-Small Cell Lung Cancer,['ARAF S214C'],Approval level: C\nDatabase: CIVIC\nDescriptio...


In [54]:
standardized_to_raw_mapping = df[['statement_id', 'standardized_cancer', 'raw_cancer', 'biomarker', 'therapy']]

standardized_to_raw_mapping['modified_standard_cancer'] = standardized_to_raw_mapping['standardized_cancer']
standardized_to_raw_mapping.to_csv(f"civic-db/civic_core__2025-09.csv", index=False)
standardized_to_raw_mapping.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  standardized_to_raw_mapping['modified_standard_cancer'] = standardized_to_raw_mapping['standardized_cancer']


Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standard_cancer
0,17,Non-Small Cell Lung Cancer,Lung Non-small Cell Carcinoma,['ARAF S214C'],['Sorafenib'],Non-Small Cell Lung Cancer
