In [23]:
import pandas as pd
import numpy as np
import os
import ast
import warnings
warnings.filterwarnings('ignore')

In [24]:
patient_data_path = 'msk_chord_2024/data_clinical_patient.txt'
sample_data_path = 'msk_chord_2024/data_clinical_sample.txt'
snv_data_path = 'msk_chord_2024/data_mutations.txt'
cna_data_path = 'msk_chord_2024/data_cna.txt'
sv_data_path = 'msk_chord_2024/data_sv.txt'

In [25]:
patient = pd.read_csv('msk_chord_2024/data_clinical_patient.txt', sep='\t', comment='#')
treatment = pd.read_csv('msk_chord_2024/data_timeline_treatment.txt', sep='\t', comment='#')
sample = pd.read_csv('msk_chord_2024/data_clinical_sample.txt', sep='\t', comment='#')

---
aim : select X of the most clinically / genomically "diverse" samples from the patients selected in 1-1. \
reason : to alleviate the cost of this validation. we assume that a substantial portion of these patients present with relatively "similar" clinical / genomic phenotypes, in terms of treatments/genomic variants detected/clinical history.

In [33]:
cohort = pd.read_csv('reports/filtered-data/first_line_treatments_post_msk.csv')
print(cohort.shape)
cohort['FIRST_LINE_TREATMENT'] = cohort['FIRST_LINE_TREATMENT'].apply(lambda x: ast.literal_eval(x))
cohort['FIRST_LINE_TREATMENT_TYPE'] = cohort['FIRST_LINE_TREATMENT_TYPE'].apply(lambda x: ast.literal_eval(x))
cohort = cohort[cohort.FIRST_LINE_TIMING <= 365]
print(cohort.shape)
#filter out investigational first-line patients
cohort = cohort[cohort.INVESTIGATIONAL_THERAPY_GIVEN == False]
#select for cancer type
cancer_type = dict(zip(sample.PATIENT_ID, sample.CANCER_TYPE))
cohort['CANCER_TYPE'] = cohort.PATIENT_ID.map(cancer_type)
#select for single / multi therapy
cohort['TREATMENT_NUM'] = cohort.FIRST_LINE_TREATMENT.apply(lambda x: len(list(x)))
cohort['FIRST_LINE_TREATMENT'] = cohort.FIRST_LINE_TREATMENT.apply(lambda x: ','.join(x) if len(list(x)) > 1 else list(x)[0])
print(cohort.shape, cohort.PATIENT_ID.nunique())
cohort.head(1)

(4202, 6)
(3681, 6)
(3321, 8) 3321


Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN,CANCER_TYPE,TREATMENT_NUM
0,P-0000036,22,CRIZOTINIB,{Targeted},True,False,Non-Small Cell Lung Cancer,1


In [34]:
pd.crosstab(cohort.CANCER_TYPE, cohort.TARGETED_THERAPY_GIVEN)

TARGETED_THERAPY_GIVEN,False,True
CANCER_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
Breast Cancer,750,593
Colorectal Cancer,259,62
Non-Small Cell Lung Cancer,534,786
Pancreatic Cancer,93,46
Prostate Cancer,167,31


----

In [35]:
#flag the patient that have not seen any treatment before MSK-IMPACT
pretreated_patients = treatment[(treatment.PATIENT_ID.isin(cohort.PATIENT_ID))&
                                (treatment.START_DATE <= 0)].PATIENT_ID.unique()
cohort['PRETREATED'] = cohort.PATIENT_ID.isin(pretreated_patients)

#flag any patients with prior surgery
surgery = pd.read_csv('msk_chord_2024/data_timeline_surgery.txt', sep='\t', comment='#')
prior_surgery_patients = surgery[(surgery.PATIENT_ID.isin(cohort.PATIENT_ID))&
                  (surgery.SUBTYPE == 'PROCEDURE')&
                  (surgery.START_DATE <= 0)
                  ].PATIENT_ID.unique()
cohort['PRIOR_SURGERY'] = cohort.PATIENT_ID.isin(prior_surgery_patients)
#flag any patients with prior radiation
radiation = pd.read_csv('msk_chord_2024/data_timeline_radiation.txt', sep='\t', comment='#')
prior_radiation_patients = radiation[(radiation.PATIENT_ID.isin(cohort.PATIENT_ID))&
                  (radiation.START_DATE <= 0)
                  ].PATIENT_ID.unique()
cohort['PRIOR_RADIATION'] = cohort.PATIENT_ID.isin(prior_radiation_patients)

#flag any patients with prior medication
prior_medication = pd.read_csv('msk_chord_2024/data_timeline_prior_meds.txt', sep='\t', comment='#')
prior_medication_patients = prior_medication[(prior_medication.PATIENT_ID.isin(cohort.PATIENT_ID))&
                  (prior_medication.START_DATE <= 0)&
                  (prior_medication.PRIOR_MED_TO_MSK == 'Prior medications to MSK')
                  ].PATIENT_ID.unique()
cohort['PRIOR_MEDICATION'] = cohort.PATIENT_ID.isin(prior_medication_patients)

#count how many patients have no prior treatment (msk or otherwise), surgery or radiation
cohort['NO_PRIOR_TREATMENT'] = ~(cohort.PRETREATED | cohort.PRIOR_SURGERY | cohort.PRIOR_RADIATION | cohort.PRIOR_MEDICATION)
print(cohort.NO_PRIOR_TREATMENT.sum(), cohort.shape[0])
cohort[cohort['NO_PRIOR_TREATMENT']].to_csv('reports/test/selected-samples-no-previous-treatment-metadata.csv', index=False)
cohort[cohort['NO_PRIOR_TREATMENT']].head(1)

524 3321


Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN,CANCER_TYPE,TREATMENT_NUM,PRETREATED,PRIOR_SURGERY,PRIOR_RADIATION,PRIOR_MEDICATION,NO_PRIOR_TREATMENT
24,P-0000289,4,PEMETREXED,{Chemo},False,False,Non-Small Cell Lung Cancer,1,False,False,False,False,True


In [36]:
pd.crosstab(cohort.CANCER_TYPE, cohort.NO_PRIOR_TREATMENT)

NO_PRIOR_TREATMENT,False,True
CANCER_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
Breast Cancer,1265,78
Colorectal Cancer,294,27
Non-Small Cell Lung Cancer,923,397
Pancreatic Cancer,119,20
Prostate Cancer,196,2


---

In [21]:
#select samples - 10 of each cancer type only, and all 10 should have different treatment labels. 
#select from those who have not seen any treatment before MSK-IMPACT
selected = []
for cancer in cohort.CANCER_TYPE.unique():
    grp = cohort[(cohort.CANCER_TYPE == cancer) & (cohort.TARGETED_THERAPY_GIVEN == True) & (cohort.NO_PRIOR_TREATMENT == True)]
    if grp.empty:
        continue
    # get up to 10 unique treatments for this cancer type
    unique_treatments = grp.FIRST_LINE_TREATMENT.unique()[:10]
    for treatment in unique_treatments:
        sub = grp[grp.FIRST_LINE_TREATMENT == treatment]
        # randomly select one patient for each treatment label
        sampled_id = sub.sample(n=1, random_state=42)['PATIENT_ID'].iloc[0]
        selected.append(sampled_id)

# preserve order and deduplicate patient IDs
_seen = set()
selected = [x for x in selected if not (x in _seen or _seen.add(x))]

# map to one sample per selected patient (choose highest coverage sample)
selected_samples = (
    sample[sample.PATIENT_ID.isin(selected)]
    .sort_values(['PATIENT_ID', 'SAMPLE_COVERAGE'], ascending=[True, False])
    .drop_duplicates('PATIENT_ID')['SAMPLE_ID']
    .tolist()
)

with open('reports/test/selected-samples-no-previous-treatment.json', 'w') as f:
    import json
    json.dump(selected_samples, f, indent=4)

---

In [None]:
#select samples - 10 of each cancer type only, and all 10 should have different treatment labels
selected = []
for cancer in cohort.CANCER_TYPE.unique():
    grp = cohort[(cohort.CANCER_TYPE == cancer) & (cohort.TARGETED_THERAPY_GIVEN == True)]
    if grp.empty:
        continue
    # get up to 10 unique treatments for this cancer type
    unique_treatments = grp.FIRST_LINE_TREATMENT.unique()[:10]
    for treatment in unique_treatments:
        sub = grp[grp.FIRST_LINE_TREATMENT == treatment]
        # randomly select one patient for each treatment label
        sampled_id = sub.sample(n=1, random_state=42)['PATIENT_ID'].iloc[0]
        selected.append(sampled_id)

# preserve order and deduplicate patient IDs
_seen = set()
selected = [x for x in selected if not (x in _seen or _seen.add(x))]

# map to one sample per selected patient (choose highest coverage sample)
selected_samples = (
    sample[sample.PATIENT_ID.isin(selected)]
    .sort_values(['PATIENT_ID', 'SAMPLE_COVERAGE'], ascending=[True, False])
    .drop_duplicates('PATIENT_ID')['SAMPLE_ID']
    .tolist()
)

with open('reports/test/selected-samples.json', 'w') as f:
    import json
    json.dump(selected_samples, f, indent=4)

In [7]:
cohort[cohort.PATIENT_ID.isin(selected)].CANCER_TYPE.value_counts()

CANCER_TYPE
Breast Cancer                 10
Non-Small Cell Lung Cancer    10
Colorectal Cancer             10
Prostate Cancer               10
Pancreatic Cancer              9
Name: count, dtype: int64

----

In [7]:
#first, we stratify by cancer type - treatment combination. we estimate the number of patients to sample from each group.
total_patients = 0
num_of_randomly_sample_required_treatment_cancer_pairs = 0
for cancer_type in cohort.CANCER_TYPE.unique():
    df = cohort[(cohort.CANCER_TYPE == cancer_type)&(cohort.TARGETED_THERAPY_GIVEN == True)].FIRST_LINE_TREATMENT.value_counts().reset_index()
    many_patients = df[df['count'] >= 10].shape[0] * 10 # we randomly sample 10 from the treatment types with many patients
    few_patients = df[df['count'] < 10]['count'].sum() # we take all from the treatment types with few patients
    total_patients += many_patients + few_patients
    num_of_randomly_sample_required_treatment_cancer_pairs += int(many_patients / 10)
print(total_patients)
print(num_of_randomly_sample_required_treatment_cancer_pairs)

483
24


In [26]:
#select samples
selected = []
for cancer in cohort.CANCER_TYPE.unique():
    grp = cohort[(cohort.CANCER_TYPE == cancer) & (cohort.TARGETED_THERAPY_GIVEN == True)]
    if grp.empty:
        continue
    for treatment, sub in grp.groupby('FIRST_LINE_TREATMENT'):
        if len(sub) >= 10:
            sampled_ids = sub.sample(n=10, random_state=42)['PATIENT_ID'].tolist()
        else:
            sampled_ids = sub['PATIENT_ID'].tolist()
        selected.extend(sampled_ids)

# preserve order and deduplicate patient IDs
_seen = set()
selected = [x for x in selected if not (x in _seen or _seen.add(x))]

# map to one sample per selected patient (choose highest coverage sample)
selected_samples = (
    sample[sample.PATIENT_ID.isin(selected)]
    .sort_values(['PATIENT_ID', 'SAMPLE_COVERAGE'], ascending=[True, False])
    .drop_duplicates('PATIENT_ID')['SAMPLE_ID']
    .tolist()
)

with open('reports/filtered-data/selected-samples-representative-filtering.json', 'w') as f:
    import json
    json.dump(selected_samples, f, indent=4)
cohort[cohort.PATIENT_ID.isin(selected)]

Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN,CANCER_TYPE,TREATMENT_NUM
0,P-0000036,22,CRIZOTINIB,{Targeted},True,False,Non-Small Cell Lung Cancer,1
42,P-0000495,5,"EVEROLIMUS,TAMOXIFEN","{Hormone, Targeted}",True,False,Breast Cancer,2
69,P-0000795,7,EVEROLIMUS,{Targeted},True,False,Breast Cancer,1
70,P-0000806,250,"LAPATINIB,CAPECITABINE","{Chemo, Targeted}",True,False,Breast Cancer,2
72,P-0000840,9,CRIZOTINIB,{Targeted},True,False,Non-Small Cell Lung Cancer,1
...,...,...,...,...,...,...,...,...
3664,P-0087682,1,"PEMBROLIZUMAB,PACLITAXEL,ALECTINIB,CARBOPLATIN","{Chemo, Immuno, Targeted}",True,False,Non-Small Cell Lung Cancer,4
3667,P-0087851,238,TUCATINIB,{Targeted},True,False,Breast Cancer,1
3669,P-0087986,65,TEPOTINIB,{Targeted},True,False,Non-Small Cell Lung Cancer,1
3673,P-0088575,35,"RIBOCICLIB,TAMOXIFEN","{Hormone, Targeted}",True,False,Breast Cancer,2


In [8]:
#first, we stratify by cancer type - treatment combination. we estimate the number of patients to sample from each group.
total_patients = 0
num_of_randomly_sample_required_treatment_cancer_pairs = 0
for cancer_type in cohort.CANCER_TYPE.unique():
    df = cohort[(cohort.CANCER_TYPE == cancer_type)&(cohort.TARGETED_THERAPY_GIVEN == False)].FIRST_LINE_TREATMENT.value_counts().reset_index()
    many_patients = df[df['count'] >= 10].shape[0] * 10 # we randomly sample 10 from the treatment types with many patients
    few_patients = df[df['count'] < 10]['count'].sum() # we take all from the treatment types with few patients
    total_patients += many_patients + few_patients
    num_of_randomly_sample_required_treatment_cancer_pairs += int(many_patients / 10)
print(total_patients)
print(num_of_randomly_sample_required_treatment_cancer_pairs)

867
49
