In [9]:
import pandas as pd
import numpy as np
import os
import ast
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Folder that holds every study input file referenced below; change it here only.
DATA_DIR = 'msk_chord_2024'

# Clinical tables.
patient_data_path = f'{DATA_DIR}/data_clinical_patient.txt'
sample_data_path = f'{DATA_DIR}/data_clinical_sample.txt'
# Genomic tables: mutations, copy-number alterations, structural variants.
snv_data_path = f'{DATA_DIR}/data_mutations.txt'
cna_data_path = f'{DATA_DIR}/data_cna.txt'
sv_data_path = f'{DATA_DIR}/data_sv.txt'

In [14]:
# Load the patient, treatment-timeline and sample tables (tab-separated text).
# The patient and sample paths come from the path-definition cell so that each
# location is written down only once.
# NOTE(review): comment='#' makes pandas discard the remainder of ANY line from the
# first '#', not just whole header lines -- confirm no data field contains '#'.
treatment_data_path = 'msk_chord_2024/data_timeline_treatment.txt'

patient = pd.read_csv(patient_data_path, sep='\t', comment='#')
treatment = pd.read_csv(treatment_data_path, sep='\t', comment='#')
sample = pd.read_csv(sample_data_path, sep='\t', comment='#')

---
Aim: select X of the most clinically / genomically "diverse" samples from the patients selected in 1-1. \
Reason: to reduce the cost of this validation. We assume that a substantial portion of these patients present with relatively "similar" clinical / genomic phenotypes in terms of the treatments given, the genomic variants detected, and their clinical history.

In [39]:
# Load the first-line-treatment cohort. The two treatment columns are stored in the
# CSV as Python-literal strings, so parse them back into Python objects.
cohort = pd.read_csv('reports/filtered-data/first_line_treatments_post_msk.csv')
for literal_col in ('FIRST_LINE_TREATMENT', 'FIRST_LINE_TREATMENT_TYPE'):
    cohort[literal_col] = cohort[literal_col].apply(ast.literal_eval)

# Drop patients whose first line included an investigational therapy.
# .copy() yields an independent frame, so the column assignments below do not write
# into a slice of the unfiltered table (the SettingWithCopyWarning case).
cohort = cohort[cohort.INVESTIGATIONAL_THERAPY_GIVEN == False].copy()

# Annotate every patient with a cancer type taken from the sample table.
# dict(zip(...)) keeps the LAST sample row per PATIENT_ID when an ID occurs more
# than once in `sample`.
cancer_type = dict(zip(sample.PATIENT_ID, sample.CANCER_TYPE))
cohort['CANCER_TYPE'] = cohort.PATIENT_ID.map(cancer_type)

# Number of entries in the first-line treatment (1 = single, >1 = multi therapy).
cohort['TREATMENT_NUM'] = cohort.FIRST_LINE_TREATMENT.apply(len)
cohort.head(1)

Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN,CANCER_TYPE,TREATMENT_NUM
0,P-0000036,22,{CRIZOTINIB},{Targeted},True,False,Non-Small Cell Lung Cancer,1


In [31]:
# Cohort row counts per cancer type, split by the TARGETED_THERAPY_GIVEN flag.
pd.crosstab(
    index=cohort['CANCER_TYPE'],
    columns=cohort['TARGETED_THERAPY_GIVEN'],
)

TARGETED_THERAPY_GIVEN,False,True
CANCER_TYPE,Unnamed: 1_level_1,Unnamed: 2_level_1
Breast Cancer,750,593
Colorectal Cancer,259,62
Non-Small Cell Lung Cancer,534,786
Pancreatic Cancer,93,46
Prostate Cancer,167,31


In [54]:
# Estimate the sample budget for the targeted-therapy arm, stratified by
# (cancer type, first-line treatment): a treatment with at least PER_REGIMEN_CAP
# patients contributes PER_REGIMEN_CAP randomly drawn patients, a rarer treatment
# contributes all of its patients.
PER_REGIMEN_CAP = 10

total_patients = 0
num_of_randomly_sample_required_treatment_cancer_pairs = 0
targeted_arm = cohort[cohort.TARGETED_THERAPY_GIVEN == True]
# A dedicated loop name avoids overwriting the `cancer_type` lookup dict.
for tumour_type in targeted_arm.CANCER_TYPE.dropna().unique():
    regimens = targeted_arm.loc[targeted_arm.CANCER_TYPE == tumour_type, 'FIRST_LINE_TREATMENT']
    # Sets are unhashable, so value_counts() cannot tally them directly; frozenset
    # is hashable and compares equal regardless of element order.
    # Working on the resulting Series (not reset_index()['count']) also keeps this
    # independent of the pandas version's column naming.
    regimen_sizes = regimens.map(frozenset).value_counts()
    capped_regimens = int((regimen_sizes >= PER_REGIMEN_CAP).sum())
    rare_regimen_patients = int(regimen_sizes[regimen_sizes < PER_REGIMEN_CAP].sum())
    total_patients += capped_regimens * PER_REGIMEN_CAP + rare_regimen_patients
    num_of_randomly_sample_required_treatment_cancer_pairs += capped_regimens
print(total_patients)
print(num_of_randomly_sample_required_treatment_cancer_pairs)

483
24


In [56]:
# Same estimate for the NON-targeted arm (TARGETED_THERAPY_GIVEN == False),
# stratified by (cancer type, first-line treatment): a treatment with at least
# PER_REGIMEN_CAP patients contributes PER_REGIMEN_CAP randomly drawn patients,
# a rarer treatment contributes all of its patients.
PER_REGIMEN_CAP = 10

total_patients = 0
num_of_randomly_sample_required_treatment_cancer_pairs = 0
non_targeted_arm = cohort[cohort.TARGETED_THERAPY_GIVEN == False]
# A dedicated loop name avoids overwriting the `cancer_type` lookup dict.
for tumour_type in non_targeted_arm.CANCER_TYPE.dropna().unique():
    regimens = non_targeted_arm.loc[non_targeted_arm.CANCER_TYPE == tumour_type, 'FIRST_LINE_TREATMENT']
    # Sets are unhashable, so value_counts() cannot tally them directly; frozenset
    # is hashable and compares equal regardless of element order.
    # Working on the resulting Series (not reset_index()['count']) also keeps this
    # independent of the pandas version's column naming.
    regimen_sizes = regimens.map(frozenset).value_counts()
    capped_regimens = int((regimen_sizes >= PER_REGIMEN_CAP).sum())
    rare_regimen_patients = int(regimen_sizes[regimen_sizes < PER_REGIMEN_CAP].sum())
    total_patients += capped_regimens * PER_REGIMEN_CAP + rare_regimen_patients
    num_of_randomly_sample_required_treatment_cancer_pairs += capped_regimens
print(total_patients)
print(num_of_randomly_sample_required_treatment_cancer_pairs)

867
49
