In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import warnings
# Silences every warning category from this point on (no category/module filter is given).
# NOTE(review): this also hides pandas deprecation and dtype warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
from misc.extract_clinical_data import extract_clinical_data_minimum
from misc.extract_molecular_data import extract_dna_variants, extract_cna, extract_structural_variants
# Locations of the msk_chord_2024 input files, given relative to the working directory.
# NOTE(review): presumably these are passed to the extract_* helpers imported above — confirm.
patient_data_path = 'msk_chord_2024/data_clinical_patient.txt'  # patient-level clinical table
sample_data_path = 'msk_chord_2024/data_clinical_sample.txt'  # sample-level clinical table
snv_data_path = 'msk_chord_2024/data_mutations.txt'  # mutation calls
cna_data_path = 'msk_chord_2024/data_cna.txt'  # copy-number alterations
sv_data_path = 'msk_chord_2024/data_sv.txt'  # structural variants

----

In [8]:
# Load the patient, treatment-timeline and sample tables. The files are
# tab-separated; comment='#' makes pandas ignore everything after a '#'.
# The patient and sample paths reuse the constants defined with the other
# data paths instead of repeating the literals.
treatment_data_path = 'msk_chord_2024/data_timeline_treatment.txt'
patient = pd.read_csv(patient_data_path, sep='\t', comment='#')
treatment = pd.read_csv(treatment_data_path, sep='\t', comment='#')
sample = pd.read_csv(sample_data_path, sep='\t', comment='#')

# Patients with exactly one row in the sample table, i.e. only one msk-impact
# test (kept as the "gold standard" for now; multi-test patients may be
# incorporated later). The counts are computed once and reused.
samples_per_patient = sample['PATIENT_ID'].value_counts()
single_test_patients = samples_per_patient[samples_per_patient == 1].index

# Targeted-therapy rows that start after msk-impact (START_DATE > 0),
# restricted to the single-test patients selected above.
targeted_tx_patients = treatment[
    (treatment['SUBTYPE'] == 'Targeted')
    & (treatment['START_DATE'] > 0)
    & (treatment['PATIENT_ID'].isin(single_test_patients))
]
print(f'Number of unique patients receiving targeted therapy post test: {len(targeted_tx_patients.PATIENT_ID.unique())}')

Number of unique patients receiving targeted therapy post test: 4202


In [4]:
# Sanity check: count sample rows per patient in the targeted-therapy cohort.
# Every count being 1 means multiple samples per patient can be ignored for now.
targeted_patient_ids = targeted_tx_patients['PATIENT_ID'].unique()
targeted_samples = sample.loc[sample['PATIENT_ID'].isin(targeted_patient_ids)]
targeted_samples['PATIENT_ID'].value_counts()

PATIENT_ID
P-0000036    1
P-0044467    1
P-0044824    1
P-0044387    1
P-0044848    1
            ..
P-0017422    1
P-0017476    1
P-0017466    1
P-0017438    1
P-0009406    1
Name: count, Length: 4202, dtype: int64

In [5]:
# Keep every treatment row (any subtype) with START_DATE > 0 for patients in
# the targeted-therapy cohort.
in_cohort = treatment['PATIENT_ID'].isin(targeted_tx_patients['PATIENT_ID'].unique())
starts_after_test = treatment['START_DATE'] > 0
treatment_filtered = treatment.loc[in_cohort & starts_after_test]
treatment_filtered.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,AGENT,RX_INVESTIGATIVE,FLAG_OROTOPICAL
30,P-0000036,22,931,Treatment,Targeted,CRIZOTINIB,N,1
31,P-0000036,931,3512,Treatment,Targeted,CRIZOTINIB,N,1
48,P-0000058,288,1255,Treatment,Biologic,ADO-TRASTUZUMAB EMTANSINE,N,0
49,P-0000058,624,708,Treatment,Bone Treatment,ZOLEDRONIC ACID,N,0
50,P-0000058,799,800,Treatment,Bone Treatment,ZOLEDRONIC ACID,N,0


In [6]:
# Build one row per patient describing the "first line treatment(s)" received
# after msk-impact was administered:
# 1. the first line is every treatment row sharing the patient's earliest
#    START_DATE in treatment_filtered;
# 2. the agents and subtypes given at that time are stored as sets;
# 3. patients are NOT dropped here — the boolean columns TARGETED_THERAPY_GIVEN
#    and INVESTIGATIONAL_THERAPY_GIVEN flag them so later cells can filter.
first_line_rows = []
# groupby(sort=False) yields patients in order of first appearance, matching
# the row order of iterating over PATIENT_ID.unique(), while scanning the
# frame once instead of once per patient. The loop variable is `patient_id`
# so that the `patient` DataFrame loaded earlier is not overwritten.
for patient_id, patient_treatments in tqdm(treatment_filtered.groupby('PATIENT_ID', sort=False)):
    first_line_timing = patient_treatments['START_DATE'].min()
    first_line_treatment = patient_treatments[patient_treatments['START_DATE'] == first_line_timing]
    first_line_rows.append([
        patient_id,
        first_line_timing,
        set(first_line_treatment['AGENT']),
        set(first_line_treatment['SUBTYPE']),
    ])

first_line_table = pd.DataFrame(
    first_line_rows,
    columns=['PATIENT_ID', 'FIRST_LINE_TIMING', 'FIRST_LINE_TREATMENT', 'FIRST_LINE_TREATMENT_TYPE'],
)
# Flag whether any of the first-line subtypes is targeted / investigational.
first_line_table['TARGETED_THERAPY_GIVEN'] = first_line_table['FIRST_LINE_TREATMENT_TYPE'].apply(lambda x: 'Targeted' in x)
first_line_table['INVESTIGATIONAL_THERAPY_GIVEN'] = first_line_table['FIRST_LINE_TREATMENT_TYPE'].apply(lambda x: 'Investigational' in x)

# Persist the table; the output directory is created if it does not exist.
os.makedirs('reports/filtered-data', exist_ok=True)
first_line_table.to_csv('reports/filtered-data/first_line_treatments_post_msk.csv', index=False)
print(first_line_table.shape)
first_line_table.head()

100%|██████████| 4202/4202 [00:03<00:00, 1066.28it/s]


(4202, 6)


Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN
0,P-0000036,22,{CRIZOTINIB},{Targeted},True,False
1,P-0000058,288,{ADO-TRASTUZUMAB EMTANSINE},{Biologic},False,False
2,P-0000066,47,{INVESTIGATIONAL},{Investigational},False,True
3,P-0000093,147,{CAPECITABINE},{Chemo},False,False
4,P-0000098,81,{EXEMESTANE},{Hormone},False,False


In [7]:
# Summary statistics of the time to first treatment; realistically, cutting
# off after 1 year is probably reasonable.
first_line_table['FIRST_LINE_TIMING'].describe()

count    4202.000000
mean      159.544741
std       285.156167
min         1.000000
25%        14.000000
50%        46.000000
75%       172.000000
max      2402.000000
Name: FIRST_LINE_TIMING, dtype: float64

In [20]:
# Apply the one-year cutoff: keep patients whose first post-test treatment has
# FIRST_LINE_TIMING <= 365 (presumably days), then overwrite the saved CSV
# with the filtered table.
within_one_year = first_line_table['FIRST_LINE_TIMING'].le(365)
first_line_table = first_line_table.loc[within_one_year]
first_line_table.to_csv('reports/filtered-data/first_line_treatments_post_msk.csv', index=False)
print(first_line_table.shape)
first_line_table.head(1)

(3681, 6)


Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT,FIRST_LINE_TREATMENT_TYPE,TARGETED_THERAPY_GIVEN,INVESTIGATIONAL_THERAPY_GIVEN
0,P-0000036,22,{CRIZOTINIB},{Targeted},True,False


In [21]:
# Cancer-type breakdown of the samples belonging to the retained patients.
retained_patient_ids = first_line_table['PATIENT_ID'].unique()
sample.loc[sample['PATIENT_ID'].isin(retained_patient_ids), 'CANCER_TYPE'].value_counts()

CANCER_TYPE
Non-Small Cell Lung Cancer    1493
Breast Cancer                 1470
Colorectal Cancer              332
Prostate Cancer                224
Pancreatic Cancer              162
Name: count, dtype: int64

In [23]:
# Cross-tabulate the targeted flag (rows) against the investigational flag (columns).
pd.crosstab(
    index=first_line_table['TARGETED_THERAPY_GIVEN'],
    columns=first_line_table['INVESTIGATIONAL_THERAPY_GIVEN'],
)

INVESTIGATIONAL_THERAPY_GIVEN,False,True
TARGETED_THERAPY_GIVEN,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1803,316
True,1518,44


We might want to remove the patients who were given investigational therapies, because of a lack of information: their agent is recorded only as `INVESTIGATIONAL`, so the actual drug is unknown.

In [25]:
# First-line subtype combinations among patients with no investigational therapy.
# The flag column is boolean, so `~` selects the rows where it is False.
non_investigational = first_line_table.loc[~first_line_table['INVESTIGATIONAL_THERAPY_GIVEN']]
non_investigational['FIRST_LINE_TREATMENT_TYPE'].value_counts()

FIRST_LINE_TREATMENT_TYPE
{Targeted}                             1249
{Chemo}                                 840
{Hormone}                               484
{Hormone, Targeted}                     233
{Biologic, Chemo}                       119
{Biologic}                              104
{Immuno}                                 84
{Bone Treatment}                         84
{Immuno, Chemo}                          63
{Chemo, Targeted}                        23
{Hormone, Chemo}                          6
{Biologic, Targeted}                      5
{Chemo, Bone Treatment}                   4
{Hormone, Bone Treatment}                 3
{Hormone, Biologic}                       3
{Chemo, Biologic, Immuno}                 2
{Biologic, Chemo, Targeted}               2
{Hormone, Bone Treatment, Targeted}       2
{Bone Treatment, Targeted}                2
{Biologic, Chemo, Bone Treatment}         2
{Immuno, Bone Treatment}                  1
{Hormone, Biologic, Chemo}                1
{Other

In [26]:
# Cancer-type breakdown after excluding patients whose first line included an
# investigational therapy (the flag column is boolean, so `~` keeps the False rows).
non_investigational_ids = first_line_table.loc[
    ~first_line_table['INVESTIGATIONAL_THERAPY_GIVEN'], 'PATIENT_ID'
].unique()
sample.loc[sample['PATIENT_ID'].isin(non_investigational_ids), 'CANCER_TYPE'].value_counts()

CANCER_TYPE
Breast Cancer                 1343
Non-Small Cell Lung Cancer    1320
Colorectal Cancer              321
Prostate Cancer                198
Pancreatic Cancer              139
Name: count, dtype: int64