In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

In [25]:
from misc.extract_clinical_data import extract_clinical_data_minimum
from misc.extract_molecular_data import extract_dna_variants, extract_cna, extract_structural_variants
# Directory that holds the MSK-CHORD 2024 export. Every input path below is built
# from it, so pointing the notebook at another copy of the data is a one-line change.
DATA_DIR = 'msk_chord_2024'

# Input file locations (contents presumed from the file names -- confirm against the export).
patient_data_path = f'{DATA_DIR}/data_clinical_patient.txt'  # patient-level clinical table
sample_data_path = f'{DATA_DIR}/data_clinical_sample.txt'    # sample-level clinical table
snv_data_path = f'{DATA_DIR}/data_mutations.txt'             # mutation calls
cna_data_path = f'{DATA_DIR}/data_cna.txt'                   # copy-number alterations
sv_data_path = f'{DATA_DIR}/data_sv.txt'                     # structural variants

----
#### rough overview of report content
- basic patient and sample info
    - patient id, age, gender, birth date
    - test performed, test description, accession number, sample collection date, path diagnosis, % of tumor cells in sample
- sequencing quality
    - number of total reads, mean reads per targeted exon, X% of exons with >30 reads
- dna variants
    - tiered list (tier 1-4) of 
        - `gene cHGVS (pHGVS) exon % of reads, total reads`
        - e.g. TP53 c.613T>C (p.Y205H), exon 2 - in 50% of 73 reads
    - negative for mutations in genes with clinical relevance for tumor type
- copy number variation
    - list of `location gene copy number type`
    - eg. 1p12 NOTCH2   Low copy number gain
- chromosomal rearrangement

----

In [26]:
# Load the clinical and treatment-timeline tables. All three files are tab-separated
# and lines starting with '#' are skipped.
# The patient and sample paths reuse the constants defined next to the imports
# rather than repeating the literals, so the two cannot drift apart.
treatment_data_path = 'msk_chord_2024/data_timeline_treatment.txt'
patient = pd.read_csv(patient_data_path, sep='\t', comment='#')
treatment = pd.read_csv(treatment_data_path, sep='\t', comment='#')
sample = pd.read_csv(sample_data_path, sep='\t', comment='#')

# Patients that appear exactly once in the sample table, i.e. that have had only one
# MSK-IMPACT test. Restricting to them keeps a clean "gold standard" for now;
# ways to incorporate multi-test patients can be considered later.
samples_per_patient = sample['PATIENT_ID'].value_counts()
single_test_patient_ids = samples_per_patient[samples_per_patient == 1].index

# Targeted-therapy rows whose START_DATE is positive, i.e. given after MSK-IMPACT.
is_targeted_post_test = (treatment['SUBTYPE'] == 'Targeted') & (treatment['START_DATE'] > 0)

targeted_tx_patients = treatment[
    is_targeted_post_test & treatment['PATIENT_ID'].isin(single_test_patient_ids)
]
print(f'Number of unique patients receiving targeted therapy post test: {targeted_tx_patients.PATIENT_ID.nunique()}')

Number of unique patients receiving targeted therapy post test: 4202


In [27]:
# Sanity check: count sample rows per patient in the targeted-therapy cohort.
# Every count being 1 means multiple samples per patient can be ignored for now.
sample.loc[
    sample['PATIENT_ID'].isin(targeted_tx_patients['PATIENT_ID'].unique()),
    'PATIENT_ID',
].value_counts()

PATIENT_ID
P-0000036    1
P-0044467    1
P-0044824    1
P-0044387    1
P-0044848    1
            ..
P-0017422    1
P-0017476    1
P-0017466    1
P-0017438    1
P-0009406    1
Name: count, Length: 4202, dtype: int64

In [28]:
# Keep every treatment row (any SUBTYPE) with a positive START_DATE for the patients
# selected in `targeted_tx_patients`, so the earliest post-test treatment can be
# identified per patient.
treatment_filtered = treatment.loc[
    (treatment['START_DATE'] > 0)
    & treatment['PATIENT_ID'].isin(targeted_tx_patients['PATIENT_ID'].unique())
]
treatment_filtered.head()

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,AGENT,RX_INVESTIGATIVE,FLAG_OROTOPICAL
30,P-0000036,22,931,Treatment,Targeted,CRIZOTINIB,N,1
31,P-0000036,931,3512,Treatment,Targeted,CRIZOTINIB,N,1
48,P-0000058,288,1255,Treatment,Biologic,ADO-TRASTUZUMAB EMTANSINE,N,0
49,P-0000058,624,708,Treatment,Bone Treatment,ZOLEDRONIC ACID,N,0
50,P-0000058,799,800,Treatment,Bone Treatment,ZOLEDRONIC ACID,N,0


In [None]:
# We want to:
# 1. identify the "first line treatment(s)" received after msk-impact was administered.
# 2. if the "first line treatment(s)" weren't targeted, filter out for now.
# 3. if the "first line treatment(s)" were targeted, keep and save the agent(s) administered first.
#
# Rows are [PATIENT_ID, FIRST_LINE_TIMING, FIRST_LINE_TREATMENT].
first_line_records = []

# Group once instead of re-filtering the whole frame for every patient.
# sort=False keeps patients in order of first appearance, the same order that
# iterating over PATIENT_ID.unique() would give.
# The loop variable is `patient_id`, not `patient`, so it does not overwrite the
# `patient` DataFrame loaded earlier in the notebook.
for patient_id, patient_treatments in tqdm(treatment_filtered.groupby('PATIENT_ID', sort=False)):
    earliest_start = patient_treatments['START_DATE'].min()
    # All treatments that share the earliest start date form the "first line".
    first_line_rows = patient_treatments[patient_treatments['START_DATE'] == earliest_start]
    # Keep the patient only if at least one of those first-line rows is a targeted therapy;
    # every agent started on that date is recorded, whatever its subtype.
    if (first_line_rows['SUBTYPE'] == 'Targeted').any():
        first_line_records.append([patient_id, earliest_start, first_line_rows['AGENT'].tolist()])

first_line_table = pd.DataFrame(
    first_line_records,
    columns=['PATIENT_ID', 'FIRST_LINE_TIMING', 'FIRST_LINE_TREATMENT'],
)
os.makedirs('reports/filtered-data', exist_ok=True)
# NOTE: FIRST_LINE_TREATMENT holds Python lists; in the CSV they are written as their string repr.
first_line_table.to_csv('reports/filtered-data/first_line_treatments_post_msk.csv', index=False)
print(first_line_table.shape)
first_line_table.head()

100%|██████████| 4202/4202 [00:03<00:00, 1082.13it/s]

(1853, 3)





Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT
0,P-0000036,22,[CRIZOTINIB]
1,P-0000242,98,"[EVEROLIMUS, EXEMESTANE]"
2,P-0000280,125,[ERLOTINIB]
3,P-0000288,6,"[EVEROLIMUS, EXEMESTANE]"
4,P-0000301,73,[ERLOTINIB]


In [None]:
# Distribution of FIRST_LINE_TIMING (the START_DATE of the first-line treatment).
# Realistically, it's probably reasonable to cut off after 1 year.
first_line_table['FIRST_LINE_TIMING'].describe()

count    1853.000000
mean      193.860766
std       336.460754
min         1.000000
25%        15.000000
50%        55.000000
75%       206.000000
max      2402.000000
Name: FIRST_LINE_TIMING, dtype: float64

In [41]:
# Largest FIRST_LINE_TIMING value still kept as a first-line treatment.
# 365 corresponds to the "cut off after 1 year" decision (assumes START_DATE is in
# days -- TODO confirm).
MAX_DAYS_TO_FIRST_LINE = 365

# Re-running this cell is safe: applying the same cut-off twice gives the same table.
first_line_table = first_line_table[first_line_table['FIRST_LINE_TIMING'] <= MAX_DAYS_TO_FIRST_LINE]
# This deliberately overwrites the unfiltered CSV written when the table was first
# built, so the file on disk always reflects the cut-off.
first_line_table.to_csv('reports/filtered-data/first_line_treatments_post_msk.csv', index=False)
print(first_line_table.shape)
first_line_table.head(1)

(1562, 3)


Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT
0,P-0000036,22,[CRIZOTINIB]


In [42]:
# Cancer-type breakdown of the samples belonging to patients kept in first_line_table.
sample.loc[
    sample['PATIENT_ID'].isin(first_line_table['PATIENT_ID'].unique()),
    'CANCER_TYPE',
].value_counts()

CANCER_TYPE
Non-Small Cell Lung Cancer    811
Breast Cancer                 607
Colorectal Cancer              63
Pancreatic Cancer              48
Prostate Cancer                33
Name: count, dtype: int64

In [50]:
# Display the filtered first-line table. The original row labels are kept
# because the index was not reset after the 365 cut-off.
first_line_table

Unnamed: 0,PATIENT_ID,FIRST_LINE_TIMING,FIRST_LINE_TREATMENT
0,P-0000036,22,[CRIZOTINIB]
1,P-0000242,98,"[EVEROLIMUS, EXEMESTANE]"
2,P-0000280,125,[ERLOTINIB]
3,P-0000288,6,"[EVEROLIMUS, EXEMESTANE]"
4,P-0000301,73,[ERLOTINIB]
...,...,...,...
1848,P-0088691,244,"[ABEMACICLIB, ANASTROZOLE]"
1849,P-0088753,5,[OSIMERTINIB]
1850,P-0088892,29,[ABEMACICLIB]
1851,P-0089490,22,[OSIMERTINIB]
