In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
from misc.extract_clinical_data import extract_clinical_data_minimum
from misc.extract_molecular_data import extract_dna_variants, extract_cna, extract_structural_variants
# Root directory of the dataset. Every input file below lives under it, so
# pointing the notebook at another copy of the data means editing one line.
DATA_DIR = 'msk_chord_2024'

# Input files handed to the extract_* helpers when building a report.
patient_data_path = f'{DATA_DIR}/data_clinical_patient.txt'
sample_data_path = f'{DATA_DIR}/data_clinical_sample.txt'
snv_data_path = f'{DATA_DIR}/data_mutations.txt'
cna_data_path = f'{DATA_DIR}/data_cna.txt'
sv_data_path = f'{DATA_DIR}/data_sv.txt'

----
#### rough overview of report content
- basic patient and sample info
    - patient id, age, gender, birth date
    - test performed, test description, accession number, sample collection date, path diagnosis, % of tumor cells in sample
- sequencing quality
    - number of total reads, mean reads per targeted exon, X% of exons with >30 reads
- dna variants
    - tiered list (tier 1-4) of 
        - `gene cHGVS (pHGVS) exon % of reads, total reads`
        - eg. TP53 c.613T>C (p.Y205H), exon 2 - in 50% of 73 reads
    - negative for mutations in genes with clinical relevance for tumor type
- copy number variation
    - list of `location gene copy number type`
    - eg. 1p12 NOTCH2   Low copy number gain
- chromosomal rearrangement

----

In [3]:
# Load the clinical tables (tab-separated; lines starting with '#' are skipped).
# The patient and sample paths reuse the constants from the configuration cell
# instead of repeating the literals, so the two cannot drift apart.
treatment_data_path = 'msk_chord_2024/data_timeline_treatment.txt'
patient = pd.read_csv(patient_data_path, sep='\t', comment='#')
treatment = pd.read_csv(treatment_data_path, sep='\t', comment='#')
sample = pd.read_csv(sample_data_path, sep='\t', comment='#')

# Patients that appear exactly once in the sample table (counted once, reused below).
samples_per_patient = sample['PATIENT_ID'].value_counts()
single_sample_patients = samples_per_patient[samples_per_patient == 1].index

# Treatment rows of subtype 'Targeted' with a positive START_DATE, restricted to
# single-sample patients.
# NOTE(review): START_DATE > 0 presumably keeps treatments that began after the
# dataset's reference date — confirm against the data dictionary.
targeted_tx_patients = treatment[
    (treatment.SUBTYPE == 'Targeted')
    & (treatment.START_DATE > 0)
    & treatment['PATIENT_ID'].isin(single_sample_patients)
]
print(f'Number of patients with targeted therapy: {targeted_tx_patients.PATIENT_ID.nunique()}')

Number of patients with targeted therapy: 4202


In [4]:
# Sanity check: number of sample rows per patient in the targeted-therapy cohort.
# For now every count is 1, so multiple samples per patient need no special handling.
cohort_ids = targeted_tx_patients.PATIENT_ID.unique()
sample.loc[sample.PATIENT_ID.isin(cohort_ids), 'PATIENT_ID'].value_counts()

PATIENT_ID
P-0000036    1
P-0044467    1
P-0044824    1
P-0044387    1
P-0044848    1
            ..
P-0017422    1
P-0017476    1
P-0017466    1
P-0017438    1
P-0009406    1
Name: count, Length: 4202, dtype: int64

In [5]:
def generate_report(patient_id):
    '''
    Build the plain-text report for a single patient.

    The report consists of the clinical summary, a separator line, and three
    titled molecular sections (DNA variants, CNA data, structural variants).
    Each piece comes from the matching extract_* helper, which is given the
    module-level data paths together with the patient ID.

    Parameters
    ----------
    patient_id
        Patient identifier, forwarded unchanged to every extract_* helper.

    Returns
    -------
    str
        The concatenated report text.
    '''
    # Keyword arguments shared by every extractor call.
    common = {'sample_data_path': sample_data_path, 'patient_id': patient_id}

    # Pieces are listed in output order; the helpers run top to bottom.
    pieces = [
        extract_clinical_data_minimum(patient_data_path=patient_data_path, **common),
        '\n=============',
        '\nDNA Variants:',
        '\n' + extract_dna_variants(snv_data_path=snv_data_path, **common),
        '\nCNA Data:',
        '\n' + extract_cna(cna_data_path=cna_data_path, **common),
        '\nStructural Variants:',
        '\n' + extract_structural_variants(sv_data_path=sv_data_path, **common),
    ]
    return ''.join(pieces)


In [6]:
# Smoke test: generate and save reports for the first few cohort patients.
N_TEST_PATIENTS = 10          # how many patients to process in this trial run
REPORT_DIR = 'reports/test'   # output directory, created if missing

patients = sample[sample.PATIENT_ID.isin(targeted_tx_patients.PATIENT_ID.unique())].PATIENT_ID.tolist()[:N_TEST_PATIENTS]
os.makedirs(REPORT_DIR, exist_ok=True)
for patient_id in tqdm(patients):
    report = generate_report(patient_id)
    # Explicit UTF-8 so the written file does not depend on the platform's
    # default encoding.
    with open(f'{REPORT_DIR}/{patient_id}.txt', 'w', encoding='utf-8') as f:
        f.write(report)

100%|██████████| 10/10 [00:37<00:00,  3.76s/it]
