In [1]:
import pandas as pd
import numpy as np

---
### sample info

In [25]:
patient = pd.read_csv('msk_chord_2024/data_clinical_patient.txt', sep='\t', comment='#')
patient.head(1)

Unnamed: 0,PATIENT_ID,GENDER,RACE,ETHNICITY,CURRENT_AGE_DEID,STAGE_HIGHEST_RECORDED,NUM_ICDO_DX,ADRENAL_GLANDS,BONE,CNS_BRAIN,...,REPRODUCTIVE_ORGANS,SMOKING_PREDICTIONS_3_CLASSES,GLEASON_FIRST_REPORTED,GLEASON_HIGHEST_REPORTED,HISTORY_OF_PDL1,PRIOR_MED_TO_MSK,OS_MONTHS,OS_STATUS,HR,HER2
0,P-0000012,Female,White,Non-Spanish; Non-Hispanic,68.0,Stage 1-3,2,No,No,No,...,No,Former/Current Smoker,,,No,Unknown,118.454665,0:LIVING,No,No


In [26]:
sample = pd.read_csv('msk_chord_2024/data_clinical_sample.txt', sep='\t', comment='#')
sample.head(1)

Unnamed: 0,SAMPLE_ID,PATIENT_ID,GLEASON_SAMPLE_LEVEL,PDL1_POSITIVE,CANCER_TYPE,SAMPLE_TYPE,SAMPLE_CLASS,METASTATIC_SITE,PRIMARY_SITE,CANCER_TYPE_DETAILED,...,MSI_COMMENT,MSI_SCORE,MSI_TYPE,SOMATIC_STATUS,CLINICAL_GROUP,PATHOLOGICAL_GROUP,CLINICAL_SUMMARY,ICD_O_HISTOLOGY_DESCRIPTION,DIAGNOSIS_DESCRIPTION,TMB_NONSYNONYMOUS
0,P-0000012-T03-IM3,P-0000012,,,Non-Small Cell Lung Cancer,Metastasis,Tumor,Neck,Lung,Lung Adenocarcinoma,...,MICROSATELLITE STABLE (MSS). See MSI note below.,0.47,Stable,Matched,3B,,Distant,"Adenocarcinoma, Nos",Lung and Bronchus,32.165504


In [27]:
treatment = pd.read_csv('msk_chord_2024/data_timeline_treatment.txt', sep='\t', comment='#')
treatment.head(1)

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,AGENT,RX_INVESTIGATIVE,FLAG_OROTOPICAL
0,P-0000012,-5437,-5369,Treatment,Chemo,CYCLOPHOSPHAMIDE,N,0


In [28]:
sample['PATIENT_ID'].value_counts()

PATIENT_ID
P-0000012    2
P-0021156    2
P-0070191    2
P-0044645    2
P-0008662    2
            ..
P-0023104    1
P-0023131    1
P-0023173    1
P-0023109    1
P-0029444    1
Name: count, Length: 24950, dtype: int64

In [29]:
treatment[(treatment.SUBTYPE == 'Targeted')&(treatment['PATIENT_ID'].isin(sample['PATIENT_ID'].value_counts()[sample['PATIENT_ID'].value_counts() == 1].index))]

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,AGENT,RX_INVESTIGATIVE,FLAG_OROTOPICAL
30,P-0000036,22,931,Treatment,Targeted,CRIZOTINIB,N,1
31,P-0000036,931,3512,Treatment,Targeted,CRIZOTINIB,N,1
55,P-0000058,1289,1629,Treatment,Targeted,LAPATINIB,N,1
61,P-0000058,1540,1733,Treatment,Targeted,ABEMACICLIB,N,1
70,P-0000066,629,1589,Treatment,Targeted,ABEMACICLIB,N,1
...,...,...,...,...,...,...,...,...
134876,P-0089403,147,177,Treatment,Targeted,OLAPARIB,N,1
134882,P-0089413,-400,225,Treatment,Targeted,ABEMACICLIB,N,1
134888,P-0089445,-212,80,Treatment,Targeted,PALBOCICLIB,N,1
134906,P-0089490,22,181,Treatment,Targeted,OSIMERTINIB,N,1


In [30]:
#choose a random sample with a single sample + has been treated with targeted therapy to test
p = 'P-0000036'

In [31]:
treatment[treatment['PATIENT_ID'] == p]

Unnamed: 0,PATIENT_ID,START_DATE,STOP_DATE,EVENT_TYPE,SUBTYPE,AGENT,RX_INVESTIGATIVE,FLAG_OROTOPICAL
27,P-0000036,-199,-31,Treatment,Biologic,BEVACIZUMAB,N,0
28,P-0000036,-199,-178,Treatment,Chemo,CARBOPLATIN,N,0
29,P-0000036,-199,-31,Treatment,Chemo,PEMETREXED,N,0
30,P-0000036,22,931,Treatment,Targeted,CRIZOTINIB,N,1
31,P-0000036,931,3512,Treatment,Targeted,CRIZOTINIB,N,1


---

In [33]:
patient[patient['PATIENT_ID'] == p].iloc[0]

PATIENT_ID                                       P-0000036
GENDER                                              Female
RACE                                                 Other
ETHNICITY                        Non-Spanish; Non-Hispanic
CURRENT_AGE_DEID                                      68.0
STAGE_HIGHEST_RECORDED                             Stage 4
NUM_ICDO_DX                                              1
ADRENAL_GLANDS                                          No
BONE                                                   Yes
CNS_BRAIN                                               No
INTRA_ABDOMINAL                                         No
LIVER                                                  Yes
LUNG                                                   Yes
LYMPH_NODES                                            Yes
OTHER                                                   No
PLEURA                                                 Yes
REPRODUCTIVE_ORGANS                                     

In [34]:
sample[sample['PATIENT_ID'] == p].iloc[0]

SAMPLE_ID                               P-0000036-T01-IM3
PATIENT_ID                                      P-0000036
GLEASON_SAMPLE_LEVEL                                  NaN
PDL1_POSITIVE                                         NaN
CANCER_TYPE                    Non-Small Cell Lung Cancer
SAMPLE_TYPE                                       Primary
SAMPLE_CLASS                                        Tumor
METASTATIC_SITE                            Not Applicable
PRIMARY_SITE                                         Lung
CANCER_TYPE_DETAILED                  Lung Adenocarcinoma
GENE_PANEL                                      IMPACT341
SAMPLE_COVERAGE                                       380
TUMOR_PURITY                                         30.0
ONCOTREE_CODE                                        LUAD
MSI_COMMENT                                           NaN
MSI_SCORE                                            -1.0
MSI_TYPE                                    Do not report
SOMATIC_STATUS

---
### sample format 

In [62]:
def format_oncopanel_report(patient_df, sample_df):
    # Convert single-row dataframes to dictionaries for easy access
    patient_info = patient_df.to_dict()
    sample_info = sample_df.to_dict()

    # Build report text
    report = []
    report.append(f"Patient ID: {patient_info.get('PATIENT_ID', 'N/A')}")
    report.append(f"Gender: {patient_info.get('GENDER', 'N/A')}")
    report.append(f"Age: {patient_info.get('CURRENT_AGE_DEID', 'N/A')}")
    report.append(f"Race: {patient_info.get('RACE', 'N/A')}")
    report.append(f"Ethnicity: {patient_info.get('ETHNICITY', 'N/A')}")
    report.append(f"Stage (highest recorded): {patient_info.get('STAGE_HIGHEST_RECORDED', 'N/A')}")
    report.append(f"Overall Survival: {patient_info.get('OS_MONTHS', 'N/A'):.1f} months ({patient_info.get('OS_STATUS', 'N/A')})")
    report.append(f"Prior Medication: {patient_info.get('PRIOR_MED_TO_MSK', 'N/A')}")

    # Sample-specific details
    report.append(f"\nSample ID: {sample_info.get('SAMPLE_ID', 'N/A')}")
    report.append(f"Cancer Type: {sample_info.get('CANCER_TYPE_DETAILED', sample_info.get('CANCER_TYPE', 'N/A'))}")
    report.append(f"Primary Site: {sample_info.get('PRIMARY_SITE', 'N/A')}")
    report.append(f"Sample Type: {sample_info.get('SAMPLE_TYPE', 'N/A')}")
    report.append(f"Tumor Purity: {sample_info.get('TUMOR_PURITY', 'N/A')}%")
    report.append(f"Gene Panel: {sample_info.get('GENE_PANEL', 'N/A')}")
    report.append(f"Coverage: {sample_info.get('SAMPLE_COVERAGE', 'N/A')}")
    report.append(f"TMB (nonsynonymous): {sample_info.get('TMB_NONSYNONYMOUS', 'N/A')}")
    report.append(f"MSI Status: {sample_info.get('MSI_TYPE', 'N/A')} (Score: {sample_info.get('MSI_SCORE', 'N/A')})")
    report.append(f"Somatic Status: {sample_info.get('SOMATIC_STATUS', 'N/A')}")

    return "\n".join(report)


# Example usage:
patient_df = patient[patient['PATIENT_ID'] == p].iloc[0]
sample_df = sample[sample['PATIENT_ID'] == p].iloc[0]
report_text = format_oncopanel_report(patient_df, sample_df)
print(report_text)

Patient ID: P-0000036
Gender: Female
Age: 68.0
Race: Other
Ethnicity: Non-Spanish; Non-Hispanic
Stage (highest recorded): Stage 4
Overall Survival: 115.5 months (0:LIVING)
Prior Medication: Unknown

Sample ID: P-0000036-T01-IM3
Cancer Type: Lung Adenocarcinoma
Primary Site: Lung
Sample Type: Primary
Tumor Purity: 30.0%
Gene Panel: IMPACT341
Coverage: 380
TMB (nonsynonymous): 7.764087104
MSI Status: Do not report (Score: -1.0)
Somatic Status: Unmatched


----
### sequencing info

In [None]:
cna = pd.read_csv('msk_chord_2024/data_cna.txt', sep='\t', comment='#')
cna.head(1)

Unnamed: 0,Hugo_Symbol,P-0008840-T01-IM5,P-0050951-T01-IM6,P-0086178-T01-IM7,P-0020358-T01-IM6,P-0089413-T01-IM7,P-0033156-T01-IM6,P-0044605-T01-IM6,P-0077282-T01-IM7,P-0037126-T01-IM6,...,P-0011072-T01-IM5,P-0072579-T02-IM7,P-0050748-T01-IM6,P-0087758-T01-IM7,P-0062922-T01-IM7,P-0052296-T02-IM6,P-0048082-T01-IM6,P-0069255-T01-IM7,P-0060577-T01-IH3,P-0070319-T01-IH3
0,TAP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [43]:
cna_sample = cna[['Hugo_Symbol', sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']]]
cna_sample[(cna_sample[sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']] != 0) & (cna_sample[sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']].notna())]

Unnamed: 0,Hugo_Symbol,P-0000036-T01-IM3


In [None]:
mut = pd.read_csv('msk_chord_2024/data_mutations.txt', sep='\t', comment='#')
mut['allelic_fraction'] = mut['t_alt_count'] / (mut['t_ref_count'] + mut['t_alt_count'])
mut['HGV'] = mut['HGVSc'].apply(lambda x: x.split(':')[1] if pd.notna(x) and ':' in x else x)
mut.head(1)

  mut = pd.read_csv('msk_chord_2024/data_mutations.txt', sep='\t', comment='#')


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,amino_acid_change,cDNA_Change,cDNA_position,cdna_change,comments,n_depth,t_depth,transcript,allelic_fraction,HGV
0,EGFR,1956,MSKCC,GRCh37,7,55242470,55242487,+,inframe_deletion,In_Frame_Del,...,,,,,,,,,0.474465,c.2240_2257del


In [83]:
mut[mut['Tumor_Sample_Barcode'] == sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']].dropna(axis=1, how='all').columns

Index(['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome',
       'Start_Position', 'End_Position', 'Strand', 'Consequence',
       'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'Tumor_Sample_Barcode', 'Validation_Status', 'Mutation_Status', 'Score',
       't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count', 'HGVSc',
       'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq', 'Protein_position',
       'Codons', 'allelic_fraction', 'HGV'],
      dtype='object')

In [87]:
mut[mut['Tumor_Sample_Barcode'] == sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']][['Hugo_Symbol', 'Chromosome', 'Consequence', 
                                                                                            'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'Validation_Status', 'Mutation_Status', 'allelic_fraction', 'HGVSp_Short', 'HGV']]

Unnamed: 0,Hugo_Symbol,Chromosome,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,Validation_Status,Mutation_Status,allelic_fraction,HGVSp_Short,HGV
207948,NOTCH4,6,missense_variant,Missense_Mutation,SNP,A,A,C,rs150079294,Unknown,UNKNOWN,0.483108,p.C815G,c.2443T>G
207949,IRS1,2,missense_variant,Missense_Mutation,SNP,T,T,C,rs138145752,Unknown,UNKNOWN,0.444976,p.E461G,c.1382A>G
207950,TSHR,14,missense_variant,Missense_Mutation,SNP,C,C,T,rs142063461,Unknown,UNKNOWN,0.498062,p.P68S,c.202C>T
207951,TP53,17,"stop_gained,splice_region_variant",Nonsense_Mutation,SNP,G,G,A,,Unknown,UNKNOWN,0.057878,p.Q331*,c.991C>T
207952,ERBB2,17,missense_variant,Missense_Mutation,SNP,G,G,A,rs140272156,Unknown,UNKNOWN,0.420904,p.G1015E,c.3044G>A
207953,AR,X,missense_variant,Missense_Mutation,SNP,T,T,G,,Unknown,UNKNOWN,0.135274,p.L226V,c.676T>G
207954,FBXW7,4,inframe_deletion,In_Frame_Del,DEL,TCCTCCTCA,TCCTCCTCA,-,,Unknown,UNKNOWN,0.396825,p.D112_E114del,c.336_344del


In [48]:
sv = pd.read_csv('msk_chord_2024/data_sv.txt', sep='\t', comment='#')
sv.head(1)

Unnamed: 0,Sample_Id,SV_Status,Site1_Hugo_Symbol,Site2_Hugo_Symbol,Site1_Chromosome,Site2_Chromosome,Site1_Position,Site2_Position,Site1_Description,Site2_Description,...,Connection_Type,Annotation,DNA_Support,RNA_Support,SV_Length,Normal_Read_Count,Tumor_Read_Count,Normal_Variant_Count,Tumor_Variant_Count,Comments
0,P-0022424-T01-IM6,SOMATIC,SEPTIN12,ARID1A,16,1,4838352.0,27089562.0,5-UTR of SEPT12(-):11Kb before coding start,Exon 8 of ARID1A(+),...,3to3,ARID1A (NM_006015) - SEPT12 (NM_144605) rearra...,yes,unknown,0.0,0.0,0.0,0.0,10.0,Note: The ARID1A - SEPT12 rearrangement is a t...


In [66]:
sv[sv['Sample_Id'] == sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']].iloc[0]

Sample_Id                                              P-0000036-T01-IM3
SV_Status                                                        SOMATIC
Site1_Hugo_Symbol                                                   ROS1
Site2_Hugo_Symbol                                                SLC34A2
Site1_Chromosome                                                       6
Site2_Chromosome                                                       4
Site1_Position                                               117662374.0
Site2_Position                                                25667150.0
Site1_Description                                     Exon 30 of ROS1(-)
Site2_Description              Intron of SLC34A2(+): 600bp before exon 5
Site2_Effect_On_Frame                                                NaN
NCBI_Build                                                        GRCh37
Class                                                      TRANSLOCATION
Tumor_Split_Read_Count                             

### sequencing format

In [101]:
import pandas as pd
def make_msk_variant_tables(snv_df=None, cna_df=None, sv_df=None, sample_id=None):
    """
    Generate MSK-style mutation tables for SNV/indels, CNAs, and SVs.
    Each is returned as a pandas DataFrame.
    """

    tables = {}

    # SNV/Indel table (if provided)
    if snv_df is not None and not snv_df.empty:
        snv_table = snv_df[
            ["Hugo_Symbol", "Variant_Classification", "Consequence", 
             "HGVSp_Short", "HGV", "allelic_fraction"]
        ].copy()
        snv_table.columns = ["Gene", "Variant Type", "Consequence",
                             "Protein Change", "cDNA Change", "Allelic Fraction"]
        snv_table["Allelic Fraction"] = snv_table["Allelic Fraction"].round(3)
        snv_table.sort_values(["Gene"], inplace=True)
        tables["SNV/Indel"] = snv_table

    # CNA table
    if cna_df is not None and sample_id in cna_df.columns and not cna_df.empty:
        cna_table = cna_df[["Hugo_Symbol", sample_id]].copy()
        cna_table.columns = ["Gene", "Copy Number Alteration"]
        cna_table = cna_table[cna_table["Copy Number Alteration"] != 0]  # Remove neutral calls
        tables["CNA"] = cna_table

    # SV / Fusion table
    if sv_df is not None:
        sv_table = sv_df[
            ["Site1_Hugo_Symbol", "Site2_Hugo_Symbol", "Class", "Event_Info",
             "Tumor_Split_Read_Count", "Tumor_Paired_End_Read_Count", 
             "DNA_Support", "RNA_Support"]
        ].copy()
        sv_table.columns = ["Gene 1", "Gene 2", "Event Type", "Event Info",
                            "Split Reads", "Paired-End Reads", "DNA Support", "RNA Support"]
        tables["SV"] = sv_table

    return tables


# Example usage with your CNA and SV data
# Assuming cna_df and sv_df are loaded pandas DataFrames, and sample_id is the sample name string

snv_df = mut[mut['Tumor_Sample_Barcode'] == sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']][['Hugo_Symbol', 'Chromosome', 'Consequence', 
                                                                                            'Variant_Classification', 'Variant_Type', 'Reference_Allele',
       'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS',
       'Validation_Status', 'Mutation_Status', 'allelic_fraction', 'HGVSp_Short', 'HGV']]
cna_df = cna_sample[(cna_sample[sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']] != 0) & (cna_sample[sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']].notna())]
sv_df = sv[sv['Sample_Id'] == sample[sample['PATIENT_ID'] == p].iloc[0]['SAMPLE_ID']].iloc[0]

tables = make_msk_variant_tables(cna_df=cna_df, sv_df=sv_df, snv_df=snv_df, sample_id="P-0000036-T01-IM3")

tables

{'SNV/Indel':           Gene       Variant Type                        Consequence  \
 207953      AR  Missense_Mutation                   missense_variant   
 207952   ERBB2  Missense_Mutation                   missense_variant   
 207954   FBXW7       In_Frame_Del                   inframe_deletion   
 207949    IRS1  Missense_Mutation                   missense_variant   
 207948  NOTCH4  Missense_Mutation                   missense_variant   
 207951    TP53  Nonsense_Mutation  stop_gained,splice_region_variant   
 207950    TSHR  Missense_Mutation                   missense_variant   
 
         Protein Change   cDNA Change  Allelic Fraction  
 207953         p.L226V      c.676T>G             0.135  
 207952        p.G1015E     c.3044G>A             0.421  
 207954  p.D112_E114del  c.336_344del             0.397  
 207949         p.E461G     c.1382A>G             0.445  
 207948         p.C815G     c.2443T>G             0.483  
 207951         p.Q331*      c.991C>T             0.0

In [102]:
def format_msk_impact_report_with_variants(tables):
    # Base clinical info from earlier

    # Get variant tables
    variant_tables = tables

    # Build the report with variants
    report_lines = ["\nDETECTED VARIANTS\n"]

    if "SNV/Indel" in variant_tables:
        report_lines.append("  Single Nucleotide Variants / Indels")
        report_lines.append(variant_tables["SNV/Indel"].to_string(index=False))
        report_lines.append("")

    if "CNA" in variant_tables:
        report_lines.append("  Copy Number Alterations")
        report_lines.append(variant_tables["CNA"].to_string(index=False))
        report_lines.append("")

    if "SV" in variant_tables:
        report_lines.append("  Structural Variants / Fusions")
        report_lines.append(variant_tables["SV"].to_string(index=False))
        report_lines.append("")

    return "\n".join(report_lines)
report_text_with_variants = format_msk_impact_report_with_variants(tables)
print(report_text_with_variants)


DETECTED VARIANTS

  Single Nucleotide Variants / Indels
  Gene      Variant Type                       Consequence Protein Change  cDNA Change  Allelic Fraction
    AR Missense_Mutation                  missense_variant        p.L226V     c.676T>G             0.135
 ERBB2 Missense_Mutation                  missense_variant       p.G1015E    c.3044G>A             0.421
 FBXW7      In_Frame_Del                  inframe_deletion p.D112_E114del c.336_344del             0.397
  IRS1 Missense_Mutation                  missense_variant        p.E461G    c.1382A>G             0.445
NOTCH4 Missense_Mutation                  missense_variant        p.C815G    c.2443T>G             0.483
  TP53 Nonsense_Mutation stop_gained,splice_region_variant        p.Q331*     c.991C>T             0.058
  TSHR Missense_Mutation                  missense_variant         p.P68S     c.202C>T             0.498

  Structural Variants / Fusions
            ROS1
         SLC34A2
   TRANSLOCATION
Antisense fusion
 