In [1]:
import os
import pandas as pd

# Folder that holds the study's tab-delimited .txt files.
data_dir = r"C:\Users\VN-ChoHyunJae\Downloads\msk_chord_2024\msk_chord_2024"

# Files whose names do not carry the "data_timeline_" prefix.
_non_timeline_files = (
    "data_clinical_patient.txt",
    "data_clinical_sample.txt",
    "data_cna.txt",
    "data_mutations.txt",
)

# Files whose names carry the "data_timeline_" prefix.
_timeline_files = (
    "data_timeline_cancer_presence.txt",
    "data_timeline_cea_labs.txt",
    "data_timeline_diagnosis.txt",
    "data_timeline_pdl1.txt",
    "data_timeline_performance_status.txt",
    "data_timeline_progression.txt",
    "data_timeline_radiation.txt",
    "data_timeline_surgery.txt",
    "data_timeline_treatment.txt",
    "data_timeline_tumor_sites.txt",
)

# Names (relative to data_dir) of every file to inspect, in reporting order.
files = [*_non_timeline_files, *_timeline_files]

def get_columns(filepath):
    """
    Return the column names from the header row of a tab-delimited .txt file.

    cBioPortal clinical/timeline files often have comment lines starting with
    '#' before the actual header row. Those lines, and any blank lines, are
    skipped; the first remaining line is treated as the header.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A list with one entry per tab-separated header cell, each stripped of
        surrounding whitespace. Empty cells are kept as "" so the list length
        equals the real column count. Returns [] when the file contains no
        header line (empty file, or only comment/blank lines).

    Raises:
        OSError: If the file cannot be opened (includes FileNotFoundError).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise hide a leading '#' or pollute the first name.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            # Remove only the line terminator before splitting: a bare
            # strip() would also eat leading/trailing tabs and silently drop
            # empty first/last columns.
            return [cell.strip() for cell in line.rstrip("\r\n").split("\t")]
    return []

def _format_section(filename, columns):
    """Return the report lines for one file: separator, name, numbered columns."""
    section = [
        "=" * 60,
        f"File: {filename}",
        f"Columns ({len(columns)}):",
    ]
    section.extend(f"  {i:>3}. {col}" for i, col in enumerate(columns, 1))
    return section


# Maps each successfully read filename to its list of column names.
# Files that could not be read are reported on stdout and left out.
results = {}

for filename in files:
    filepath = os.path.join(data_dir, filename)
    try:
        columns = get_columns(filepath)
    except FileNotFoundError:
        print(f"\n[ERROR] File not found: {filename}")
        continue
    except (OSError, UnicodeError) as e:
        # Unreadable or undecodable file: report it and keep going with the
        # remaining files. Anything else is a programming error and should
        # surface instead of being labelled a read failure.
        print(f"\n[ERROR] Could not read {filename}: {e}")
        continue
    results[filename] = columns
    # Leading blank line separates this section from the previous output.
    print("\n" + "\n".join(_format_section(filename, columns)))

# Optional: save summary to a text file in the same directory
summary_path = os.path.join(data_dir, "column_summary.txt")
try:
    with open(summary_path, "w", encoding="utf-8") as out:
        for filename, columns in results.items():
            # Same section text as the console report, followed by a blank line.
            out.write("\n".join(_format_section(filename, columns)) + "\n\n")
except (OSError, UnicodeError) as e:
    # Saving is best-effort: the columns were already printed above.
    print(f"\n[WARNING] Could not save summary file: {e}")
else:
    print(f"\n\nSummary saved to: {summary_path}")


File: data_clinical_patient.txt
Columns (26):
    1. PATIENT_ID
    2. GENDER
    3. RACE
    4. ETHNICITY
    5. CURRENT_AGE_DEID
    6. STAGE_HIGHEST_RECORDED
    7. NUM_ICDO_DX
    8. ADRENAL_GLANDS
    9. BONE
   10. CNS_BRAIN
   11. INTRA_ABDOMINAL
   12. LIVER
   13. LUNG
   14. LYMPH_NODES
   15. OTHER
   16. PLEURA
   17. REPRODUCTIVE_ORGANS
   18. SMOKING_PREDICTIONS_3_CLASSES
   19. GLEASON_FIRST_REPORTED
   20. GLEASON_HIGHEST_REPORTED
   21. HISTORY_OF_PDL1
   22. PRIOR_MED_TO_MSK
   23. OS_MONTHS
   24. OS_STATUS
   25. HR
   26. HER2

File: data_clinical_sample.txt
Columns (24):
    1. SAMPLE_ID
    2. PATIENT_ID
    3. GLEASON_SAMPLE_LEVEL
    4. PDL1_POSITIVE
    5. CANCER_TYPE
    6. SAMPLE_TYPE
    7. SAMPLE_CLASS
    8. METASTATIC_SITE
    9. PRIMARY_SITE
   10. CANCER_TYPE_DETAILED
   11. GENE_PANEL
   12. SAMPLE_COVERAGE
   13. TUMOR_PURITY
   14. ONCOTREE_CODE
   15. MSI_COMMENT
   16. MSI_SCORE
   17. MSI_TYPE
   18. SOMATIC_STATUS
   19. CLINICAL_GROUP
   20. P