In [None]:
# 🧩 Step 1: Combine Annotated TXT Files into a Single CSV
This script reads every `.txt` file in the specified folder
(e.g., `output/` or `output_03/`), extracts the sample ID
from each filename, adds it as an `ID` column, and concatenates the tables into a single CSV file.

In [1]:
import pandas as pd
from pathlib import Path
import re

# Folder holding the tab-separated annotation tables (one .txt file per sample).
folder = Path("C:/Users/LENOVO/Documents/ONCO50_PMS2_REANALYSIS/output_03")

# Collect one DataFrame per table, each tagged with the sample ID taken from
# its file name.
dfs = []
# "*.txt" (with the dot) only matches real .txt files; sorting makes the row
# order of the combined CSV independent of how the file system lists the folder.
for f in sorted(folder.glob("*.txt")):
    if not f.is_file():
        continue
    # dtype=str keeps every value exactly as written in the file.
    df = pd.read_csv(f, sep="\t", dtype=str)
    # Prefer the leading run of digits before the first underscore
    # (e.g. "10_S10_..." -> "10"); otherwise fall back to the text before the
    # first underscore of the file stem.
    match = re.match(r"(\d+)_", f.name)
    file_id = match.group(1) if match else f.stem.split("_")[0]
    df.insert(0, "ID", file_id)
    dfs.append(df)

# pd.concat raises an unhelpful ValueError on an empty list, so report the
# "nothing to combine" case explicitly instead of crashing.
if dfs:
    pd.concat(dfs, ignore_index=True).to_csv("combined_Batch1.csv", index=False)
else:
    print(f"No .txt files found in {folder}; nothing was written.")

In [2]:
import os
# Relative output paths such as "combined_Batch1.csv" are resolved against the
# kernel's working directory, so show where that is.
current_dir = os.getcwd()
print(current_dir)

c:\Users\LENOVO\Desktop\Python_Learn


In [None]:
## Optional: combine the annotated .txt files (same as Step 1), with progress messages and basic error handling

In [None]:
##################
import pandas as pd
from pathlib import Path
import re

def load_annotated_tables(directory):
    """Read every ``*.txt`` table in *directory* and tag it with its sample ID.

    Each file is parsed as a tab-separated table with all values kept as
    strings. A leading ``ID`` column is inserted, holding the run of digits
    before the first underscore of the file name (or, if the name does not
    start with digits, the text before the first underscore of the stem).

    Files are processed in sorted order so the result is reproducible, and a
    file with no content is reported and skipped instead of aborting the run.

    Parameters
    ----------
    directory : str or Path
        Folder to scan (not recursive).

    Returns
    -------
    list of pandas.DataFrame
        One frame per readable file; empty if nothing was found.
    """
    tables = []
    # "*.txt" (with the dot) avoids picking up names that merely end in "txt".
    for f in sorted(Path(directory).glob("*.txt")):
        if not f.is_file():
            continue
        print(f"Processing file: {f.name}")
        try:
            df = pd.read_csv(f, sep="\t", dtype=str)
        except pd.errors.EmptyDataError:
            # A zero-byte / header-less file should not stop the whole batch.
            print(f"Skipping empty file: {f.name}")
            continue
        match = re.match(r"(\d+)_", f.name)
        file_id = match.group(1) if match else f.stem.split("_")[0]
        df.insert(0, "ID", file_id)
        tables.append(df)
    return tables


folder = Path("C:/Users/LENOVO/Documents/ONCO50_PMS2_REANALYSIS/output")

dfs = load_annotated_tables(folder)

# Only write the CSV when at least one table was loaded.
if dfs:
    combined = pd.concat(dfs, ignore_index=True)
    combined.to_csv("combined.csv", index=False)
    print(f"Combined CSV written with shape: {combined.shape}")
else:
    print("No .txt files found in the folder.")

Processing file: 10_S10_annotated.hg19_multianno.txt
Processing file: 11_S11_annotated.hg19_multianno.txt
Processing file: 12_S12_annotated.hg19_multianno.txt
Processing file: 13_S13_annotated.hg19_multianno.txt
Processing file: 14_S14_annotated.hg19_multianno.txt
Processing file: 15_S15_annotated.hg19_multianno.txt
Processing file: 16_S16_annotated.hg19_multianno.txt
Processing file: 17_S17_annotated.hg19_multianno.txt
Processing file: 18_S18_annotated.hg19_multianno.txt
Processing file: 19_S19_annotated.hg19_multianno.txt
Processing file: 1_S1_annotated.hg19_multianno.txt
Processing file: 20_S20_annotated.hg19_multianno.txt
Processing file: 21_S21_annotated.hg19_multianno.txt
Processing file: 22_S22_annotated.hg19_multianno.txt
Processing file: 23_S23_annotated.hg19_multianno.txt
Processing file: 24_S24_annotated.hg19_multianno.txt
Processing file: 25_S25_annotated.hg19_multianno.txt
Processing file: 26_S26_annotated.hg19_multianno.txt
Processing file: 27_S27_annotated.hg19_multianno

In [2]:
import os
# "combined.csv" is written with a relative path, i.e. into the kernel's
# working directory; show where that is.
current_dir = os.getcwd()
print(current_dir)

c:\Users\LENOVO\Downloads


In [None]:
## Combine the normalized VCF files into a single table.
# ANNOVAR changed how the reference and alternate alleles are represented, so the alleles
# from the normalized VCFs are used to replace the reference and alternate alleles in the final CSV.

In [3]:
import pandas as pd
from pathlib import Path
import re
import gzip
import io

folder = Path("C:/Users/LENOVO/Documents/ONCO50_PMS2_REANALYSIS/output_03")


def read_normalized_vcf(vcf_path):
    """Parse one gzip-compressed VCF into a DataFrame of strings.

    ``##`` meta lines are ignored, the first ``#CHROM`` line supplies the
    column names (leading ``#`` removed) and every line that does not start
    with ``#`` is treated as a record. A leading ``Sample_ID`` column is
    inserted, holding the run of digits before the first underscore of the
    file name (e.g. ``123_normalized.vcf.gz`` -> ``123``), or the text before
    the first underscore of the stem when the name does not start with digits.

    Parameters
    ----------
    vcf_path : str or Path
        Path to a ``.vcf.gz`` file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed records, or ``None`` (after printing a warning) when the
        file has no header line or no records.
    """
    vcf_path = Path(vcf_path)
    header_line = None
    data_lines = []
    saw_content = False  # True once any line other than a "##" meta line is seen

    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the result does not depend on the machine it runs on.
    with gzip.open(vcf_path, "rt", encoding="utf-8") as fh:
        for line in fh:
            if line.startswith("##"):
                continue
            saw_content = True
            if line.startswith("#CHROM"):
                if header_line is None:
                    header_line = line
            elif not line.startswith("#"):
                data_lines.append(line)

    if not saw_content:
        print(f"⚠️ Skipping empty VCF: {vcf_path.name}")
        return None
    if header_line is None or not data_lines:
        print(f"⚠️ No usable content in: {vcf_path.name}")
        return None

    columns = header_line.strip().lstrip("#").split("\t")
    # na_filter=False keeps every field verbatim; with the default filter a
    # literal allele such as "NA" would be silently turned into a missing value.
    vcf_df = pd.read_csv(
        io.StringIO("".join(data_lines)),
        sep="\t",
        names=columns,
        dtype=str,
        na_filter=False,
    )

    match = re.match(r"(\d+)_", vcf_path.name)
    file_id = match.group(1) if match else vcf_path.stem.split("_")[0]
    vcf_df.insert(0, "Sample_ID", file_id)
    return vcf_df


vcf_dfs = []

# Sorted so the row order of the combined table is reproducible.
for f in sorted(folder.glob("*normalized.vcf.gz")):
    print(f"Processing zipped VCF: {f.name}")
    vcf_df = read_normalized_vcf(f)
    if vcf_df is not None:
        vcf_dfs.append(vcf_df)

# Save the combined VCF table
# NOTE(review): columns are taken from each file's own #CHROM line, so files
# whose header columns differ (presumably the per-sample genotype column) end
# up in separate columns after concat — confirm this is the intended layout.
if vcf_dfs:
    combined_Batch1vcf = pd.concat(vcf_dfs, ignore_index=True)
    out_path = folder / "combined_Batch1vcf.csv"
    combined_Batch1vcf.to_csv(out_path, index=False)
    print(f"✅ Combined zipped VCF saved to:\n{out_path}\nShape: {combined_Batch1vcf.shape}")
else:
    print("⚠️ No valid zipped VCF files found.")

Processing zipped VCF: 10_S7_normalized.vcf.gz
Processing zipped VCF: 11_S8_normalized.vcf.gz
Processing zipped VCF: 12_S9_normalized.vcf.gz
Processing zipped VCF: 13_S10_normalized.vcf.gz
Processing zipped VCF: 14_S11_normalized.vcf.gz
Processing zipped VCF: 15_S12_normalized.vcf.gz
Processing zipped VCF: 17_S13_normalized.vcf.gz
Processing zipped VCF: 18_S14_normalized.vcf.gz
Processing zipped VCF: 19_S15_normalized.vcf.gz
Processing zipped VCF: 20_S16_normalized.vcf.gz
Processing zipped VCF: 21_S17_normalized.vcf.gz
Processing zipped VCF: 22_S18_normalized.vcf.gz
Processing zipped VCF: 23_S19_normalized.vcf.gz
Processing zipped VCF: 24_S20_normalized.vcf.gz
Processing zipped VCF: 25_S21_normalized.vcf.gz
Processing zipped VCF: 27_S22_normalized.vcf.gz
Processing zipped VCF: 28_S23_normalized.vcf.gz
Processing zipped VCF: 29_S24_normalized.vcf.gz
Processing zipped VCF: 2_S1_normalized.vcf.gz
Processing zipped VCF: 30_S25_normalized.vcf.gz
Processing zipped VCF: 31_S26_normalized.vcf.

In [None]:
# Both merged batches use similar (overlapping) IDs, so the IDs need to be replaced with the original sample names.

In [10]:
import pandas as pd

# Load the combined table whose "ID" column is to be renamed. Everything is
# read as text so pandas does not re-infer column types on the way through
# (the values are written back out unchanged apart from "ID").
main_df = pd.read_csv(
    "C:/Users/LENOVO/Documents/ONCO50_PMS2_REANALYSIS/output_03/combined_1.csv",
    dtype=str,
)

# Load the lookup table; it must provide an "ID" column (the key matched
# against main_df["ID"]) and a "NAME" column (the replacement value).
mapping_df = pd.read_csv("C:/Users/LENOVO/Documents/ONCO50/sample_name.csv", dtype=str)

# Create a dictionary: ID → Name.
# Both sides are compared as stripped text so a stray space or a dtype
# difference between the two files cannot silently prevent every match.
id_lookup = mapping_df.dropna(subset=["ID", "NAME"]).assign(
    ID=lambda frame: frame["ID"].str.strip()
)
duplicate_ids = sorted(id_lookup.loc[id_lookup["ID"].duplicated(keep=False), "ID"].unique())
if duplicate_ids:
    print(f"⚠️ Duplicate IDs in mapping file (last entry wins): {duplicate_ids}")
id_to_name = (
    id_lookup.drop_duplicates(subset="ID", keep="last").set_index("ID")["NAME"].to_dict()
)

# Replace values in the main dataframe's ID column; rows whose ID has no
# entry in the mapping keep their original ID.
original_ids = main_df["ID"].str.strip()
mapped_names = original_ids.map(id_to_name)
main_df["ID"] = mapped_names.fillna(main_df["ID"])

# Save result (relative path: written to the kernel's working directory).
main_df.to_csv("vcf_with_named_ids.csv", index=False)

# Report what actually happened instead of unconditionally claiming success.
unmatched_ids = sorted(original_ids[mapped_names.isna()].dropna().unique())
if mapped_names.notna().any():
    print("✅ ID values replaced with names using mapping.")
else:
    print("⚠️ No ID values matched the mapping; IDs were left unchanged.")
if unmatched_ids:
    print(f"⚠️ IDs without a name in the mapping (kept as-is): {unmatched_ids}")

✅ ID values replaced with names using mapping.


In [9]:
import os
# "vcf_with_named_ids.csv" is written with a relative path, i.e. into the
# kernel's working directory; show where that is.
current_dir = os.getcwd()
print(current_dir)

c:\Users\LENOVO\Desktop\Python_Learn
