# Viral Phylogenetic Tree Construction from FASTQ Files

In [1]:
import os
import subprocess
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
import matplotlib.pyplot as plt

In [2]:
# Define paths and samples
# Reference genome FASTA used for read mapping and consensus generation.
ref_genome = "working_files/viral_ref_genome_output.fasta"

# Samples to process. The pipeline iterates over this mapping's keys and uses
# each key as a path prefix ("<sample>_R1.fastq", "<sample>_R2.fastq", ...).
# It is empty here and has to be filled in before the pipeline is run.
samples = {}

In [None]:
# Step 1: Quality-check FASTQ files with FastQC (report only; reads are not trimmed or filtered)
def preprocess_fastq(sample):
    """Run FastQC on the paired-end FASTQ files of one sample.

    Args:
        sample: Path prefix of the sample. The reads are expected at
            ``<sample>_R1.fastq`` and ``<sample>_R2.fastq``.

    Raises:
        subprocess.CalledProcessError: If FastQC exits with a non-zero status.
    """
    r1 = f"{sample}_R1.fastq"
    r2 = f"{sample}_R2.fastq"
    out_dir = "working_files/fastqc"
    # FastQC expects its output directory to exist already.
    os.makedirs(out_dir, exist_ok=True)
    print(f"Running FastQC on {sample}")
    # check=True: surface a failed QC run instead of silently continuing.
    subprocess.run(["fastqc", r1, r2, "-o", out_dir], check=True)

In [None]:
# Step 2: Map reads to reference genome and create sorted BAM
def map_reads(sample):
    """Map one sample's paired-end reads to the reference genome.

    Runs ``bwa mem`` and converts the SAM output into a sorted, indexed BAM
    with samtools. The intermediate ``<sample>.sam`` and ``<sample>.bam``
    files are left on disk.

    Args:
        sample: Path prefix of the sample. The reads are expected at
            ``<sample>_R1.fastq`` and ``<sample>_R2.fastq``.

    Returns:
        Path of the sorted and indexed BAM file (``<sample>_sorted.bam``).

    Raises:
        subprocess.CalledProcessError: If any external tool exits with a
            non-zero status.
    """
    r1 = f"{sample}_R1.fastq"
    r2 = f"{sample}_R2.fastq"
    sam = f"{sample}.sam"
    bam = f"{sample}.bam"
    sorted_bam = f"{sample}_sorted.bam"

    # bwa mem needs the BWA index files next to the reference; build them
    # once if they are not there yet.
    if not os.path.exists(f"{ref_genome}.bwt"):
        print(f"Indexing reference {ref_genome}...")
        subprocess.run(["bwa", "index", ref_genome], check=True)

    print(f"Mapping {sample} to reference...")
    # Options are placed before the positional arguments so that parsing does
    # not depend on getopt implementations that permute arguments.
    # check=True stops the pipeline at the first failing step rather than
    # feeding a missing or truncated file into the next tool.
    subprocess.run(["bwa", "mem", "-o", sam, ref_genome, r1, r2], check=True)
    subprocess.run(["samtools", "view", "-S", "-b", "-o", bam, sam], check=True)
    subprocess.run(["samtools", "sort", "-o", sorted_bam, bam], check=True)
    subprocess.run(["samtools", "index", sorted_bam], check=True)
    return sorted_bam

In [None]:
# Step 3: Generate consensus FASTA from sorted BAM
def generate_consensus(sample, sorted_bam):
    """Call variants for one sample and apply them to the reference genome.

    Pipes ``bcftools mpileup`` into ``bcftools call``, writes a
    bgzip-compressed VCF (``<sample>.vcf.gz``), indexes it, and lets
    ``bcftools consensus`` produce the consensus FASTA.

    Args:
        sample: Path prefix of the sample; used to name the output files.
        sorted_bam: Path of the sample's sorted, indexed BAM file.

    Returns:
        Path of the consensus FASTA file (``<sample>.fa``).

    Raises:
        subprocess.CalledProcessError: If any bcftools step exits with a
            non-zero status.
    """
    # bcftools consensus needs a bgzip-compressed, indexed VCF, so the calls
    # are written compressed (-Oz) and indexed below.
    vcf = f"{sample}.vcf.gz"
    consensus = f"{sample}.fa"

    # The pipeline is built from argument lists instead of a shell string so
    # that paths containing spaces or shell metacharacters are passed through
    # unchanged and the exit status of both stages can be checked.
    mpileup = subprocess.Popen(
        ["bcftools", "mpileup", "-Ou", "-f", ref_genome, sorted_bam],
        stdout=subprocess.PIPE,
    )
    try:
        subprocess.run(
            ["bcftools", "call", "-c", "-Oz", "-o", vcf, "-"],
            stdin=mpileup.stdout,
            check=True,
        )
    finally:
        # Close our end of the pipe so mpileup can terminate even when the
        # caller stage failed early, then collect its exit status.
        mpileup.stdout.close()
        mpileup_status = mpileup.wait()
    if mpileup_status != 0:
        raise subprocess.CalledProcessError(mpileup_status, mpileup.args)

    # -f: overwrite a stale index left behind by a previous run.
    subprocess.run(["bcftools", "index", "-f", vcf], check=True)
    subprocess.run(
        ["bcftools", "consensus", "-f", ref_genome, "-o", consensus, vcf],
        check=True,
    )
    # NOTE(review): the FASTA header is presumably inherited from the
    # reference, so different samples may share the same record ID - confirm.
    return consensus

In [None]:
# Step 4: Align all consensus FASTA files and build a phylogenetic tree
def _concat_with_unique_ids(fasta_paths, out_path):
    """Concatenate FASTA files, renaming each record after its source file."""
    with open(out_path, "w") as outfile:
        for path in fasta_paths:
            # "dir/sampleA.fa" -> "sampleA"; assumes file base names are unique.
            label = os.path.splitext(os.path.basename(path))[0]
            n_records = 0
            last_line = "\n"
            with open(path) as infile:
                for line in infile:
                    if line.startswith(">"):
                        n_records += 1
                        # Further records of the same file get a numeric suffix.
                        suffix = "" if n_records == 1 else f"_{n_records}"
                        line = f">{label}{suffix}\n"
                    outfile.write(line)
                    last_line = line
            # Keep the next file's header from being glued onto this sequence.
            if not last_line.endswith("\n"):
                outfile.write("\n")


def build_tree(consensus_files):
    """Align consensus sequences with MAFFT and draw a neighbor-joining tree.

    The input files are merged into ``input_for_mafft.fasta``, aligned into
    ``aligned.fasta``, and a tree built from identity distances is displayed
    with matplotlib.

    Records are renamed after the file they come from: consensus files derived
    from the same reference would otherwise carry identical IDs, and the
    distance matrix rejects duplicate names.

    Args:
        consensus_files: Iterable of paths to consensus FASTA files.

    Raises:
        ValueError: If ``consensus_files`` is empty.
        subprocess.CalledProcessError: If MAFFT exits with a non-zero status.
    """
    consensus_files = list(consensus_files)
    if not consensus_files:
        # Fail before touching the file system or launching MAFFT.
        raise ValueError("build_tree() needs at least one consensus FASTA file")

    aligned_file = "aligned.fasta"
    concat_input = "input_for_mafft.fasta"
    _concat_with_unique_ids(consensus_files, concat_input)

    print("Running MAFFT alignment...")
    # MAFFT writes the alignment to stdout; redirect it without a shell.
    with open(aligned_file, "w") as aligned:
        subprocess.run(["mafft", "--auto", concat_input], stdout=aligned, check=True)

    alignment = AlignIO.read(aligned_file, "fasta")
    calculator = DistanceCalculator('identity')
    dm = calculator.get_distance(alignment)
    constructor = DistanceTreeConstructor()
    tree = constructor.nj(dm)

    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(1, 1, 1)
    Phylo.draw(tree, axes=ax)
    plt.show()

In [None]:
# === Main Pipeline ===
def run_pipeline():
    """Run the whole workflow for every configured sample.

    For each entry of ``samples`` this runs FastQC, maps the reads, and
    generates a consensus FASTA; the collected consensus files are then
    aligned and turned into a phylogenetic tree. Does nothing except print
    a message when no samples are configured.
    """
    if not samples:
        # Without any consensus sequence there is nothing to align or plot.
        print("No samples configured; nothing to do.")
        return

    consensus_files = []
    for sample in samples:
        preprocess_fastq(sample)
        sorted_bam = map_reads(sample)
        consensus = generate_consensus(sample, sorted_bam)
        consensus_files.append(consensus)
    build_tree(consensus_files)


if __name__ == "__main__":
    run_pipeline()