# Functions

In [1]:
import vcf

def vcf_to_bed(vcf_path, bed_path):
    """Write one BED line per record of a VCF file.

    Each output line holds four tab-separated fields: ``CHROM``, ``POS - 1``,
    ``POS`` and the record's ``INFO`` value rendered with its default string
    conversion.

    Args:
        vcf_path: Path of the VCF file to read.
        bed_path: Path of the BED file to write (opened in ``'w'`` mode, so an
            existing file is overwritten).
    """
    # Manage the VCF handle with ``with`` so it is closed even if parsing or
    # writing fails; passing a bare ``open(...)`` to the reader leaks it.
    with open(vcf_path, 'r') as vcf_file:
        vcf_reader = vcf.Reader(vcf_file)

        with open(bed_path, 'w') as bed_file:
            for record in vcf_reader:
                chrom = record.CHROM
                start = record.POS - 1  # VCF is 1-based, BED start is 0-based
                end = record.POS
                info = record.INFO

                bed_file.write(f"{chrom}\t{start}\t{end}\t{info}\n")


# Remove Spikes from BAM File

In [None]:
import subprocess

import pysam

def remove_spikes(input_bam, bed_file, output_bam):
    """Filter a BAM file through ``samtools view`` using a BED file.

    Runs ``samtools view -b -L <bed_file> -o <output_bam> <input_bam>`` and
    prints either a success message or the ``CalledProcessError`` raised when
    the command exits with a non-zero status.

    Args:
        input_bam: Path of the BAM file given to samtools as input.
        bed_file: Path of the BED file passed to ``-L``.
        output_bam: Path passed to ``-o`` for the resulting BAM file.
    """
    # NOTE(review): per the samtools manual, ``-L`` keeps only alignments that
    # overlap the BED regions -- confirm this matches the "remove" intent.
    samtools_args = ['samtools', 'view', '-b']
    samtools_args += ['-L', bed_file]
    samtools_args += ['-o', output_bam]
    samtools_args.append(input_bam)

    try:
        subprocess.run(samtools_args, check=True)
    except subprocess.CalledProcessError as err:
        print(f"Error: {err}")
    else:
        print(f"Spikes removed successfully. Output saved to {output_bam}")

def _read_overlaps_regions(regions, read):
    """Return True when *read* overlaps at least one entry of *regions*."""
    # Reads without an alignment end (e.g. unmapped reads) have no interval
    # to query with.
    if read.is_unmapped or read.reference_end is None:
        return False
    try:
        hits = regions.fetch(
            read.reference_name, read.reference_start, read.reference_end
        )
    except ValueError:
        # Raised by the tabix lookup when the region cannot be queried, e.g.
        # the contig is absent from the index; treat it as "no overlap".
        return False
    # ``fetch`` returns an iterator, which is truthy even when it yields
    # nothing, so overlap has to be tested by pulling the first entry.
    return next(hits, None) is not None


def extract_regions(input_bam, bed_file, output_bam):
    """Copy the reads of *input_bam* that overlap *bed_file* into *output_bam*.

    Args:
        input_bam: Path of the BAM file to read. It is iterated with
            ``fetch()``, which needs a BAM index.
        bed_file: Path of the region file opened with ``pysam.TabixFile``.
            NOTE(review): TabixFile expects a bgzip-compressed, tabix-indexed
            file -- confirm callers do not pass a plain-text BED.
        output_bam: Path of the BAM file to write; it reuses the header of
            *input_bam*.
    """
    # Step 2: Extract regions from the first BAM file
    regions = pysam.TabixFile(bed_file)
    try:
        with pysam.AlignmentFile(input_bam, "rb") as bam_file, \
             pysam.AlignmentFile(output_bam, "wb", header=bam_file.header) as output_bam_file:
            # Stream matching reads straight to the output instead of
            # buffering every selected read in memory first.
            for read in bam_file.fetch():
                if _read_overlaps_regions(regions, read):
                    output_bam_file.write(read)
    finally:
        regions.close()

def simulate_insertions(reference_genome, extracted_bam, output_prefix):
    """Run ``art_illumina`` to simulate reads (pipeline step 3).

    The command is executed with ``check=True``, so a non-zero exit status
    raises ``subprocess.CalledProcessError``.

    Args:
        reference_genome: Currently unused by this function.
        extracted_bam: Path handed to ART's ``-i`` option.
            NOTE(review): ART documents ``-i`` as a reference sequence file;
            confirm that passing a BAM here is intended.
        output_prefix: Prefix handed to ART's ``-o`` option.
    """
    profile_opts = ('-ss', 'HS25')
    input_opts = ('-i', extracted_bam)
    length_opts = ('-l', '150')
    coverage_opts = ('-f', '50')
    output_opts = ('-o', output_prefix)

    art_args = [
        'art_illumina',
        *profile_opts,
        '-sam',
        *input_opts,
        *length_opts,
        *coverage_opts,
        *output_opts,
        # NOTE(review): ART's usage text describes -na as "do not output ALN
        # alignment file"; confirm it does what this pipeline expects.
        '-na',
    ]
    subprocess.run(art_args, check=True)

def merge_bams(original_bam, simulated_bam, merged_bam):
    """Concatenate two alignment files into one BAM, then sort and index it.

    All reads of *original_bam* are written first, followed by all reads of
    ``simulated_bam + '.sam'``. The merged file is then sorted into a sibling
    file whose name ends in ``_sorted.bam``, and that sorted file is indexed.

    Args:
        original_bam: Path of the BAM file whose header and reads are copied.
        simulated_bam: Path *without* the ``.sam`` extension; the function
            appends ``'.sam'`` itself before opening the file.
        merged_bam: Path of the unsorted merged BAM file to create.
    """
    # Step 4: Merge BAM files
    # NOTE(review): simulated reads are written under the original header;
    # this presumably requires both files to list references in the same
    # order -- confirm against the simulator's SAM header.
    with pysam.AlignmentFile(original_bam, "rb") as original_file, \
         pysam.AlignmentFile(simulated_bam + '.sam', "r") as simulated_file, \
         pysam.AlignmentFile(merged_bam, "wb", header=original_file.header) as output_bam:
        for source in (original_file, simulated_file):
            for read in source:
                output_bam.write(read)

    # Step 5: Sort and index merged BAM file
    # Only a trailing '.bam' is swapped for '_sorted.bam'. str.replace would
    # also rewrite '.bam' inside directory names, and would leave a path
    # without '.bam' unchanged, making the sort overwrite its own input.
    bam_suffix = '.bam'
    if merged_bam.endswith(bam_suffix):
        sorted_bam = merged_bam[:-len(bam_suffix)] + '_sorted.bam'
    else:
        sorted_bam = merged_bam + '_sorted.bam'

    pysam.sort('-o', sorted_bam, merged_bam)
    pysam.index(sorted_bam)

if __name__ == "__main__":
    # Placeholder paths -- replace them with real locations before running.
    input_bam = "/path/to/your/original.bam"
    bed_file = "/path/to/your/simulated_insertions.bed"
    output_bam = "/path/to/your/extracted_reads.bam"
    reference_genome = "/path/to/your/reference_genome.fasta"
    output_prefix = "/path/to/your/simulated_insertions"
    merged_bam = "/path/to/your/final_merged.bam"

    # Extract regions from the original BAM file
    extract_regions(input_bam, bed_file, output_bam)

    # Simulate insertions
    simulate_insertions(reference_genome, output_bam, output_prefix)

    # Merge the original and simulated BAM files.
    # merge_bams() appends '.sam' to its second argument, so pass the bare
    # prefix given to the simulator; adding '.bam' here would make it look
    # for '<prefix>.bam.sam'.
    merge_bams(input_bam, output_prefix, merged_bam)