In [161]:
import sys
import numpy as np
import random
import subprocess

### Simulated Contamination

In [162]:
# Number of sequence characters per read; passed to parse_fasta as `read_length`.
READ_LENGTH = 150
# Number of sequence characters per output line; passed to write_fasta as `limit`.
LINE_LENGTH = 60
# Contaminating reads to introduce, as a fraction of the number of human reads.
CONTAMINATION_RATE = 0.01

In [163]:
# First header line of the contaminated output file.
CONTAMINATED_HEADER_LINE = ">Contaminated Human Chr. 21 and Escherichia coli, complete genome"
# First header line of the uncontaminated output file.
UNCONTAMINATED_HEADER_LINE = ">Uncontaminated Human Chr. 21, complete genome"
# Prefix of the numbered header that write_fasta emits after every full line.
FASTA_HEADER = ">reads.fna"

In [164]:
def parse_fasta(filename, read_length, limit=-1):
    """Split the sequence data of a FASTA file into fixed-length reads.

    Header lines (those starting with ">") are skipped, so the sequence lines
    of every record in the file are treated as one continuous stream that is
    cut into consecutive chunks of ``read_length`` characters.

    Args:
        filename: Path of the FASTA file to read.
        read_length: Number of sequence characters per read; must be positive.
        limit: Maximum number of reads to return. A negative value (the
            default) means no limit.

    Returns:
        A list of read strings. Every read has exactly ``read_length``
        characters, except possibly the last one, which holds whatever
        sequence was left over when the file ended. When ``limit`` is
        reached the leftover sequence is discarded, so the result never
        holds more than ``limit`` reads.

    Raises:
        ValueError: If ``read_length`` is not positive.
    """
    if read_length <= 0:
        # A non-positive length could never fill a read and would loop forever.
        raise ValueError("read_length must be positive, got %r" % (read_length,))

    reads = []
    read = ""
    with open(filename, "r") as fa:
        if limit == 0:
            return reads
        for line in fa:
            if line.startswith(">"):
                continue
            # Strip the line terminator explicitly instead of slicing off the
            # last character: the final line may have no newline at all, and
            # CRLF files would otherwise leave "\r" inside the reads.
            sequence = line.rstrip("\r\n")
            offset = 0
            while offset < len(sequence):
                take = read_length - len(read)
                read += sequence[offset:offset + take]
                offset += take
                if len(read) == read_length:
                    reads.append(read)
                    read = ""
                    # Check after every completed read so that a long line
                    # cannot push the result past the requested limit.
                    if len(reads) == limit:
                        return reads
    if read:
        reads.append(read)
    return reads

In [165]:
def write_fasta(filename, reads, header, limit=150):
    """Write reads to a FASTA file as consecutive chunks of ``limit`` characters.

    The reads are treated as one continuous stream of characters. The file
    starts with ``header``; every time a line has been filled with ``limit``
    characters and more sequence follows, the line is terminated and a
    numbered header of the form ``FASTA_HEADER + " read <n>:"`` (n counting
    from 1) is written before the next chunk.

    Args:
        filename: Path of the file to create or overwrite.
        reads: Iterable of sequence strings to write, in order.
        header: First header line of the file, without a trailing newline.
        limit: Number of sequence characters per line; must be positive.

    Raises:
        ValueError: If ``limit`` is not positive.
    """
    if limit <= 0:
        # With no room on a line the chunking loop would never advance and
        # would keep writing headers to disk forever.
        raise ValueError("limit must be positive, got %r" % (limit,))

    index = 0
    filled = 0  # number of characters already written on the current line
    with open(filename, "w") as fw:
        fw.write(header + "\n")
        for read in reads:
            offset = 0
            # Only break the line when more sequence remains than fits on it,
            # so a header is never written without sequence after it.
            while len(read) - offset > limit - filled:
                room = limit - filled
                fw.write(read[offset:offset + room] + "\n")
                offset += room
                index += 1
                fw.write(FASTA_HEADER + " read %d:\n" % (index))
                filled = 0
            fw.write(read[offset:])
            filled += len(read) - offset
        if filled:
            # Terminate the last sequence line so the file ends with a newline.
            fw.write("\n")

In [166]:
# Split the sequence in chr21.fa into reads of READ_LENGTH characters.
human_genome = parse_fasta("chr21.fa", read_length=READ_LENGTH)

In [167]:
# Write the unmodified reads, LINE_LENGTH sequence characters per line.
write_fasta(
    "human.fna",
    human_genome,
    UNCONTAMINATED_HEADER_LINE,
    limit=LINE_LENGTH,
)

In [168]:
# The number of contaminating reads is a fixed fraction of the human read count.
n_contaminants = int(CONTAMINATION_RATE * len(human_genome))
print("introducing %d contaminants" % n_contaminants)

# Load only as many reads from the second genome as will be inserted.
ecoli_genome = parse_fasta(
    "GCF_000005845.2_ASM584v2_genomic.fna",
    read_length=READ_LENGTH,
    limit=n_contaminants,
)

introducing 3114 contaminants


In [169]:
# Use a dedicated, seeded generator so the contaminated dataset is the same on
# every fresh run and the global `random` state is left untouched.
contamination_rng = random.Random(0)

# NOTE: this mutates human_genome in place, so re-running this cell without
# re-running the cell that loads human_genome inserts the contaminants again.
for read in ecoli_genome:
    # randint is inclusive at both ends; pos == len(human_genome) appends.
    pos = contamination_rng.randint(0, len(human_genome))
    human_genome.insert(pos, read)

In [170]:
# Write the reads again, now including the inserted contaminants.
write_fasta(
    "contaminated_human_ecoli.fna",
    human_genome,
    CONTAMINATED_HEADER_LINE,
    limit=LINE_LENGTH,
)

In [None]:
# Run bowtie2 and wait for it to finish, sending its standard output to
# alignments.txt.
#
# The command is passed as an argument list: a single command string without
# shell=True is treated as the name of the program to execute on POSIX.
# check=True raises CalledProcessError when bowtie2 exits with a non-zero
# status instead of letting the failure go unnoticed.
#
# NOTE(review): the cells above write "contaminated_human_ecoli.fna", but this
# command reads "contaminant21.fna" -- confirm which input file is intended.
# NOTE(review): only stdout is redirected here; confirm whether bowtie2's
# stderr output should be captured in alignments.txt as well.
with open("alignments.txt", "w") as bowtie:
    subprocess.run(
        ["bowtie2", "-x", "chrom21", "-f", "contaminant21.fna", "-S", "cont.sam"],
        stdout=bowtie,
        check=True,
    )

### RNN for Classification of Target Genome