# Bifurcating Tree Dataset

This notebook creates a bifurcating-tree dataset of 16 samples, with three sequencing runs per sample.

In [1]:
import gzip
import math
import os
import random
import shlex
import shutil
import string

import utils

In [2]:
# --- Reproducibility ---
seed = 1003          # value handed to the RNG seeding calls below

# --- Dataset shape ---
genome_size = 10     # mbp
num_samples = 16
num_runs = 3         # read sets simulated per sample genome

# --- Reads per run ---
mean_n_reads = 5e6
sd_n_reads = 0.1 * mean_n_reads      # Coeff of Var = 0.1
min_n_reads = mean_n_reads / 100.0   # lower bound applied to each drawn read count


Set a random seed and seed the RNG

In [3]:
# Seed this notebook's `random` module and the one reachable as
# `utils.random` with the same value.
for rng_module in (random, utils.random):
    rng_module.seed(seed)

Make a random genome and derive the sample genomes from it, then write the derived sequences to a FASTA file.

In [None]:
# Base genome that all sample sequences are derived from; `genome_size`
# (configured above, in mbp) is passed as the helper's `mbp` argument.
genome = utils.make_rand_genome(mbp=genome_size)

In [None]:
# Number of tree levels: log2 of the sample count, rounded up.
# math.ceil() already returns an int in Python 3.
levels = math.ceil(math.log2(num_samples))

In [None]:
# Derive the sample sequences from the base genome, then collapse the
# result (presumably nested per tree level -- confirm in utils) into a
# plain list of sequences.
bifork = utils.biforcating_sequences(
    genome,
    levels=levels,
    av_rate=0.0001,
    sd_rate=0.00001,
)
seqlist = [*utils.flatten(bifork)]

In [None]:
# Save every derived sequence into one multi-FASTA file.
with open("data/bifork_10mb.fas", 'w') as fasta_out:
    utils.print_multifasta(seqlist, file=fasta_out)

## Make NJ tree


In [None]:
from skbio import Alignment, DNA
from skbio.tree import nj

In [None]:
# Read back the multi-FASTA file written earlier in this notebook.
# NOTE(review): relies on skbio's `Alignment` class -- confirm the installed
# scikit-bio version still provides it.
aln = Alignment.read('data/bifork_10mb.fas')

In [None]:
# Distance matrix computed from the alignment.
# NOTE(review): uses whatever metric `distances()` defaults to -- confirm.
distmat = aln.distances()

In [None]:
# Bare expression: display the distance matrix as this cell's output.
distmat

In [None]:
# Build the neighbour-joining tree from the distance matrix.
tree = nj(distmat)

In [None]:
# Save the tree next to the FASTA file.
# NOTE(review): the .nwk extension suggests Newick output -- confirm the
# format `write()` uses when none is given.
tree.write('data/bifork_10mb.nwk')

In [None]:
# Print a text rendering of the tree for a quick visual check.
print(tree.ascii_art())

### Generate reads

In [None]:
# Start from empty output directories so files from an earlier run cannot
# leak into this one.  Done in Python rather than `!rm -rf ... ; mkdir ...`
# so the cell does not need a POSIX shell and also works when `data/`
# itself does not exist yet.
for subdir in ['genomes', 'fastq', 'countgraphs']:
    path = os.path.join('data', subdir)
    if os.path.islink(path) or os.path.isfile(path):
        # `rm -rf` removes a plain file or a symlink itself, not its target.
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)

In [None]:
# Write each derived sequence to its own FASTA file and simulate `num_runs`
# read sets from it.
#   genomes: sample name -> list of FASTQ paths simulated for that sample
#   runs:    every FASTQ path, in generation order
#   r2g:     FASTQ path -> name of the sample it was simulated from
genomes = {}
runs = []
r2g = {}
for i, seq in enumerate(seqlist):
    # Samples are named A, B, C, ... (so at most 26 are supported).  A
    # dedicated variable is used so the base genome sequence stored in
    # `genome` is not overwritten, which would break re-running the cell
    # that derives the sequences from it.
    sample = string.ascii_uppercase[i]
    print('Genome', sample, end=', reps: ')
    genomes[sample] = []

    # write genome
    fas = 'data/genomes/bifork_{}.fasta'.format(sample)
    with open(fas, 'wb') as fh:
        fh.write(">{}\n{}\n".format(sample, seq).encode('ascii'))

    # create each run
    for j in range(num_runs):
        print(j, end=' ')
        fq = "data/fastq/bifork_{}-{}_il.fq".format(sample, j)
        # Read count is drawn from a normal distribution and floored at
        # min_n_reads.  The floor is a float in the config, so cast it to
        # keep n_reads an integer whichever side of max() wins.
        n_reads = max(int(random.gauss(mean_n_reads, sd_n_reads)), int(min_n_reads))
        utils.wgsim(n_reads, fas, fq)
        genomes[sample].append(fq)
        runs.append(fq)
        r2g[fq] = sample
    print()

### Hash samples

In [None]:
import subprocess

In [None]:
def countgraph(fq, cg, x=1e9, k=20, n=1, quiet=True):
    """Run ``load-into-countgraph.py`` on one read file.

    The command line is printed before it is executed through the shell.

    Parameters
    ----------
    fq : str
        Path of the input read file (last positional argument of the script).
    cg : str
        Path given to the script after ``-b ``; presumably the count graph
        file to write -- confirm against the script's usage.
    x : number
        Value passed to the script's ``-x`` option.
    k : int
        Value passed to the script's ``-k`` option.
    n : int
        Value passed to the script's ``-N`` option.
    quiet : bool
        If true, the script's combined stdout/stderr is not echoed;
        otherwise each line is printed as it arrives.

    Raises
    ------
    subprocess.CalledProcessError
        If the command exits with a non-zero status.  The combined output is
        attached as ``.output`` so a failure can be diagnosed even when
        ``quiet`` is true.
    """
    # Quote the paths so spaces or shell metacharacters cannot change the
    # command; ordinary paths are left unchanged by shlex.quote().
    lic = "load-into-countgraph.py -N {N} -k {k} -x {x} -s tsv -b {cg} {fq}".format(
            N=n, k=k, x=x, cg=shlex.quote(str(cg)), fq=shlex.quote(str(fq)))
    print(lic)
    captured = []
    # The context manager closes the pipe and waits for the process to exit.
    with subprocess.Popen(lic, shell=True, stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT) as p:
        for line in p.stdout:
            text = line.decode('utf-8', errors='replace')
            captured.append(text)
            if not quiet:
                print(text, end='')
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, lic,
                                            output=''.join(captured))

In [None]:
# Hash every simulated run into its own count graph file.  The loop variable
# is deliberately not called `genome`, so this cell never overwrites the
# notebook-level `genome` variable.
for sample, fastqs in genomes.items():
    for i, fq in enumerate(fastqs):
        cg = 'data/countgraphs/bifork_{}-{}.cg'.format(sample, i)
        countgraph(fq, cg, x=1e9, k=20)