## Generator Notebook for (initially) Random Test Data

In [4]:
# Imports
import numpy as np
import os

from genome_helper import generate_gene, generate_genome, generate_reads, partition_reads, save_test_case

In [6]:
# Set correct pathing: absolute locations of the notebook directory and of the
# test-data directory, both resolved against the current working directory.
curr_dir_path_str = './'
testdata_path_str = './testdata'

curr_dir_path, testdata_path = (
    os.path.abspath(rel_path)
    for rel_path in (curr_dir_path_str, testdata_path_str)
)

print(f'Current Directory: {curr_dir_path}')
print(f'Test Data Directory: {testdata_path}')

Current Directory: /Users/parshangerafian/bilby_encoder/testing
Test Data Directory: /Users/parshangerafian/bilby_encoder/testing/testdata


### Test Case 1 - basictest

In [3]:
# Test case identity and PRNG seed (seeding first keeps the generated data
# reproducible across runs).
test_case_name = "basictest"
np.random.seed(0)

# One splice structure per gene. Judging by the recorded output, the entries
# are alternating exon/intron segment lengths -- TODO confirm in genome_helper.
splice_structures = [
    [10, 10, 10],
    [5, 5, 5, 5, 5],
]

# Generate the genes in order; the call order matters because each call
# draws from the seeded global PRNG.
genes = []
exon_seqs = []
for structure in splice_structures:
    gene, exon_seq = generate_gene(structure)
    genes.append(gene)
    exon_seqs.append(exon_seq)

genome_length = 100      # appears to be the total genome length
reads_per_gene = [5, 5]  # appears to be the number of reads drawn per gene

genome = generate_genome(genes, genome_length)
reads = generate_reads(exon_seqs, reads_per_gene)

num_partitions = 1
split_reads = partition_reads(reads, num_partitions)

# Show everything that was generated for this test case.
for label, value in (
    ("Genome: ", genome),
    ("Genes: ", genes),
    ("Exon Sequences: ", exon_seqs),
    ("Reads: ", split_reads),
):
    print(label, value)

# BED windows, one inner list per window set: (chromosome, start, end).
bed_windows = [
    [('chr1', 10, 50), ('chr1', 70, 80)],
]
print("Windows: ", bed_windows)

Genome:  GCGATTTGAGCAGGCGCGAAGTAGGGGTGGTTCAGCAAGAACTCGGCATCAAATTCAATTTTATGTAAGGAGTCGTGAGGACGAGATCCGATTGTTGCGG
Genes:  ['AGTAGGGGTGGTTCAGCAAGAACTCGGCAT', 'TTTATGTAAGGAGTCGTGAGGACGA']
Exon Sequences:  ['AGTAGGGGTGAACTCGGCAT', 'TTTATGAGTCGACGA']
Reads:  [['TATGAGTCGACGA', 'TGAGTCGA', 'TGAGTCGAC', 'AGTAGGGGTGAACTCG', 'AGTAGGGGTGAACTCGGCA', 'TGAGTCGAC', 'AGTCGACGA', 'GTAGGGGTGAACTCGGCAT', 'GAACTCGGCA', 'GGGGTGAACTCGGCA']]
Windows:  [[('chr1', 10, 50), ('chr1', 70, 80)]]


In [4]:
# Hand the generated genome, partitioned reads and BED windows to the
# genome_helper `save_test_case` helper. Per the recorded output below, this
# creates the directory <testdata_path>/<test_case_name>. Relies on the
# variables defined by the preceding generator cell.
save_test_case(testdata_path, test_case_name, genome, split_reads, bed_windows)

Directory '/Users/ritwiksrinivas/Desktop/College/HolmesLab/bilby_encoder/testing/testdata/basictest' created.


In [5]:
%%bash
# Post-process the saved alignment: dump to SAM, sort, then index.
# Fail fast: if the test-case directory is missing or any samtools step fails,
# stop immediately instead of running later steps in the wrong directory or on
# stale files (the cell then reports a non-zero exit status).
set -euo pipefail
cd testdata/basictest/
samtools view -h alignment0.bam -o alignment0.sam # for ease of viewing alignment contents
samtools sort alignment0.sam -o sorted_alignment0.bam # sort for upcoming indexing
samtools index sorted_alignment0.bam # index for pysam

### Test Case 2 - reversetest

In [8]:
# Test case identity and PRNG seed (seeding first keeps the generated data
# reproducible across runs).
test_case_name = "reversetest"
np.random.seed(291357)

# One splice structure per gene. Judging by the recorded output, the entries
# are alternating exon/intron segment lengths -- TODO confirm in genome_helper.
splice_structures = [
    [10, 20, 10],
    [10, 10, 10, 10, 10],
    [15, 15, 15, 15, 15],
]

# Generate the genes in order; the call order matters because each call
# draws from the seeded global PRNG.
genes = []
exon_seqs = []
for structure in splice_structures:
    gene, exon_seq = generate_gene(structure)
    genes.append(gene)
    exon_seqs.append(exon_seq)

genome_length = 250           # appears to be the total genome length
reads_per_gene = [10, 10, 10] # appears to be the number of reads drawn per gene

genome = generate_genome(genes, genome_length)
# rev_probability=0.5 is passed straight through to generate_reads; presumably
# the chance that a read is emitted as its reverse complement -- TODO confirm.
reads = generate_reads(exon_seqs, reads_per_gene, rev_probability=0.5)

num_partitions = 4
split_reads = partition_reads(reads, num_partitions)

# Show everything that was generated for this test case.
for label, value in (
    ("Genome: ", genome),
    ("Genes: ", genes),
    ("Exon Sequences: ", exon_seqs),
    ("Reads: ", split_reads),
):
    print(label, value)

# BED windows, one inner list per window set: (chromosome, start, end).
first_window_set = [('chr1', 4, 44), ('chr1', 225, 245)]
second_window_set = [('chr1', 140, 170), ('chr1', 96, 115)]
bed_windows = [first_window_set, second_window_set]
print("Windows: ", bed_windows)

Genome:  AACTCCTTCATGCAGTGTTCAGCTTAAGTGGCAGGCGATAATCGGAGGAGTGTAGGGCGAAGCAACACTCGCGACGCGTGTGGGCAGAGTGTACGCCCCGTATTTCTAGCGTTTGCAACGTTATTGATCCTTTTCCCCAAAGGAGGTTGTGGGGCGGTTTTAGTGAAGGACGGGGTTGGTCAATTTGTCCGAGTTTTTGCGGCTGTGCCTTAGTTTTGCGACCATTTTCAGATACTGATTCGACCTCCTG
Genes:  ['CCTTCATGCAGTGTTCAGCTTAAGTGGCAGGCGATAATCG', 'CGCGACGCGTGTGGGCAGAGTGTACGCCCCGTATTTCTAGCGTTTGCAAC', 'TCCCCAAAGGAGGTTGTGGGGCGGTTTTAGTGAAGGACGGGGTTGGTCAATTTGTCCGAGTTTTTGCGGCTGTGC']
Exon Sequences:  ['CCTTCATGCAGCGATAATCG', 'CGCGACGCGTTGTACGCCCCCGTTTGCAAC', 'TCCCCAAAGGAGGTTTGAAGGACGGGGTTGTTTTTGCGGCTGTGC']
Reads:  [['CGTTGTACGCC', 'GATTATCGCTG', 'GGTTTG', 'GGGGGCGTACAACG', 'CCCAAAGGAGGTTTGAAGGACGGGGTTGTTTTTGCGG', 'CAAACGGGGGC', 'CTTCATGCAGCGAT', 'CTTCATGCAGCGATA'], ['GCGTTG', 'ATTAT', 'GGGCGTACAACGCGTCG', 'GCGTACAACGCGTCGC', 'CAAAGGAGGTTTGAAGGACGGGGTTGTTTTTGCG', 'CGCGACGCGTTG', 'CAGCCGCAAAAACAACCCCGTCCTTCAAACCTCCTTTGG', 'GCATGAAGG'], ['TTCATGCAGCGATAATCG', 'ACGGGGTTGTTTT', 'AAAACAACCCCGTCCTTCAAACCTCCTTTGGG', 'ACAACC', 'TCATG', 

In [7]:
# Hand the generated genome, partitioned reads and BED windows to the
# genome_helper `save_test_case` helper. Per the recorded output below, this
# creates the directory <testdata_path>/<test_case_name>. Relies on the
# variables defined by the preceding generator cell.
save_test_case(testdata_path, test_case_name, genome, split_reads, bed_windows)

Directory '/Users/ritwiksrinivas/Desktop/College/HolmesLab/bilby_encoder/testing/testdata/reversetest' created.


In [8]:
%%bash
# Post-process the saved alignments: dump to SAM, sort, then index.
# Fail fast: if the test-case directory is missing or any samtools step fails,
# stop immediately instead of running later steps in the wrong directory or on
# stale files (the cell then reports a non-zero exit status).
set -euo pipefail
cd testdata/reversetest/
# Alignment files are numbered 0-3; this matches num_partitions = 4 in the
# generating cell (presumably one alignment file per read partition).
for i in 0 1 2 3; do
    samtools view -h "alignment${i}.bam" -o "alignment${i}.sam" # for ease of viewing alignment contents
    samtools sort "alignment${i}.sam" -o "sorted_alignment${i}.bam" # sort for upcoming indexing
    samtools index "sorted_alignment${i}.bam" # index for pysam
done

### Test Case 3 - Read completely aligned to BED window (All M states)

In [69]:
# Test case identity and PRNG seed (seeding first keeps the generated data
# reproducible across runs).
test_case_name = "allexontest"
np.random.seed(1573544)

# One splice structure per gene. Each gene here has a single segment, and the
# recorded output shows gene and exon sequence are identical in that case.
splice_structures = [
    [20],
    [10],
]

# Generate the genes in order; the call order matters because each call
# draws from the seeded global PRNG.
genes = []
exon_seqs = []
for structure in splice_structures:
    gene, exon_seq = generate_gene(structure)
    genes.append(gene)
    exon_seqs.append(exon_seq)

genome_length = 100      # appears to be the total genome length
reads_per_gene = [5, 5]  # appears to be the number of reads drawn per gene

genome = generate_genome(genes, genome_length)
# rev_probability=0.5 is passed straight through to generate_reads; presumably
# the chance that a read is emitted as its reverse complement -- TODO confirm.
reads = generate_reads(exon_seqs, reads_per_gene, rev_probability=0.5)

num_partitions = 1
split_reads = partition_reads(reads, num_partitions)

# Show everything that was generated for this test case.
for label, value in (
    ("Genome: ", genome),
    ("Genes: ", genes),
    ("Exon Sequences: ", exon_seqs),
    ("Reads: ", split_reads),
):
    print(label, value)

# BED windows, one inner list per window set: (chromosome, start, end).
bed_windows = [
    [('chr1', 43, 58), ('chr1', 65, 80)],
]
print("Windows: ", bed_windows)

Genome:  TCTTTACGACCACTGAAACGTCTCCATTCGGAGCCGAGTTCGACAACCAACCCCCGTTCTAGCTACCGCCTACGCGAAAGCTCCCTACATCGGGTATTTA
Genes:  ['ACAACCAACCCCCGTTCTAG', 'ATCGGGTATT']
Exon Sequences:  ['ACAACCAACCCCCGTTCTAG', 'ATCGGGTATT']
Reads:  [['GGGTA', 'AATACCCG', 'AATACC', 'CTAGA', 'AACCA', 'ACAACCAACCCCCGT', 'AACGGGGGTTGG', 'ATCGGGTATT', 'AACCCCCG', 'CGGGTATT']]
Windows:  [[('chr1', 43, 58), ('chr1', 65, 80)]]


In [71]:
# Hand the generated genome, partitioned reads and BED windows to the
# genome_helper `save_test_case` helper. Per the recorded output below, this
# creates the directory <testdata_path>/<test_case_name>. Relies on the
# variables defined by the preceding generator cell.
save_test_case(testdata_path, test_case_name, genome, split_reads, bed_windows)

Directory '/Users/parshangerafian/bilby_encoder/testing/testdata/allexontest' created.


In [74]:
%%bash
# Post-process the saved alignment: dump to SAM, sort, then index.
# Fail fast: if the test-case directory is missing or any samtools step fails,
# stop immediately instead of running later steps in the wrong directory or on
# stale files (the cell then reports a non-zero exit status).
set -euo pipefail
cd testdata/allexontest/
samtools view -h alignment0.bam -o alignment0.sam # for ease of viewing alignment contents
samtools sort alignment0.sam -o sorted_alignment0.bam # sort for upcoming indexing
samtools index sorted_alignment0.bam # index for pysam