In [1]:
import replaceReadsUtils
import pysam

# set up some demo reads

```
    10
 a: AAAAAAAAAAAAAAAAAAA
 b:      TTTTTTTTTTTTTTTTTTTT
 m: AAAAAAAAAA-----AAAAACCCCC
 m2:       AAA---CCC
 ```

In [2]:
# Toy reference and its offset; both are handed to replaceReadsUtils.replaceRead
# in the demo cell further down.
# NOTE(review): presumably `genome_start` is the reference coordinate of
# genome[0] -- confirm against replaceReadsUtils.
genome = "G"*100
genome_start = 3


def make_demo_read(name, seq, start, cigar=None):
    """Build an unpaired, forward-strand pysam.AlignedSegment for the demos.

    Parameters
    ----------
    name : str
        Value for ``query_name``.
    seq : str
        Read bases (``query_sequence``).
    start : int
        Value for ``reference_start``.
    cigar : str, optional
        CIGAR string. When omitted the read is one match block covering the
        whole sequence (``"<len(seq)>M"``).

    Returns
    -------
    pysam.AlignedSegment
        Read on reference id 0 with flag 0, mapping quality 20, a uniform
        base-quality string of "<" characters, and the tags NM=1 / RG=L1.
    """
    read = pysam.AlignedSegment()
    read.query_name = name
    # Sequence goes in before the qualities: pysam discards existing
    # qualities whenever query_sequence is assigned.
    read.query_sequence = seq
    read.flag = 0
    read.reference_id = 0
    read.reference_start = start
    read.mapping_quality = 20
    read.cigarstring = cigar if cigar is not None else str(len(seq)) + "M"
    read.query_qualities = pysam.qualitystring_to_array("<" * len(seq))
    read.tags = (("NM", 1), ("RG", "L1"))
    return read


# Two unmodified reads ...
a = make_demo_read("read1", "A" * 20, 10)
b = make_demo_read("read2", "T" * 20, 15)

# ... and two "modified" reads that each carry a deletion.
m = make_demo_read("mod", "AAAAAAAAAAAAAAACCCCC", 10, cigar="10M5D10M")
m2 = make_demo_read("mod2", "AAACCC", 18, cigar="3M3D3M")

# Show the unmodified reads so the recorded cell output is reproducible.
print("readA: " + str(a))
print("readB: " + str(b))

readA: read1	0	0	10	20	20M	-1	-1	20	AAAAAAAAAAAAAAAAAAAA	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
readB: read2	0	0	15	20	20M	-1	-1	20	TTTTTTTTTTTTTTTTTTTT	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]


# Demo inserting the edit in m2 into read b

Read b has 20 T's. Read m2 has 3 A's, a 3bp deletion, and 3 C's. We will insert this into read b.

In [9]:
# Splice the edit carried by m2 into read b, then show all three reads
# one after another for comparison.
new = replaceReadsUtils.replaceRead(b, m2, genome, genome_start)

for label, read in (("read b: ", b), ("read m2: ", m2), ("new:", new)):
    print(label + str(read))

read b: read2	0	0	15	20	20M	-1	-1	20	TTTTTTTTTTTTTTTTTTTT	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
read m2: mod2	0	0	18	20	3M3D3M	-1	-1	6	AAACCC	array('B', [27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
new:read2	0	0	15	20	6M3D14M	-1	-1	20	TTTAAACCCTTTTTTTTGGG	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]


Note that the new read has the same sequence length as the original read b, but it now includes a 3bp deletion, as shown in its CIGAR string (`6M3D14M`).

# Demo creating simulated datasets

In [33]:
# Unpack the chr2 reference only when the uncompressed FASTA is not already there.
! [ -f data/chr2.fa ] || gunzip data/chr2.fa.gz

In [34]:
# Run replaceReads.py on the demo data, writing to out/demo.bam.
# Flags are grouped as: input files, swap site, sampling/quality settings, output.
!python replaceReads.py \
    --reference data/chr2.fa \
    --unalteredBam data/unaltered.bam \
    --unalteredNamesortedBam data/unaltered.nameSort.bam \
    --alteredBam data/ampSeq.bam \
    --swapChr chr2 --swapLoc 72933870 \
    --downsampleNumber 100 --swapFreq 0.5 --qualAdd 0 \
    --onlyIncludeAlteredWithIndel \
    --outfile out/demo.bam

Namespace(alteredBam='data/ampSeq.bam', downsampleNumber=100, onlyIncludeAlteredWithIndel=True, outfile='out/demo.bam', qualAdd=0, reference='data/chr2.fa', swapChr='chr2', swapFreq=0.5, swapLoc=72933870, unalteredBam='data/unaltered.bam', unalteredNamesortedBam='data/unaltered.nameSort.bam')
fetching chr2:72933870
read 274 reads, kept 273 reads at target
making changes in 50/273 sites
sorting...
sorting control...
Finished
read 16000 reads
printed 5652 reads not at the cut site (downsample pct was 0.36496350364963503)
printed 50 reads at cut site without modification
printed 50 reads at cut site with modification
printed 5752 reads to the treatment bam
printed 5744 reads to the control bam



# Run Pindel on simulated dataset

In [44]:
# Write the two-line, tab-separated Pindel config used by the next cell.
# Each line is: <BAM path> <insert size> <sample label>.
# printf is used rather than `echo -e`: echo's handling of -e and backslash
# escapes differs between shells, and some would write a literal "-e " into the file.
!printf 'out/demo.bam.ctl.bam\t550\tNORMAL\nout/demo.bam\t550\tTUMOR\n' > out/demo.pindelConfig

In [45]:
# Run Pindel against the chr2 reference using out/demo.pindelConfig;
# result files are written with the out/demo.pindel prefix.
!pindel --fasta data/chr2.fa --config-file out/demo.pindelConfig --output-prefix out/demo.pindel

Initializing parameters...
Pindel version 0.2.5b8, 20151210.
Loading reference genome ...
Loading reference genome done.
Initializing parameters done.
SearchRegion::SearchRegion
Processing region: chr2	1	242193529
Chromosome Size: 242193530
NumBoxes: 60006	BoxSize: 8079

Looking at chromosome chr2 bases 1 to 5000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close 

no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 30000001 to 35000001 of the bed region: chromosome chr2:1-242193529 
out/demo.bam.ctl.bam RP 4
out/demo.bam RP 5
Discovery RP: 5
sorting RP complete.
Reads_RP.size(): 5
sorting read-pair
sorting read-pair finished.
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	4, + 3 - 1
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one end mapped rea

The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam
BAM file index	1
Bam file name	out/demo.bam
Number of split-reads so far	0

The number of one end mapped read: 0
There are 0 reads supporting the reference allele.
There are 2 samples.
SampleName2Index done
declaring g_RefCoverageRegion for 2 samples and 5000001 position

The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam
BAM file index	1
Bam file name	out/demo.bam
Number of split-reads so far	0

The number of one end mapped read: 0
There are 0 reads supporting the reference allele.
There are 2 samples.
SampleName2Index done
declaring g_RefCoverageRegion for 2 samples and 5000001 positions.
no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 80000001 to 85000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD c

no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 105000001 to 110000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one 

no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 135000001 to 140000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one 


Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam
BAM file index	1
Bam file name	out/demo.bam
Number of split-reads so far	0

The number of one end mapped read: 0
There are 0 reads supporting the reference allele.
There are 2 samples.
SampleName2Index done
declaring g_RefCoverageRegion for 2 s

no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 190000001 to 195000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one 

no reads 
InterChromosome_SR.size(): 0

Looking at chromosome chr2 bases 220000001 to 225000001 of the bed region: chromosome chr2:1-242193529 
No discordant RP reads in Bamfile out/demo.bam.ctl.bam
No discordant RP reads in Bamfile out/demo.bam
Discovery RP: 0
sorting RP complete.
Reads_RP.size(): 0
Modify RP complete.
adding BD from RP.
modify and summarize interchr RP.
adding BD from interChr RP.
summarize BP as BD complete. Now start sorting BD...
sorting BD... done.
external BD events: 0 Added BreakDancer-like events: 0

Insertsize in config: 550
The number of one end mapped read: 0
Number of problematic reads in current window:            	0, + 0 - 0
Number of split-reads where the close end could be mapped:	0, + 0 - 0
Percentage of problematic reads with close end mapped:    	+ 0.00% - 0.00%
No currentState.Reads for chr2 found in out/demo.bam.ctl.bam
BAM file index	0
Bam file name	out/demo.bam.ctl.bam
Number of split-reads so far	0


Insertsize in config: 550
The number of one 