In [1]:
import replaceReadsUtils
import pysam

# set up some demo reads

```
    10
 a: AAAAAAAAAAAAAAAAAAA
 b:      TTTTTTTTTTTTTTTTTTTT
 m: AAAAAAAAAA-----AAAAACCCCC
 m2:       AAA---CCC
 ```

In [2]:
genome = "G"*100
genome_start = 3

a = pysam.AlignedSegment()
qseq = "A"*20
a.query_name = "read1"
a.query_sequence=qseq
a.flag = 0
a.reference_id = 0
a.reference_start = 10
a.mapping_quality = 20
a.cigarstring = str(len(qseq)) + "M"
a.query_qualities = pysam.qualitystring_to_array("<"*len(qseq))
a.tags = (("NM", 1),("RG", "L1"))

b = pysam.AlignedSegment()
qseq = "T"*20
b.query_name = "read2"
b.query_sequence=qseq
b.flag = 0
b.reference_id = 0
b.reference_start = 15
b.mapping_quality = 20
b.cigarstring = str(len(qseq)) + "M"
b.query_qualities = pysam.qualitystring_to_array("<"*len(qseq))
b.tags = (("NM", 1),("RG", "L1"))


m = pysam.AlignedSegment()
qseq = "AAAAAAAAAAAAAAACCCCC"
m.query_name = "mod"
m.query_sequence=qseq
m.flag = 0
m.reference_id = 0
m.reference_start = 10
m.mapping_quality = 20
m.cigarstring = "10M5D10M"
m.query_qualities = pysam.qualitystring_to_array("<"*len(qseq))
m.tags = (("NM", 1),("RG", "L1"))

m2 = pysam.AlignedSegment()
qseq = "AAACCC"
m2.query_name = "mod2"
m2.query_sequence=qseq
m2.flag = 0
m2.reference_id = 0
m2.reference_start = 18
m2.mapping_quality = 20
m2.cigarstring = "3M3D3M"
m2.query_qualities = pysam.qualitystring_to_array("<"*len(qseq))
m2.tags = (("NM", 1),("RG", "L1"))

readA: read1	0	0	10	20	20M	-1	-1	20	AAAAAAAAAAAAAAAAAAAA	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
readB: read2	0	0	15	20	20M	-1	-1	20	TTTTTTTTTTTTTTTTTTTT	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]


# Demo inserting the edit in m2 into read b

Read b has 20 T's. Read m2 has 3 A's, a 3bp deletion, and 3 C's. We will insert this into read b.

In [9]:
new = replaceReadsUtils.replaceRead(b,m2,genome,genome_start)

print("read b: " + str(b))
print("read m2: " + str(m2))
print("new:" + str(new))

read b: read2	0	0	15	20	20M	-1	-1	20	TTTTTTTTTTTTTTTTTTTT	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
read m2: mod2	0	0	18	20	3M3D3M	-1	-1	6	AAACCC	array('B', [27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]
new:read2	0	0	15	20	6M3D14M	-1	-1	20	TTTAAACCCTTTTTTTTGGG	array('B', [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27])	[('NM', 1), ('RG', 'L1')]


Note that the new read is the same seqencing length as the original read b, but that it now includes a 3bp deletion as shown in the cigar sequence.

# Demo creating simulated datasets

In [19]:
!wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr2.fa.gz

--2017-11-13 11:52:32--  http://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr2.fa.gz
Resolving hgdownload.soe.ucsc.edu... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu|128.114.119.163|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78561132 (75M) [application/x-gzip]
Saving to: “chr2.fa.gz”


2017-11-13 12:00:21 (164 KB/s) - “chr2.fa.gz” saved [78561132/78561132]



In [18]:
!python replaceReads.py --downsampleNumber 100 --swapFreq 0.5 \
    --qualAdd 0 --swapChr chr2 --swapLoc 72933870 --reference data/ref.chr2.fa.gz \
    --unalteredBam data/unaltered.bam --unalteredNamesortedBam data/unaltered.nameSort.bam \
    --alteredBam data/ampSeq.bam \
    --outfile out/demo.bam  \
    --onlyIncludeAlteredWithIndel

Namespace(alteredBam='data/ampSeq.bam', downsampleNumber=100, onlyIncludeAlteredWithIndel=True, outfile='out/demo.bam', qualAdd=0, reference='data/ref.chr2.fa.gz', swapChr='chr2', swapFreq=0.5, swapLoc=72933870, unalteredBam='data/unaltered.bam', unalteredNamesortedBam='data/unaltered.nameSort.bam')
[E::fai_build3] Cannot index files compressed with gzip, please use bgzip
Traceback (most recent call last):
  File "replaceReads.py", line 52, in <module>
    refFile = pysam.Fastafile(args.reference)
  File "pysam/libcfaidx.pyx", line 114, in pysam.libcfaidx.FastaFile.__cinit__ (pysam/libcfaidx.c:2166)
  File "pysam/libcfaidx.pyx", line 155, in pysam.libcfaidx.FastaFile._open (pysam/libcfaidx.c:2757)
OSError: could not open file `data/ref.chr2.fa.gz`
