In [36]:
import random
import io

from Bio.Seq import Seq
from Bio import SeqIO

import numpy as np

In [1]:
from statsnerds.io import MultiSequenceReader

In [6]:
def random_seq(minlength=100, maxlength=200):
    """Generate a random nucleotide sequence.

    The length is drawn with ``random.randint`` from ``minlength`` to
    ``maxlength`` (both inclusive), and the bases are drawn with
    ``random.choices`` from the alphabet ``ATGC``.

    :param int minlength: Smallest allowed sequence length.
    :param int maxlength: Largest allowed sequence length.
    :rtype: Bio.Seq.Seq
    """
    n_bases = random.randint(minlength, maxlength)
    bases = random.choices('ATGC', k=n_bases)
    return Seq(''.join(bases))

def record_eq(rec1, rec2):
    """Tell whether two sequence records match.

    Records match when the string forms of their ``seq`` attributes are
    equal and their ``id`` attributes are equal.
    """
    if str(rec1.seq) != str(rec2.seq):
        return False
    return rec1.id == rec2.id

def tuple_eq(tup1, tup2):
    """Check that two sequence record tuples are equal.

    Two tuples are equal when they have the same length and every pair of
    records at the same position satisfies ``record_eq``.

    :param tup1: First sequence of records.
    :param tup2: Second sequence of records.
    :rtype: bool
    """
    # The length check comes first because zip() stops at the shorter input,
    # which would otherwise hide trailing extra records.
    # record_eq is a module-level function, so it is called directly; there
    # is no ``self`` in scope in this plain function.
    return len(tup1) == len(tup2) and \
        all(record_eq(t1, t2) for t1, t2 in zip(tup1, tup2))

In [7]:
# Number of records per generated tuple; after transposing the tuples this is
# also the number of in-memory files handed to the reader.
N_FILES = 3

In [49]:
def _make_record(tuple_idx, file_idx):
    """Build one test record: 50 random bases plus random quality scores below 40."""
    sequence = random_seq(50, 50)
    scores = np.random.randint(40, size=len(sequence))
    return SeqIO.SeqRecord(
        sequence,
        id='test-{:02d}-{:02d}'.format(tuple_idx, file_idx),
        letter_annotations={'phred_quality': scores},
    )


# Ten tuples, each holding N_FILES records (one record per target file).
seq_tuples = [
    tuple(_make_record(i, j) for j in range(N_FILES))
    for i in range(10)
]

In [50]:
def _fastq_buffer(records):
    """Write ``records`` as FASTQ into an in-memory text buffer rewound to the start."""
    buffer = io.StringIO()
    SeqIO.write(records, buffer, 'fastq')
    buffer.seek(0)
    return buffer


# zip(*seq_tuples) transposes the tuples: one group of records per file.
_files = [_fastq_buffer(column) for column in zip(*seq_tuples)]

reader = MultiSequenceReader(_files, 'fastq')

In [51]:
# Snapshot the full text of each file object held by the reader.
# NOTE(review): assumes reader.files exposes the StringIO buffers passed in
# when the reader was constructed — confirm against MultiSequenceReader.
contents = [f.getvalue() for f in reader.files]

# TODO: a bare "?" was left here by the author — state the open question or remove it.

In [52]:
# Ask the reader to seek to 100, far more than the 10 tuples generated above.
# NOTE(review): the recorded result is 10 — presumably seek() clamps to the
# number of available records and returns the position reached; confirm
# against MultiSequenceReader.
reader.seek(100)

10

In [55]:
# Inspect the reader's private offset table as an array
# (the recorded output has one column per file).
np.asarray(reader._offsets)

array([[   0,    0,    0],
       [ 172,  172,  172],
       [ 310,  310,  310],
       [ 448,  448,  448],
       [ 586,  586,  586],
       [ 724,  724,  724],
       [ 862,  862,  862],
       [1000, 1000, 1000],
       [1138, 1138, 1138],
       [1276, 1276, 1276],
       [1380, 1380, 1380]])

In [56]:
# Row-to-row differences of the reader's offset table.
# The array is rebuilt explicitly instead of using IPython's `_` (last output),
# which only works when this cell runs immediately after the cell that
# displayed the offsets and breaks on re-run or out-of-order execution.
np.diff(np.asarray(reader._offsets), axis=0)

array([[172, 172, 172],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [138, 138, 138],
       [104, 104, 104]])

In [57]:
# Show the first record of the first tuple, for comparison with the raw file text.
seq_tuples[0][0]

SeqRecord(seq=Seq('CTTATCCGGGTCCTCGATAAGATTCTGTATTTTCTAACCATTAAACTCAA', Alphabet()), id='test-00-00', name='<unknown name>', description='<unknown description>', dbxrefs=[])

In [58]:
# First 172 characters of the first file; 172 is the first non-zero offset
# in the recorded offset table.
contents[0][:172]

'@test-00-00 <unknown description>\nCTTATCCGGGTCCTCGATAAGATTCTGTATTTTCTAACCATTAAACTCAA\n+\n0G30@,9C><8<+\'5B%\'!9G?.FC<"@D%?#+4E/+%+5BC:;%(EC"=\n@test-01-00 <unknown description>\n'

In [45]:
# First 85 characters of the first file.
# NOTE(review): this cell's execution count (45) predates the surrounding cells
# and its recorded output is FASTA-style ('>' header, no quality lines), so it
# looks stale — re-run or remove.
contents[0][:85]

'>test-00-00 <unknown description>\nGTAACGATATTCGATTCACACAGCCGCAGGACATTCTCTTAGGTGGGCTT\n'

In [47]:
# Debugging aid: show where the imported Bio.SeqIO package lives on disk.
# NOTE(review): the recorded output exposes an absolute local path; consider
# clearing it (or removing this cell) before sharing the notebook.
SeqIO.__file__

'/Users/student/anaconda/lib/python3.6/site-packages/Bio/SeqIO/__init__.py'