In [1]:
import sys
sys.path.insert(0, '../..')
from allel.io_vcf_read import read_vcf, read_vcf_chunks, vcf_to_npz, vcf_to_hdf5, vcf_to_zarr
# from allel.opt.io_vcf_read import (iter_vcf, 
#                                    CalldataParser_parse, 
#                                    GenotypeInt8Parser_parse, 
#                                    ParserContext_next, 
#                                    BufferedReader_read
#                                   )

# Input VCFs, relative to this notebook: the file replicated below for profiling,
# and the small sample fixture whose contents are printed and parsed below.
prof_vcf_fn = '../../profdata/2L_2358158_2431617.vcf'
sample_vcf_fn = '../../fixture/sample.vcf'

In [2]:
# create a slightly larger profiling file
!cat {prof_vcf_fn} > ../../profdata/prof_gt.vcf
!for i in `seq 1 100`; do cat {prof_vcf_fn} | grep -v '^#' >> ../../profdata/prof_gt.vcf; done
!gzip -f ../../profdata/prof_gt.vcf

In [3]:
!cat {prof_vcf_fn} | wc -l

1979


In [4]:
!zcat ../../profdata/prof_gt.vcf.gz | wc -l

198679


In [5]:
# Print the sample fixture so its header declarations can be compared with the
# parsed headers and arrays in the following cells.
!cat {sample_vcf_fn}

##fileformat=VCFv4.0
##fileDate=20090805
##source=myImputationProgramV3.1
##reference=1000GenomesPilot-NCBI36
##phasing=partial
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
##FILTER=<ID=s50,Description="Less than 50% of samples have data">
##FILTER=<ID=q10,Description="Quality below 10">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,D

In [6]:
# Parse the sample fixture in chunks of 3 variants, requesting every field.
# `types` overrides the dtype of the named fields; `numbers` fixes how many values
# per variant are kept (ALT, AF and AA come back two columns wide below).
headers, chunks = read_vcf_chunks(
    sample_vcf_fn,
    fields='*',
    types=dict(CHROM='S4', ID='S20', DP='i8', AF='f8', HQ='i1', GQ='i1'),
    numbers=dict(ALT=2, AF=2, AA=2),
    chunk_length=3,
    buffer_size=2**15,
)
# Sample names taken from the parsed header.
headers.samples

['NA00001', 'NA00002', 'NA00003']

In [7]:
# Parsed ##FILTER declarations, keyed by filter ID.
headers.filters

{'q10': {'Description': 'Quality below 10', 'ID': 'q10'},
 's50': {'Description': 'Less than 50% of samples have data', 'ID': 's50'}}

In [8]:
# Parsed ##INFO declarations, keyed by field ID; Number and Type are kept as strings.
headers.infos

{'AA': {'Description': 'Ancestral Allele',
  'ID': 'AA',
  'Number': '1',
  'Type': 'String'},
 'AC': {'Description': 'Allele count in genotypes, for each ALT allele, in the same order as listed',
  'ID': 'AC',
  'Number': '.',
  'Type': 'Integer'},
 'AF': {'Description': 'Allele Frequency',
  'ID': 'AF',
  'Number': '.',
  'Type': 'Float'},
 'AN': {'Description': 'Total number of alleles in called genotypes',
  'ID': 'AN',
  'Number': '1',
  'Type': 'Integer'},
 'DB': {'Description': 'dbSNP membership, build 129',
  'ID': 'DB',
  'Number': '0',
  'Type': 'Flag'},
 'DP': {'Description': 'Total Depth',
  'ID': 'DP',
  'Number': '1',
  'Type': 'Integer'},
 'H2': {'Description': 'HapMap2 membership',
  'ID': 'H2',
  'Number': '0',
  'Type': 'Flag'},
 'NS': {'Description': 'Number of Samples With Data',
  'ID': 'NS',
  'Number': '1',
  'Type': 'Integer'}}

In [9]:
# Parsed ##FORMAT declarations, keyed by field ID.
headers.formats

{'DP': {'Description': 'Read Depth',
  'ID': 'DP',
  'Number': '1',
  'Type': 'Integer'},
 'GQ': {'Description': 'Genotype Quality',
  'ID': 'GQ',
  'Number': '1',
  'Type': 'Integer'},
 'GT': {'Description': 'Genotype',
  'ID': 'GT',
  'Number': '1',
  'Type': 'String'},
 'HQ': {'Description': 'Haplotype Quality',
  'ID': 'HQ',
  'Number': '2',
  'Type': 'Integer'}}

In [10]:
# Materialise the chunks into a list so individual chunks can be indexed below.
# Note that this rebinds `chunks` from the object returned by read_vcf_chunks to a list.
chunks = list(chunks)
len(chunks)

3

In [11]:
# Each chunk maps 'variants/<FIELD>' and 'calldata/<FIELD>' keys to arrays.
sorted(chunks[0].keys())

['calldata/DP',
 'calldata/GQ',
 'calldata/GT',
 'calldata/HQ',
 'variants/AA',
 'variants/AC',
 'variants/AF',
 'variants/ALT',
 'variants/AN',
 'variants/CHROM',
 'variants/DB',
 'variants/DP',
 'variants/FILTER_PASS',
 'variants/FILTER_q10',
 'variants/FILTER_s50',
 'variants/H2',
 'variants/ID',
 'variants/NS',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [12]:
# Total number of variant rows across all chunks.
sum(len(chunk['variants/CHROM']) for chunk in chunks)

9

In [13]:
# CHROM for the first chunk; the dtype is the 'S4' requested via `types`.
chunks[0]['variants/CHROM']

array([b'19', b'19', b'20'], 
      dtype='|S4')

In [14]:
# CHROM for the last chunk.
chunks[-1]['variants/CHROM']

array([b'20', b'20', b'X'], 
      dtype='|S4')

In [15]:
# POS for the first chunk (int32 in the recorded output).
chunks[0]['variants/POS']

array([  111,   112, 14370], dtype=int32)

In [16]:
# POS for the last chunk.
chunks[-1]['variants/POS']

array([1234567, 1235237,      10], dtype=int32)

In [17]:
# ID for the first chunk; the dtype is the requested 'S20' and b'.' entries are kept as-is.
chunks[0]['variants/ID']

array([b'.', b'.', b'rs6054257'], 
      dtype='|S20')

In [18]:
# REF for the first chunk (1-byte strings in the recorded output).
chunks[0]['variants/REF']

array([b'A', b'A', b'G'], 
      dtype='|S1')

In [19]:
# ALT is two columns wide because of numbers={'ALT': 2}; unused slots are b''.
chunks[0]['variants/ALT']

array([[b'C', b''],
       [b'G', b''],
       [b'A', b'']], 
      dtype='|S1')

In [20]:
# ALT for the last chunk.
# NOTE(review): rows 0 and 2 show the same single letter in both columns — presumably
# multi-character alleles cut down by the 1-byte dtype; confirm against the fixture.
chunks[-1]['variants/ALT']

array([[b'G', b'G'],
       [b'.', b''],
       [b'A', b'A']], 
      dtype='|S1')

In [21]:
# QUAL for the first chunk (float32 in the recorded output).
chunks[0]['variants/QUAL']

array([  9.60000038,  10.        ,  29.        ], dtype=float32)

In [22]:
# FILTER is exposed as one boolean array per filter; this is the PASS flag.
chunks[0]['variants/FILTER_PASS']

array([False, False,  True], dtype=bool)

In [23]:
# INFO/NS for the first chunk; -1 appears to be the fill value where the field is absent (confirm).
chunks[0]['variants/NS']

array([-1, -1,  3], dtype=int32)

In [24]:
# INFO/DP for the first chunk; 'DP': 'i8' was requested via `types`.
chunks[0]['variants/DP']

array([-1, -1, 14])

In [25]:
# INFO/AF: two columns (numbers={'AF': 2}) of 'f8' values; nan where no value is present.
chunks[0]['variants/AF']

array([[ nan,  nan],
       [ nan,  nan],
       [ 0.5,  nan]])

In [26]:
# INFO/AC had no `numbers` override and comes back three columns wide here.
chunks[-1]['variants/AC']

array([[ 3,  1, -1],
       [-1, -1, -1],
       [-1, -1, -1]], dtype=int32)

In [27]:
# NOTE(review): repeats the variants/NS lookup already shown in cell In [23].
chunks[0]['variants/NS']

array([-1, -1,  3], dtype=int32)

In [28]:
# INFO/AN for the first chunk (all -1 in the recorded output).
chunks[0]['variants/AN']

array([-1, -1, -1], dtype=int32)

In [29]:
# INFO/DB is declared Type=Flag in the header and is returned as a boolean array.
chunks[0]['variants/DB']

array([False, False,  True], dtype=bool)

In [30]:
# INFO/H2 is declared Type=Flag in the header and is returned as a boolean array.
chunks[0]['variants/H2']

array([False, False,  True], dtype=bool)

In [31]:
# INFO/AA for the second chunk: two columns (numbers={'AA': 2}); b'' where empty.
chunks[1]['variants/AA']

array([[b'', b''],
       [b'T', b''],
       [b'T', b'']], 
      dtype='|S12')

In [32]:
# Genotype calls for the first chunk: a 3 x 3 x 2 int8 array
# (presumably variants x samples x alleles — confirm).
chunks[0]['calldata/GT']

array([[[0, 0],
        [0, 0],
        [0, 1]],

       [[0, 0],
        [0, 0],
        [0, 1]],

       [[0, 0],
        [1, 0],
        [1, 1]]], dtype=int8)

In [33]:
# FORMAT/GQ for the first chunk, with the int8 ('i1') dtype requested via `types`.
chunks[0]['calldata/GQ']

array([[-1, -1, -1],
       [-1, -1, -1],
       [48, 48, 43]], dtype=int8)

In [34]:
# FORMAT/HQ (declared Number=2) for the first chunk, with the requested int8 ('i1') dtype.
chunks[0]['calldata/HQ']

array([[[10, 10],
        [10, 10],
        [ 3,  3]],

       [[10, 10],
        [10, 10],
        [ 3,  3]],

       [[51, 51],
        [51, 51],
        [-1, -1]]], dtype=int8)

In [35]:
# FORMAT/DP for the first chunk. The recorded dtype is int16, so the 'DP': 'i8'
# override passed via `types` did not change calldata/DP.
chunks[0]['calldata/DP']

array([[-1, -1, -1],
       [-1, -1, -1],
       [ 1,  8,  5]], dtype=int16)

In [36]:
# Read the whole sample file in one call with the default field selection,
# then list which arrays that selection produces.
callset = read_vcf(sample_vcf_fn, chunk_length=1000, buffer_size=2**15)
sorted(callset)

['calldata/GT',
 'samples',
 'variants/ALT',
 'variants/CHROM',
 'variants/FILTER_PASS',
 'variants/ID',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [37]:
# Request only the FILTER field.
callset = read_vcf(
    sample_vcf_fn,
    fields='FILTER',
    buffer_size=2**15,
    chunk_length=1000,
)

In [38]:
# Keys produced by the FILTER-only read.
sorted(callset)

['variants/FILTER_PASS', 'variants/FILTER_q10', 'variants/FILTER_s50']

In [39]:
# Per-variant flag for the q10 filter declared in the header.
callset['variants/FILTER_q10']

array([False, False, False,  True, False, False, False, False, False], dtype=bool)

In [40]:
# Per-variant flag for the s50 filter declared in the header.
callset['variants/FILTER_s50']

array([False, False, False, False, False, False, False, False, False], dtype=bool)

In [41]:
# Select fields with the 'variants/*' pattern.
# NOTE(review): the result is overwritten by the next cell without being inspected.
callset = read_vcf(sample_vcf_fn, fields='variants/*')

In [42]:
# Read every field, keeping a single value per variant for ALT and AA.
callset = read_vcf(
    sample_vcf_fn,
    fields='*',
    numbers={'ALT': 1, 'AA': 1},
    buffer_size=2**15,
    chunk_length=1000,
)
sorted(callset)

['calldata/DP',
 'calldata/GQ',
 'calldata/GT',
 'calldata/HQ',
 'samples',
 'variants/AA',
 'variants/AC',
 'variants/AF',
 'variants/ALT',
 'variants/AN',
 'variants/CHROM',
 'variants/DB',
 'variants/DP',
 'variants/FILTER_PASS',
 'variants/FILTER_q10',
 'variants/FILTER_s50',
 'variants/H2',
 'variants/ID',
 'variants/NS',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [43]:
# With ALT=1 in `numbers` the ALT array is one-dimensional.
callset['variants/ALT'].shape

(9,)

In [44]:
# One ALT value per variant.
callset['variants/ALT']

array([b'C', b'G', b'A', b'A', b'G', b'.', b'G', b'.', b'A'], 
      dtype='|S1')

In [45]:
# One AA value per variant; b'' where empty.
callset['variants/AA']

array([b'', b'', b'', b'', b'T', b'T', b'G', b'', b''], 
      dtype='|S12')

In [46]:
# Read the larger profiling VCF with the default field selection.
callset = read_vcf(prof_vcf_fn, buffer_size=2**15, chunk_length=1000)

In [47]:
# Arrays returned for the profiling file.
sorted(callset)

['calldata/GT',
 'samples',
 'variants/ALT',
 'variants/CHROM',
 'variants/FILTER_PASS',
 'variants/ID',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [48]:
# Sample identifiers, returned as an array of bytes strings.
callset['samples']

array([b'AB0085-C', b'AB0087-C', b'AB0088-C', b'AB0089-C', b'AB0090-C',
       b'AB0091-C', b'AB0092-C', b'AB0094-C', b'AB0095-C', b'AB0097-C',
       b'AB0098-C', b'AB0099-C', b'AB0100-C', b'AB0101-C', b'AB0103-C',
       b'AB0104-C', b'AB0109-C', b'AB0110-C', b'AB0111-C', b'AB0112-C',
       b'AB0113-C', b'AB0114-C', b'AB0117-C', b'AB0119-C', b'AB0122-C',
       b'AB0123-C', b'AB0124-C', b'AB0126-C', b'AB0127-C', b'AB0128-C',
       b'AB0129-C', b'AB0130-C', b'AB0133-C', b'AB0134-C', b'AB0135-C',
       b'AB0136-C', b'AB0137-C', b'AB0138-C', b'AB0139-C', b'AB0140-C',
       b'AB0142-C', b'AB0143-C', b'AB0145-C', b'AB0146-C', b'AB0147-C',
       b'AB0148-C', b'AB0151-C', b'AB0153-C', b'AB0155-C', b'AB0157-C',
       b'AB0158-C', b'AB0159-C', b'AB0160-C', b'AB0161-C', b'AB0164-C',
       b'AB0166-C', b'AB0169-C', b'AB0170-C', b'AB0171-C', b'AB0172-C',
       b'AB0173-C', b'AB0174-C', b'AB0175-C', b'AB0176-C', b'AB0177-C',
       b'AB0178-C', b'AB0179-C', b'AB0181-C', b'AB0182-C', b'AB0

In [49]:
# Genotype array dimensions (presumably variants x samples x alleles — confirm).
callset['calldata/GT'].shape

(1967, 773, 2)

In [50]:
# NOTE(review): identical to the previous cell.
callset['calldata/GT'].shape

(1967, 773, 2)

In [51]:
# CHROM with no dtype override ('|S12' in the recorded output).
callset['variants/CHROM']

array([b'2L', b'2L', b'2L', ..., b'2L', b'2L', b'2L'], 
      dtype='|S12')

In [52]:
# POS for the profiling file.
callset['variants/POS']

array([2353212, 2353223, 2353234, ..., 2436558, 2436585, 2436615], dtype=int32)

In [53]:
# REF for the profiling file.
callset['variants/REF']

array([b'G', b'T', b'G', ..., b'G', b'A', b'C'], 
      dtype='|S1')

In [54]:
# ALT with no `numbers` override: three columns wide in the recorded output.
callset['variants/ALT']

array([[b'A', b'', b''],
       [b'G', b'', b''],
       [b'C', b'', b''],
       ..., 
       [b'A', b'', b''],
       [b'C', b'', b''],
       [b'A', b'', b'']], 
      dtype='|S1')

## Format conversion

In [55]:
npz_fn = 'sample.npz'
# Demonstrate the overwrite guard: with overwrite=False an existing output file is
# refused with a ValueError. The error is caught and displayed so that
# Restart & Run All does not stop here when sample.npz is left from an earlier run.
try:
    vcf_to_npz(sample_vcf_fn, npz_fn, fields='*', chunk_length=3, overwrite=False)
except ValueError as e:
    print('%s: %s' % (type(e).__name__, e))

ValueError: file exists at path 'sample.npz'; use overwrite=True to replace

In [56]:
# Same conversion with overwrite=True, replacing any existing file at npz_fn.
vcf_to_npz(sample_vcf_fn, npz_fn, fields='*', chunk_length=3, overwrite=True)

In [57]:
# Check that the archive was written, and its size.
!ls -lh {npz_fn}

-rw-r--r-- 1 aliman aliman 4.3K Jun  8 15:59 sample.npz


In [58]:
import numpy as np

In [59]:
# Re-open the archive with NumPy; the result is an NpzFile (see repr below) whose
# arrays are looked up by key in the following cells.
# NOTE(review): the handle is never closed in this notebook.
callset = np.load(npz_fn)
callset

<numpy.lib.npyio.NpzFile at 0x7f196851eba8>

In [60]:
# Arrays stored in the .npz archive.
sorted(callset.keys())

['calldata/DP',
 'calldata/GQ',
 'calldata/GT',
 'calldata/HQ',
 'samples',
 'variants/AA',
 'variants/AC',
 'variants/AF',
 'variants/ALT',
 'variants/AN',
 'variants/CHROM',
 'variants/DB',
 'variants/DP',
 'variants/FILTER_PASS',
 'variants/FILTER_q10',
 'variants/FILTER_s50',
 'variants/H2',
 'variants/ID',
 'variants/NS',
 'variants/POS',
 'variants/QUAL',
 'variants/REF']

In [61]:
# Sample names round-tripped through the archive.
callset['samples']

array([b'NA00001', b'NA00002', b'NA00003'], 
      dtype='|S7')

In [62]:
# POS for all 9 variants, although the conversion ran with chunk_length=3.
callset['variants/POS']

array([    111,     112,   14370,   17330, 1110696, 1230237, 1234567,
       1235237,      10], dtype=int32)

In [63]:
# CHROM for all variants ('|S12' here; no `types` override was passed to vcf_to_npz).
callset['variants/CHROM']

array([b'19', b'19', b'20', b'20', b'20', b'20', b'20', b'20', b'X'], 
      dtype='|S12')

In [64]:
# Genotype calls for all variants as stored in the archive.
callset['calldata/GT']

array([[[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 1,  0],
        [ 1,  1]],

       [[ 0,  0],
        [ 0,  1],
        [ 0,  0]],

       [[ 1,  2],
        [ 2,  1],
        [ 2,  2]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  1],
        [ 0,  2],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [-1, -1]],

       [[ 0, -1],
        [ 0,  1],
        [ 0,  2]]], dtype=int8)

In [65]:
hdf5_fn = 'sample.h5'
# Demonstrate the overwrite guard: without overwrite=True, datasets that already
# exist in the HDF5 file are refused with a ValueError. The error is caught and
# displayed so that Restart & Run All does not stop here when sample.h5 is left
# from an earlier run.
try:
    vcf_to_hdf5(sample_vcf_fn, hdf5_fn, fields='*', chunk_length=3)
except ValueError as e:
    print('%s: %s' % (type(e).__name__, e))

ValueError: dataset exists at path 'samples'; use overwrite=True to replace

In [66]:
# Same conversion with overwrite=True, replacing the existing datasets.
vcf_to_hdf5(sample_vcf_fn, hdf5_fn, fields='*', chunk_length=3, overwrite=True)

In [67]:
# Check that the HDF5 file was written, and its size.
!ls -lh {hdf5_fn}

-rw-r--r-- 1 aliman aliman 88K Jun  8 15:59 sample.h5


In [68]:
# Top-level layout of the HDF5 file.
!h5ls {hdf5_fn}

calldata                 Group
samples                  Dataset {3}
variants                 Group


In [69]:
# Datasets under /variants.
# NOTE(review): the listing includes a 'FILTER' dataset that is not among the keys
# of the .npz archive above — possibly left over from an earlier run, since the
# file is overwritten in place; confirm.
!h5ls {hdf5_fn}/variants

AA                       Dataset {9/Inf}
AC                       Dataset {9/Inf, 3}
AF                       Dataset {9/Inf, 3}
ALT                      Dataset {9/Inf, 3}
AN                       Dataset {9/Inf}
CHROM                    Dataset {9/Inf}
DB                       Dataset {9/Inf}
DP                       Dataset {9/Inf}
FILTER                   Dataset {9/Inf, 1}
FILTER_PASS              Dataset {9/Inf}
FILTER_q10               Dataset {9/Inf}
FILTER_s50               Dataset {9/Inf}
H2                       Dataset {9/Inf}
ID                       Dataset {9/Inf}
NS                       Dataset {9/Inf}
POS                      Dataset {9/Inf}
QUAL                     Dataset {9/Inf}
REF                      Dataset {9/Inf}


In [70]:
# Inspect a single dataset under /variants.
!h5ls {hdf5_fn}/variants/CHROM

CHROM                    Dataset {9/Inf}


In [71]:
# Datasets under /calldata.
!h5ls {hdf5_fn}/calldata

DP                       Dataset {9/Inf, 3}
GQ                       Dataset {9/Inf, 3}
GT                       Dataset {9/Inf, 3, 2}
HQ                       Dataset {9/Inf, 3, 2}


In [72]:
# Inspect the genotype dataset on its own.
!h5ls {hdf5_fn}/calldata/GT

GT                       Dataset {9/Inf, 3, 2}


In [73]:
import h5py

In [74]:
# Read a few datasets back through h5py. The values are printed while the file
# is still open, and the file is closed when the block exits.
with h5py.File(hdf5_fn, mode='r') as h5f:
    for dataset_path in ('samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'):
        print(h5f[dataset_path][:])
    

[b'NA00001' b'NA00002' b'NA00003']
[b'19' b'19' b'20' b'20' b'20' b'20' b'20' b'20' b'X']
[    111     112   14370   17330 1110696 1230237 1234567 1235237      10]
[[[ 0  0]
  [ 0  0]
  [ 0  1]]

 [[ 0  0]
  [ 0  0]
  [ 0  1]]

 [[ 0  0]
  [ 1  0]
  [ 1  1]]

 [[ 0  0]
  [ 0  1]
  [ 0  0]]

 [[ 1  2]
  [ 2  1]
  [ 2  2]]

 [[ 0  0]
  [ 0  0]
  [ 0  0]]

 [[ 0  1]
  [ 0  2]
  [-1 -1]]

 [[ 0  0]
  [ 0  0]
  [-1 -1]]

 [[ 0 -1]
  [ 0  1]
  [ 0  2]]]


In [75]:
zarr_fn = 'sample.zarr'
# Demonstrate the overwrite guard: without overwrite=True, an array that already
# exists in the store is refused (a KeyError in the recorded run). The error is
# caught and displayed so that Restart & Run All does not stop here when
# sample.zarr is left from an earlier run.
try:
    vcf_to_zarr(sample_vcf_fn, zarr_fn, fields='*', chunk_length=3)
except KeyError as e:
    print('%s: %s' % (type(e).__name__, e))

KeyError: "path 'samples' contains an array"

In [76]:
# Same conversion with overwrite=True, replacing the existing arrays.
vcf_to_zarr(sample_vcf_fn, zarr_fn, fields='*', chunk_length=3, overwrite=True)

In [77]:
import zarr
# Re-open the store written above. The path comes from `zarr_fn` rather than a
# second hard-coded literal, and the group is opened read-only because the
# following cells only inspect it.
callset = zarr.open_group(zarr_fn, mode='r')
callset

Group(/, 3)
  arrays: 1; samples
  groups: 2; calldata, variants
  store: DirectoryStore

In [78]:
# Sample names read back from the zarr store; [:] loads the array into memory.
callset['samples'][:]

array([b'NA00001', b'NA00002', b'NA00003'], 
      dtype='|S7')

In [79]:
# The /variants group.
# NOTE(review): like the HDF5 file, the listing includes a 'FILTER' array that the
# .npz key list does not have; confirm whether it is a leftover from an earlier run.
callset['variants']

Group(/variants, 18)
  arrays: 18; AA, AC, AF, ALT, AN, CHROM, DB, DP, FILTER, FILTER_PASS, FILTE...
  store: DirectoryStore

In [80]:
# POS for all variants, read back from the zarr store.
callset['variants/POS'][:]

array([    111,     112,   14370,   17330, 1110696, 1230237, 1234567,
       1235237,      10], dtype=int32)

In [81]:
# CHROM for all variants, read back from the zarr store.
callset['variants/CHROM'][:]

array([b'19', b'19', b'20', b'20', b'20', b'20', b'20', b'20', b'X'], 
      dtype='|S12')

In [82]:
# The /calldata group (four arrays in the recorded output).
callset['calldata']

Group(/calldata, 4)
  arrays: 4; DP, GQ, GT, HQ
  store: DirectoryStore

In [83]:
# Genotype calls read back from the zarr store.
callset['calldata/GT'][:]

array([[[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  1]],

       [[ 0,  0],
        [ 1,  0],
        [ 1,  1]],

       [[ 0,  0],
        [ 0,  1],
        [ 0,  0]],

       [[ 1,  2],
        [ 2,  1],
        [ 2,  2]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  1],
        [ 0,  2],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [-1, -1]],

       [[ 0, -1],
        [ 0,  1],
        [ 0,  2]]], dtype=int8)

In [84]:
# FORMAT/GQ for all 9 variants, read back from the zarr store.
callset['calldata/GQ'][:]

array([[-1, -1, -1],
       [-1, -1, -1],
       [48, 48, 43],
       [49,  3, 41],
       [21,  2, 35],
       [54, 48, 61],
       [-1, 17, 40],
       [-1, -1, -1],
       [-1, -1, -1]], dtype=int8)

## Profiling

In [1]:
import sys
sys.path.insert(0, '../..')
import cProfile
from allel.io_vcf_read import read_vcf, vcf_to_npz, vcf_to_hdf5, vcf_to_zarr

# Same input paths as in the first part; redefined because this section starts
# again from execution count 1.
sample_vcf_fn = '../../fixture/sample.vcf'
prof_vcf_fn = '../../profdata/2L_2358158_2431617.vcf'

In [2]:
# Time one read of the profiling file with every field.
# NOTE(review): the returned dict is echoed as cell output; assigning it to `_`
# (as the next cell does) would leave only the timing.
%time read_vcf(prof_vcf_fn, fields='*', chunk_length=1000)

CPU times: user 92 ms, sys: 8 ms, total: 100 ms
Wall time: 94.3 ms


{'calldata/GT': array([[[0, 0],
         [0, 0],
         [0, 0],
         ..., 
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ..., 
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ..., 
         [0, 0],
         [0, 0],
         [0, 0]],
 
        ..., 
        [[0, 0],
         [0, 0],
         [0, 0],
         ..., 
         [0, 0],
         [0, 0],
         [0, 0]],
 
        [[1, 1],
         [0, 1],
         [1, 1],
         ..., 
         [0, 0],
         [1, 1],
         [0, 1]],
 
        [[0, 0],
         [0, 0],
         [0, 0],
         ..., 
         [0, 0],
         [0, 0],
         [0, 0]]], dtype=int8),
 'samples': array(['AB0085-C', 'AB0087-C', 'AB0088-C', 'AB0089-C', 'AB0090-C',
        'AB0091-C', 'AB0092-C', 'AB0094-C', 'AB0095-C', 'AB0097-C',
        'AB0098-C', 'AB0099-C', 'AB0100-C', 'AB0101-C', 'AB0103-C',
        'AB0104-C', 'AB0

In [3]:
# Time a read with n_threads=1 and block_length=100; the result goes to `_` so
# only the timing is displayed.
%time _ = read_vcf(prof_vcf_fn, n_threads=1, chunk_length=1000, block_length=100)

CPU times: user 116 ms, sys: 4 ms, total: 120 ms
Wall time: 208 ms


In [4]:
!cat {prof_vcf_fn} | wc -l

1979


In [5]:
#!zcat ../../profdata/prof_gt.vcf.gz | wc -l

In [6]:
# Repeated timing of a default-fields read (best-of-N via %timeit).
%timeit _ = read_vcf(prof_vcf_fn, chunk_length=1000)

10 loops, best of 3: 70.4 ms per loop


In [7]:
# Profile a default-fields read, sorted by internal time.
# NOTE(review): nearly all time is attributed to one <listcomp> frame in
# io_vcf_read.py — presumably the work happens in compiled code that cProfile
# does not break down further; confirm.
cProfile.run('read_vcf(prof_vcf_fn, chunk_length=1000)', sort='time')

         184 function calls in 0.093 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.089    0.089    0.089    0.089 io_vcf_read.py:261(<listcomp>)
        1    0.002    0.002    0.003    0.003 io_vcf_read.py:1433(_iter_vcf_stream)
        8    0.001    0.000    0.001    0.000 {built-in method numpy.core.multiarray.concatenate}
        1    0.000    0.000    0.093    0.093 <string>:1(<module>)
        1    0.000    0.000    0.003    0.003 io_vcf_read.py:857(iter_vcf_chunks)
        1    0.000    0.000    0.000    0.000 io_vcf_read.py:1492(_read_vcf_headers)
       17    0.000    0.000    0.000    0.000 {method 'split' of 'str' objects}
        1    0.000    0.000    0.093    0.093 io_vcf_read.py:191(read_vcf)
       12    0.000    0.000    0.000    0.000 {method 'readline' of 'allel.opt.io_vcf_read.FileInputStream' objects}
        1    0.000    0.000    0.093    0.093 {built-in method builtins.exec}
        1

In [8]:
# from allel.opt.io_vcf_read import vcf_genotype_parse
# import line_profiler
# l = line_profiler.LineProfiler()
# l.add_function(vcf_genotype_parse)
# l.runcall(read_vcf, prof_vcf_fn, buffer_size=2**15, chunk_length=1000)
# l.print_stats()

In [9]:
# Time the conversion to .npz in 200-variant chunks.
%time vcf_to_npz(prof_vcf_fn, 'prof.npz', chunk_length=200, overwrite=True)

CPU times: user 196 ms, sys: 8 ms, total: 204 ms
Wall time: 199 ms


In [10]:
# Time the conversion to HDF5 in 200-variant chunks.
%time vcf_to_hdf5(prof_vcf_fn, 'prof.h5', chunk_length=200, overwrite=True)

CPU times: user 164 ms, sys: 8 ms, total: 172 ms
Wall time: 167 ms


In [11]:
# Time the conversion to zarr in 200-variant chunks.
%time vcf_to_zarr(prof_vcf_fn, 'prof.zarr', chunk_length=200, overwrite=True)

CPU times: user 196 ms, sys: 28 ms, total: 224 ms
Wall time: 229 ms


In [12]:
!zcat ../../profdata/prof_gt.vcf.gz | wc -l

198679


In [12]:
# Baseline: read the gzipped profiling file with default settings; log=sys.stderr
# prints per-chunk progress.
%time _ = read_vcf('../../profdata/prof_gt.vcf.gz', log=sys.stderr)

[read_vcf] 65536 rows in 3.19s; chunk in 3.19s (20547 rows/s); 2L :2383306
[read_vcf] 131072 rows in 6.32s; chunk in 3.13s (20955 rows/s); 2L :2403926
[read_vcf] 196608 rows in 9.42s; chunk in 3.11s (21085 rows/s); 2L :2434126
[read_vcf] 198667 rows in 9.61s; chunk in 0.18s (11297 rows/s); :0
[read_vcf] all done (20677 rows/s)


CPU times: user 9.71 s, sys: 64 ms, total: 9.77 s
Wall time: 9.76 s


In [13]:
# Same read with n_threads=1.
%time _ = read_vcf('../../profdata/prof_gt.vcf.gz', n_threads=1, log=sys.stderr)

[read_vcf] 65536 rows in 3.71s; chunk in 3.71s (17685 rows/s); 2L :2383306
[read_vcf] 131072 rows in 7.36s; chunk in 3.65s (17947 rows/s); 2L :2403926
[read_vcf] 196608 rows in 11.02s; chunk in 3.66s (17903 rows/s); 2L :2434126
[read_vcf] 198667 rows in 11.22s; chunk in 0.20s (10187 rows/s); :0
[read_vcf] all done (17550 rows/s)


CPU times: user 11.4 s, sys: 128 ms, total: 11.5 s
Wall time: 11.5 s


In [14]:
# Same read with n_threads=2.
# NOTE(review): unlike the neighbouring cells this one has no %time, so no
# CPU/wall summary is recorded for the two-thread run.
_ = read_vcf('../../profdata/prof_gt.vcf.gz', n_threads=2, log=sys.stderr)

[read_vcf] 65536 rows in 3.66s; chunk in 3.66s (17894 rows/s); 2L :2383306
[read_vcf] 131072 rows in 7.28s; chunk in 3.62s (18092 rows/s); 2L :2403926
[read_vcf] 196608 rows in 10.90s; chunk in 3.62s (18116 rows/s); 2L :2434126
[read_vcf] 198667 rows in 11.11s; chunk in 0.20s (10075 rows/s); :0
[read_vcf] all done (17728 rows/s)


In [14]:
# Same read with n_threads=3.
%time _ = read_vcf('../../profdata/prof_gt.vcf.gz', n_threads=3, log=sys.stderr)

[read_vcf] 65536 rows in 1.67s; chunk in 1.67s (39302 rows/s); 2L :2383306
[read_vcf] 131072 rows in 3.32s; chunk in 1.65s (39739 rows/s); 2L :2403926
[read_vcf] 196608 rows in 5.16s; chunk in 1.84s (35526 rows/s); 2L :2434126
[read_vcf] 198667 rows in 5.37s; chunk in 0.21s (9895 rows/s); :0
[read_vcf] all done (35779 rows/s)


CPU times: user 12.1 s, sys: 92 ms, total: 12.2 s
Wall time: 5.73 s


In [15]:
# Same read with n_threads=4.
%time _ = read_vcf('../../profdata/prof_gt.vcf.gz', n_threads=4, log=sys.stderr)

[read_vcf] 65536 rows in 2.26s; chunk in 2.26s (28991 rows/s); 2L :2383306
[read_vcf] 131072 rows in 4.51s; chunk in 2.25s (29074 rows/s); 2L :2403926
[read_vcf] 196608 rows in 6.46s; chunk in 1.95s (33637 rows/s); 2L :2434126
[read_vcf] 198667 rows in 6.66s; chunk in 0.20s (10390 rows/s); :0
[read_vcf] all done (29477 rows/s)


CPU times: user 14.8 s, sys: 112 ms, total: 14.9 s
Wall time: 6.91 s


In [16]:
# Same read with n_threads=8.
%time _ = read_vcf('../../profdata/prof_gt.vcf.gz', n_threads=8, log=sys.stderr)

[read_vcf] 65536 rows in 1.95s; chunk in 1.95s (33672 rows/s); 2L :2383306
[read_vcf] 131072 rows in 3.64s; chunk in 1.69s (38788 rows/s); 2L :2403926
[read_vcf] 196608 rows in 5.87s; chunk in 2.24s (29291 rows/s); 2L :2434126
[read_vcf] 198667 rows in 6.13s; chunk in 0.26s (7874 rows/s); :0
[read_vcf] all done (31343 rows/s)


CPU times: user 14.1 s, sys: 124 ms, total: 14.2 s
Wall time: 6.57 s


In [23]:
# Time the gzipped-VCF to HDF5 conversion with default threading.
%time vcf_to_hdf5('../../profdata/prof_gt.vcf.gz', '../../profdata/prof_gt.h5', overwrite=True, log=sys.stderr)

[vcf_to_hdf5] 65536 rows in 3.11s; chunk in 3.11s (21104 rows/s); 2L:2383306
[vcf_to_hdf5] 131072 rows in 6.93s; chunk in 3.82s (17143 rows/s); 2L:2403926
[vcf_to_hdf5] 196608 rows in 10.78s; chunk in 3.85s (17034 rows/s); 2L:2434126
[vcf_to_hdf5] 198667 rows in 11.60s; chunk in 0.83s (2482 rows/s); 2L:2436615
[vcf_to_hdf5] all done (16397 rows/s)


CPU times: user 12.1 s, sys: 100 ms, total: 12.2 s
Wall time: 12.2 s


In [24]:
# Same HDF5 conversion with n_threads=2.
%time vcf_to_hdf5('../../profdata/prof_gt.vcf.gz', '../../profdata/prof_gt.h5', n_threads=2, overwrite=True, log=sys.stderr)

[vcf_to_hdf5] 65536 rows in 2.40s; chunk in 2.40s (27332 rows/s); 2L:2383306
[vcf_to_hdf5] 131072 rows in 5.42s; chunk in 3.02s (21714 rows/s); 2L:2403926
[vcf_to_hdf5] 196608 rows in 8.42s; chunk in 3.00s (21842 rows/s); 2L:2434126
[vcf_to_hdf5] 198667 rows in 9.23s; chunk in 0.81s (2545 rows/s); 2L:2436615
[vcf_to_hdf5] all done (20347 rows/s)


CPU times: user 20.1 s, sys: 116 ms, total: 20.2 s
Wall time: 9.86 s


In [25]:
# Time the gzipped-VCF to zarr conversion with default threading.
%time vcf_to_zarr('../../profdata/prof_gt.vcf.gz', '../../profdata/prof_gt.zarr', overwrite=True, log=sys.stderr)

[vcf_to_zarr] 65536 rows in 3.10s; chunk in 3.10s (21118 rows/s); 2L:2383306
[vcf_to_zarr] 131072 rows in 6.23s; chunk in 3.13s (20937 rows/s); 2L:2403926
[vcf_to_zarr] 196608 rows in 9.32s; chunk in 3.09s (21241 rows/s); 2L:2434126
[vcf_to_zarr] 198667 rows in 9.56s; chunk in 0.24s (8454 rows/s); 2L:2436615
[vcf_to_zarr] all done (20699 rows/s)


CPU times: user 9.71 s, sys: 140 ms, total: 9.85 s
Wall time: 9.69 s


In [29]:
# Same zarr conversion with n_threads=2.
%time vcf_to_zarr('../../profdata/prof_gt.vcf.gz', '../../profdata/prof_gt.zarr', n_threads=2, overwrite=True, log=sys.stderr)

[vcf_to_zarr] 65536 rows in 1.68s; chunk in 1.68s (39087 rows/s); 2L:2383306
[vcf_to_zarr] 131072 rows in 3.39s; chunk in 1.71s (38350 rows/s); 2L:2403926
[vcf_to_zarr] 196608 rows in 5.07s; chunk in 1.69s (38845 rows/s); 2L:2434126
[vcf_to_zarr] 198667 rows in 5.33s; chunk in 0.26s (7916 rows/s); 2L:2436615
[vcf_to_zarr] all done (35716 rows/s)


CPU times: user 11.5 s, sys: 124 ms, total: 11.6 s
Wall time: 5.65 s


In [19]:
# Time a read restricted to region X:5000000-7000000.
# NOTE(review): the recorded output begins with a warning fragment ('scanning to
# region') and the first chunk takes ~7 s against ~0.7 s for later ones —
# presumably the reader scanned from the start of the file instead of using an
# index; confirm. The returned dict is also echoed; assign to `_` to show only timing.
%time read_vcf('../../profdata/accessibility.X.vcf.gz', chunk_length=500000, log=sys.stderr, region='X:5000000-7000000')

  'scanning to region' % e)
[read_vcf] 500000 rows in 7.04s; chunk in 7.04s (71057 rows/s); X :5499999
[read_vcf] 1000000 rows in 7.72s; chunk in 0.69s (728332 rows/s); X :5999999
[read_vcf] 1500000 rows in 8.43s; chunk in 0.71s (708055 rows/s); X :6499999
[read_vcf] 2000000 rows in 9.14s; chunk in 0.71s (705014 rows/s); X :6999999
[read_vcf] 2000001 rows in 9.14s; chunk in 0.00s (322 rows/s); X :7000001
[read_vcf] all done (218765 rows/s)


CPU times: user 9.1 s, sys: 72 ms, total: 9.18 s
Wall time: 9.18 s


{'samples': array([], 
       dtype='|S32'), 'variants/ALT': array([[b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        ..., 
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T']], 
       dtype='|S1'), 'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'), 'variants/FILTER_PASS': array([ True,  True,  True, ...,  True,  True,  True], dtype=bool), 'variants/ID': array([b'.', b'.', b'.', ..., b'.', b'.', b'.'], 
       dtype='|S12'), 'variants/POS': array([5000000, 5000001, 5000002, ..., 6999998, 6999999, 7000000], dtype=int32), 'variants/QUAL': array([ nan,  nan,  nan, ...,  nan,  nan,  nan], dtype=float32), 'variants/REF': array([b'C', b'C', b'C', ..., b'C', b'T', b'G'], 
       dtype='|S1')}

In [5]:
# Time a full read of the accessibility VCF with default fields.
# NOTE(review): the returned dict is echoed as cell output; assign to `_` to show only timing.
%time read_vcf('../../profdata/accessibility.X.vcf.gz', chunk_length=500000, log=sys.stderr)

[read_vcf] 500000 rows in 0.77s; chunk in 0.77s (648096 rows/s); X:500000
[read_vcf] 1000000 rows in 1.49s; chunk in 0.72s (697566 rows/s); X:1000000
[read_vcf] 1500000 rows in 2.21s; chunk in 0.72s (692344 rows/s); X:1500000
[read_vcf] 2000000 rows in 2.93s; chunk in 0.72s (691420 rows/s); X:2000000
[read_vcf] 2500000 rows in 3.64s; chunk in 0.71s (706240 rows/s); X:2500000
[read_vcf] 3000000 rows in 4.34s; chunk in 0.70s (716039 rows/s); X:3000000
[read_vcf] 3500000 rows in 5.06s; chunk in 0.72s (694415 rows/s); X:3500000
[read_vcf] 4000000 rows in 5.76s; chunk in 0.70s (709976 rows/s); X:4000000
[read_vcf] 4500000 rows in 6.46s; chunk in 0.69s (720302 rows/s); X:4500000
[read_vcf] 5000000 rows in 7.16s; chunk in 0.71s (708804 rows/s); X:5000000
[read_vcf] 5500000 rows in 7.86s; chunk in 0.70s (717294 rows/s); X:5500000
[read_vcf] 6000000 rows in 8.56s; chunk in 0.70s (712266 rows/s); X:6000000
[read_vcf] 6500000 rows in 9.29s; chunk in 0.73s (689261 rows/s); X:6500000
[read_vcf] 700

CPU times: user 38.4 s, sys: 396 ms, total: 38.8 s
Wall time: 38.7 s


[read_vcf] 24393108 rows in 38.57s; chunk in 0.69s (569157 rows/s); X:24393108
[read_vcf] all done (632384 rows/s)


{'samples': array([], 
       dtype='|S32'), 'variants/ALT': array([[b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        ..., 
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T']], 
       dtype='|S1'), 'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'), 'variants/FILTER_PASS': array([False, False, False, ..., False, False, False], dtype=bool), 'variants/ID': array([b'.', b'.', b'.', ..., b'.', b'.', b'.'], 
       dtype='|S12'), 'variants/POS': array([       1,        2,        3, ..., 24393106, 24393107, 24393108], dtype=int32), 'variants/QUAL': array([ nan,  nan,  nan, ...,  nan,  nan,  nan], dtype=float32), 'variants/REF': array([b'G', b'C', b'G', ..., b'T', b'G', b'G'], 
       dtype='|S1')}

In [10]:
# Same full read with block_length=50000 and n_threads=2. The recorded wall time
# (42.7 s) is higher than the 38.7 s recorded for the run without these options.
# NOTE(review): the returned dict is echoed as cell output; assign to `_` to show only timing.
%time read_vcf('../../profdata/accessibility.X.vcf.gz', chunk_length=500000, block_length=50000, n_threads=2, log=sys.stderr)

[read_vcf] 500000 rows in 0.76s; chunk in 0.76s (655865 rows/s); X:500000
[read_vcf] 1000000 rows in 1.47s; chunk in 0.70s (711281 rows/s); X:1000000
[read_vcf] 1500000 rows in 2.41s; chunk in 0.95s (528492 rows/s); X:1500000
[read_vcf] 2000000 rows in 3.15s; chunk in 0.74s (678544 rows/s); X:2000000
[read_vcf] 2500000 rows in 3.85s; chunk in 0.70s (712446 rows/s); X:2500000
[read_vcf] 3000000 rows in 4.55s; chunk in 0.70s (717525 rows/s); X:3000000
[read_vcf] 3500000 rows in 5.26s; chunk in 0.71s (705292 rows/s); X:3500000
[read_vcf] 4000000 rows in 5.96s; chunk in 0.70s (713554 rows/s); X:4000000
[read_vcf] 4500000 rows in 6.68s; chunk in 0.72s (691443 rows/s); X:4500000
[read_vcf] 5000000 rows in 7.41s; chunk in 0.73s (687898 rows/s); X:5000000
[read_vcf] 5500000 rows in 8.20s; chunk in 0.79s (630294 rows/s); X:5500000
[read_vcf] 6000000 rows in 8.93s; chunk in 0.73s (687695 rows/s); X:6000000
[read_vcf] 6500000 rows in 10.13s; chunk in 1.20s (415256 rows/s); X:6500000
[read_vcf] 70

CPU times: user 55.8 s, sys: 556 ms, total: 56.3 s
Wall time: 42.7 s


{'samples': array([], 
       dtype='|S32'), 'variants/ALT': array([[b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        ..., 
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T']], 
       dtype='|S1'), 'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'), 'variants/FILTER_PASS': array([False, False, False, ..., False, False, False], dtype=bool), 'variants/ID': array([b'.', b'.', b'.', ..., b'.', b'.', b'.'], 
       dtype='|S12'), 'variants/POS': array([       1,        2,        3, ..., 24393106, 24393107, 24393108], dtype=int32), 'variants/QUAL': array([ nan,  nan,  nan, ...,  nan,  nan,  nan], dtype=float32), 'variants/REF': array([b'G', b'C', b'G', ..., b'T', b'G', b'G'], 
       dtype='|S1')}

In [11]:
# Full read of the accessibility VCF with every field.
# NOTE(review): the returned dict is echoed as cell output; assign to `_` to show only timing.
%time read_vcf('../../profdata/accessibility.X.vcf.gz', fields='*', chunk_length=500000, log=sys.stderr)

[read_vcf] 500000 rows in 1.39s; chunk in 1.39s (360821 rows/s); X:500000
[read_vcf] 1000000 rows in 2.79s; chunk in 1.41s (355866 rows/s); X:1000000
[read_vcf] 1500000 rows in 3.75s; chunk in 0.96s (522496 rows/s); X:1500000
[read_vcf] 2000000 rows in 4.72s; chunk in 0.97s (515406 rows/s); X:2000000
[read_vcf] 2500000 rows in 5.73s; chunk in 1.01s (494436 rows/s); X:2500000
[read_vcf] 3000000 rows in 6.87s; chunk in 1.14s (438925 rows/s); X:3000000
[read_vcf] 3500000 rows in 8.22s; chunk in 1.35s (371203 rows/s); X:3500000
[read_vcf] 4000000 rows in 9.20s; chunk in 0.99s (506288 rows/s); X:4000000
[read_vcf] 4500000 rows in 10.18s; chunk in 0.98s (512664 rows/s); X:4500000
[read_vcf] 5000000 rows in 11.16s; chunk in 0.98s (508986 rows/s); X:5000000
[read_vcf] 5500000 rows in 12.13s; chunk in 0.97s (513810 rows/s); X:5500000
[read_vcf] 6000000 rows in 13.10s; chunk in 0.97s (516498 rows/s); X:6000000
[read_vcf] 6500000 rows in 14.11s; chunk in 1.00s (497744 rows/s); X:6500000
[read_vcf

CPU times: user 53.2 s, sys: 792 ms, total: 54 s
Wall time: 54 s


{'samples': array([], 
       dtype='|S32'), 'variants/ALT': array([[b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        ..., 
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T'],
        [b'A', b'C', b'T']], 
       dtype='|S1'), 'variants/Accessible': array([False, False, False, ..., False, False, False], dtype=bool), 'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'), 'variants/Coverage': array([10955, 11176, 11579, ...,  3969,  3848,  1743], dtype=int32), 'variants/CoverageMQ0': array([   0,    0,    0, ..., 3933, 3821, 1727], dtype=int32), 'variants/FILTER_HighCoverage': array([False, False, False, ..., False, False, False], dtype=bool), 'variants/FILTER_HighMQ0': array([False, False, False, ...,  True,  True,  True], dtype=bool), 'variants/FILTER_LowCoverage': array([ True,  True,  True, ...,  True,  True,  True], dtype=bool), 'variants/FILTER_LowMQ': array([ True,  True,  True, ...,  True,  True,  True], dty

In [9]:
%time vcf_to_zarr('../../profdata/accessibility.X.vcf.gz', '../../profdata/accessibility.X.zarr', buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 38.2 s, sys: 1.08 s, total: 39.3 s
Wall time: 37.9 s


In [10]:
%time vcf_to_zarr('../../profdata/accessibility.X.vcf.gz', '../../profdata/accessibility.X.zarr', fields='*', buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 1min 2s, sys: 2.85 s, total: 1min 5s
Wall time: 1min 1s


In [37]:
!zcat ../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz | wc -l

39984


In [12]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='*', chunk_length=10000, overwrite=True, log=sys.stderr)

[vcf_to_zarr] 10000 rows in 4.51s; chunk in 4.51s (2217 rows/s); 2L:98451
[vcf_to_zarr] 20000 rows in 9.22s; chunk in 4.71s (2125 rows/s); 2L:196622
[vcf_to_zarr] 30000 rows in 13.71s; chunk in 4.49s (2226 rows/s); 2L:301246
[vcf_to_zarr] 39894 rows in 18.73s; chunk in 5.02s (1969 rows/s); 2L:399982


CPU times: user 20.2 s, sys: 400 ms, total: 20.6 s
Wall time: 19.2 s


[vcf_to_zarr] all done (2087 rows/s)


In [17]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='*', chunk_length=10000, block_length=1000, n_threads=4, overwrite=True, log=sys.stderr)

[vcf_to_zarr] 10000 rows in 3.01s; chunk in 3.01s (3321 rows/s); 2L:98451
[vcf_to_zarr] 20000 rows in 6.26s; chunk in 3.25s (3074 rows/s); 2L:196622
[vcf_to_zarr] 30000 rows in 9.38s; chunk in 3.12s (3203 rows/s); 2L:301246
[vcf_to_zarr] 39894 rows in 12.68s; chunk in 3.29s (3004 rows/s); 2L:399982


CPU times: user 22.3 s, sys: 400 ms, total: 22.7 s
Wall time: 13.2 s


[vcf_to_zarr] all done (3039 rows/s)


In [18]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='INFO', chunk_length=10000, block_length=1000, n_threads=1, overwrite=True, log=sys.stderr)

[vcf_to_zarr] 10000 rows in 2.97s; chunk in 2.97s (3369 rows/s); 2L:98451
[vcf_to_zarr] 20000 rows in 5.64s; chunk in 2.67s (3741 rows/s); 2L:196622
[vcf_to_zarr] 30000 rows in 8.20s; chunk in 2.56s (3903 rows/s); 2L:301246


CPU times: user 11 s, sys: 60 ms, total: 11 s
Wall time: 11 s


[vcf_to_zarr] 39894 rows in 10.97s; chunk in 2.77s (3574 rows/s); 2L:399982
[vcf_to_zarr] all done (3627 rows/s)


In [19]:
%time read_vcf('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', fields='INFO', chunk_length=10000, n_threads=1, log=sys.stderr)

[read_vcf] 10000 rows in 3.03s; chunk in 3.03s (3297 rows/s); 2L:98451
[read_vcf] 20000 rows in 5.65s; chunk in 2.61s (3827 rows/s); 2L:196622
[read_vcf] 30000 rows in 8.18s; chunk in 2.54s (3938 rows/s); 2L:301246


CPU times: user 10.8 s, sys: 64 ms, total: 10.9 s
Wall time: 10.9 s


[read_vcf] 39894 rows in 10.88s; chunk in 2.69s (3672 rows/s); 2L:399982
[read_vcf] all done (3667 rows/s)


{'variants/ABHet': array([        nan,         nan,  0.667     , ...,         nan,
         0.64999998,  0.755     ], dtype=float32),
 'variants/ABHom': array([ 0.801     ,  0.74699998,  0.97600001, ...,         nan,
         0.99900001,  0.93300003], dtype=float32),
 'variants/AC': array([[ 4, -1, -1],
        [38, -1, -1],
        [ 8, -1, -1],
        ..., 
        [ 1,  1, -1],
        [ 4, -1, -1],
        [15, -1, -1]], dtype=int32),
 'variants/AF': array([[ 0.14300001,         nan,         nan],
        [ 0.54299998,         nan,         nan],
        [ 0.068     ,         nan,         nan],
        ..., 
        [ 0.0006536 ,  0.0006536 ,         nan],
        [ 0.002614  ,         nan,         nan],
        [ 0.009804  ,         nan,         nan]], dtype=float32),
 'variants/AN': array([  28,   70,  118, ..., 1530, 1530, 1530], dtype=int32),
 'variants/ANN': array([b'T|intergenic', b'A|intergenic', b'A|intergenic', ...,
        b'A|intergenic', b'A|intergenic', b'G|intergenic'

## Check region and tabix

In [1]:
import sys
sys.path.insert(0, '../..')
from allel.io_vcf_read import read_vcf

In [2]:
read_vcf('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', fields=['CHROM', 'POS'], chunk_length=10000, log=sys.stderr, region='2L:1-100000')

[read_vcf] 10000 rows in 0.96s; chunk in 0.96s (10450 rows/s); 2L :98451
[read_vcf] 10301 rows in 0.99s; chunk in 0.03s (8646 rows/s); :0
[read_vcf] all done (10378 rows/s)


{'variants/CHROM': array([b'2L', b'2L', b'2L', ..., b'2L', b'2L', b'2L'], 
       dtype='|S12'),
 'variants/POS': array([  103,   163,   192, ..., 99993, 99996, 99997], dtype=int32)}

In [3]:
read_vcf('../../profdata/accessibility.X.vcf.gz', fields=['CHROM', 'POS'], chunk_length=100000, log=sys.stderr, region='X:1000000-2000000')

  'scanning to region' % e)
[read_vcf] 100000 rows in 1.43s; chunk in 1.43s (69955 rows/s); X :1099999
[read_vcf] 200000 rows in 1.59s; chunk in 0.16s (636490 rows/s); X :1199999
[read_vcf] 300000 rows in 1.74s; chunk in 0.15s (646018 rows/s); X :1299999
[read_vcf] 400000 rows in 1.90s; chunk in 0.16s (625514 rows/s); X :1399999
[read_vcf] 500000 rows in 2.06s; chunk in 0.16s (628348 rows/s); X :1499999
[read_vcf] 600000 rows in 2.22s; chunk in 0.16s (644555 rows/s); X :1599999
[read_vcf] 700000 rows in 2.37s; chunk in 0.15s (645166 rows/s); X :1699999
[read_vcf] 800000 rows in 2.53s; chunk in 0.16s (634715 rows/s); X :1799999
[read_vcf] 900000 rows in 2.70s; chunk in 0.17s (597541 rows/s); X :1899999
[read_vcf] 1000000 rows in 2.85s; chunk in 0.15s (647056 rows/s); X :1999999
[read_vcf] 1000001 rows in 2.85s; chunk in 0.00s (1495 rows/s); X :2000001
[read_vcf] all done (350738 rows/s)


{'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'),
 'variants/POS': array([1000000, 1000001, 1000002, ..., 1999998, 1999999, 2000000], dtype=int32)}

In [4]:
read_vcf('../../profdata/accessibility.X.vcf.gz', fields=['CHROM', 'POS'], chunk_length=100000, log=sys.stderr, region='X:1000000-2000000', tabix=None)

[read_vcf] 100000 rows in 1.46s; chunk in 1.46s (68691 rows/s); X :1099999
[read_vcf] 200000 rows in 1.62s; chunk in 0.16s (626667 rows/s); X :1199999
[read_vcf] 300000 rows in 1.77s; chunk in 0.16s (638279 rows/s); X :1299999
[read_vcf] 400000 rows in 1.93s; chunk in 0.16s (614899 rows/s); X :1399999
[read_vcf] 500000 rows in 2.10s; chunk in 0.17s (591599 rows/s); X :1499999
[read_vcf] 600000 rows in 2.27s; chunk in 0.17s (605465 rows/s); X :1599999
[read_vcf] 700000 rows in 2.43s; chunk in 0.16s (611707 rows/s); X :1699999
[read_vcf] 800000 rows in 2.60s; chunk in 0.17s (595219 rows/s); X :1799999
[read_vcf] 900000 rows in 2.77s; chunk in 0.17s (587222 rows/s); X :1899999
[read_vcf] 1000000 rows in 2.95s; chunk in 0.18s (561287 rows/s); X :1999999
[read_vcf] 1000001 rows in 2.95s; chunk in 0.00s (782 rows/s); X :2000001
[read_vcf] all done (338868 rows/s)


{'variants/CHROM': array([b'X', b'X', b'X', ..., b'X', b'X', b'X'], 
       dtype='|S12'),
 'variants/POS': array([1000000, 1000001, 1000002, ..., 1999998, 1999999, 2000000], dtype=int32)}

## Profile INFO

In [1]:
import sys
sys.path.insert(0, '../..')
import cProfile
from allel.io_vcf_read import read_vcf, vcf_to_npz, vcf_to_hdf5, vcf_to_zarr, ANNTransformer, vcf_to_csv, \
    vcf_to_dataframe, vcf_to_recarray
# from allel.opt.io_vcf_read import (iter_vcf, 
#                                    CalldataParser_parse, 
#                                    GenotypeInt8Parser_parse, 
#                                    ParserContext_next, 
#                                    BufferedReader_read
#                                  )

sample_vcf_fn = '../../fixture/sample.vcf'
prof_vcf_fn = '../../profdata/2L_2358158_2431617.vcf'

In [2]:
# Export every available field for the X:1000000-2000000 window to a
# tab-separated file, logging per-chunk progress to stderr.
# NOTE(review): tabix=None presumably bypasses the tabix lookup so the region
# is located by scanning the file - confirm against the reader's docs.
vcf_to_csv(
    '../../profdata/accessibility.X.vcf.gz',
    '../../profdata/accessibility.X.tsv',
    fields='*',
    chunk_length=100000,
    log=sys.stderr,
    region='X:1000000-2000000',
    tabix=None,
    sep='\t',
)

[vcf_to_csv] 100000 rows in 1.47s; chunk in 1.47s (68026 rows/s); X :1099999
[vcf_to_csv] 200000 rows in 2.90s; chunk in 1.43s (69884 rows/s); X :1199999
[vcf_to_csv] 300000 rows in 4.27s; chunk in 1.37s (73235 rows/s); X :1299999
[vcf_to_csv] 400000 rows in 5.63s; chunk in 1.36s (73475 rows/s); X :1399999
[vcf_to_csv] 500000 rows in 6.98s; chunk in 1.36s (73719 rows/s); X :1499999
[vcf_to_csv] 600000 rows in 8.35s; chunk in 1.36s (73444 rows/s); X :1599999
[vcf_to_csv] 700000 rows in 9.70s; chunk in 1.36s (73627 rows/s); X :1699999
[vcf_to_csv] 800000 rows in 11.11s; chunk in 1.41s (71133 rows/s); X :1799999
[vcf_to_csv] 900000 rows in 12.49s; chunk in 1.38s (72488 rows/s); X :1899999
[vcf_to_csv] 1000000 rows in 13.87s; chunk in 1.38s (72423 rows/s); X :1999999
[vcf_to_csv] 1000001 rows in 15.05s; chunk in 1.18s (0 rows/s); X :2000001
[vcf_to_csv] all done (66422 rows/s)


In [3]:
!head ../../profdata/accessibility.X.tsv

CHROM	POS	ID	REF	ALT_1	ALT_2	ALT_3	QUAL	HighCoverage	RepeatTRF	Coverage	LowPairing	Accessible	CoverageMQ0	RefN	HighMQ0	NoCoverage	LowMQ	RepeatMasker	RefMasked	LowCoverage	RepeatDUST	FILTER_PASS	FILTER_HighCoverage	FILTER_HighMQ0	FILTER_NoCoverage	FILTER_LowMQ	FILTER_LowCoverage	FILTER_RefN	FILTER_RepeatDUST	numalt	svlen_1	svlen_2	svlen_3
X	1000000	.	T	A	C	T		3	False	25835	2	True	2	False	0	0	0	False	False	1	False	True	False	False	False	False	False	False	False	4	0	0	0
X	1000001	.	C	A	C	T		2	False	25854	2	True	2	False	0	0	0	False	False	1	False	True	False	False	False	False	False	False	False	4	0	0	0
X	1000002	.	A	A	C	T		2	False	25708	2	True	2	False	0	0	0	False	False	1	False	True	False	False	False	False	False	False	False	4	0	0	0
X	1000003	.	C	A	C	T		2	False	25662	2	True	2	False	0	0	0	False	False	1	False	True	False	False	False	False	False	False	False	4	0	0	0
X	1000004	.	A	A	C	T		2	False	25626	2	True	2	False	0	0	0	False	False	1	False	True	False	False	False	False	False	False	False	4	0	0	0

In [2]:
# Load every available field for the X:1000000-1500000 window into a
# DataFrame (progress goes to stderr), then preview the first rows.
# NOTE(review): tabix=None presumably bypasses the tabix lookup so the region
# is located by scanning the file - confirm against the reader's docs.
df = vcf_to_dataframe(
    '../../profdata/accessibility.X.vcf.gz',
    fields='*',
    chunk_length=100000,
    log=sys.stderr,
    region='X:1000000-1500000',
    tabix=None,
)
df.head()

[vcf_to_dataframe] 100000 rows in 1.46s; chunk in 1.46s (68271 rows/s); X :1099999
[vcf_to_dataframe] 200000 rows in 1.68s; chunk in 0.21s (466964 rows/s); X :1199999
[vcf_to_dataframe] 300000 rows in 1.89s; chunk in 0.21s (466957 rows/s); X :1299999
[vcf_to_dataframe] 400000 rows in 2.11s; chunk in 0.21s (471392 rows/s); X :1399999
[vcf_to_dataframe] 500000 rows in 2.31s; chunk in 0.21s (486027 rows/s); X :1499999
[vcf_to_dataframe] 500001 rows in 2.31s; chunk in 0.00s (336 rows/s); X :1500001
[vcf_to_dataframe] all done (216024 rows/s)


Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,RefMasked,LowMQ,...,FILTER_HighMQ0,FILTER_NoCoverage,FILTER_RefN,FILTER_RepeatDUST,FILTER_LowCoverage,FILTER_HighCoverage,numalt,svlen_1,svlen_2,svlen_3
0,X,1000000,.,T,A,C,T,,False,0,...,False,False,False,False,False,False,4,0,0,0
1,X,1000001,.,C,A,C,T,,False,0,...,False,False,False,False,False,False,4,0,0,0
2,X,1000002,.,A,A,C,T,,False,0,...,False,False,False,False,False,False,4,0,0,0
3,X,1000003,.,C,A,C,T,,False,0,...,False,False,False,False,False,False,4,0,0,0
4,X,1000004,.,A,A,C,T,,False,0,...,False,False,False,False,False,False,4,0,0,0


In [3]:
# Same region and field selection as the DataFrame cell, but materialised as a
# NumPy record array; `ra` is consumed by the VariantTable cell that follows.
# NOTE(review): tabix=None presumably bypasses the tabix lookup so the region
# is located by scanning the file - confirm against the reader's docs.
ra = vcf_to_recarray(
    '../../profdata/accessibility.X.vcf.gz',
    fields='*',
    chunk_length=100000,
    log=sys.stderr,
    region='X:1000000-1500000',
    tabix=None,
)
ra

[vcf_to_recarray] 100000 rows in 1.54s; chunk in 1.54s (64981 rows/s); X :1099999
[vcf_to_recarray] 200000 rows in 1.85s; chunk in 0.31s (317530 rows/s); X :1199999
[vcf_to_recarray] 300000 rows in 2.09s; chunk in 0.24s (418168 rows/s); X :1299999
[vcf_to_recarray] 400000 rows in 2.31s; chunk in 0.21s (469614 rows/s); X :1399999
[vcf_to_recarray] 500000 rows in 2.51s; chunk in 0.21s (478443 rows/s); X :1499999
[vcf_to_recarray] 500001 rows in 2.52s; chunk in 0.00s (278 rows/s); X :1500001
[vcf_to_recarray] all done (198483 rows/s)


array([ (b'X', 1000000, b'.', b'T', b'A', b'C', b'T', nan, False, 0, 0, True, 2, False, False, 2, False, 1, 0, False, 25835, 3, True, False, False, False, False, False, False, False, 4, 0, 0, 0),
       (b'X', 1000001, b'.', b'C', b'A', b'C', b'T', nan, False, 0, 0, True, 2, False, False, 2, False, 1, 0, False, 25854, 2, True, False, False, False, False, False, False, False, 4, 0, 0, 0),
       (b'X', 1000002, b'.', b'A', b'A', b'C', b'T', nan, False, 0, 0, True, 2, False, False, 2, False, 1, 0, False, 25708, 2, True, False, False, False, False, False, False, False, 4, 0, 0, 0),
       ...,
       (b'X', 1499998, b'.', b'G', b'A', b'C', b'T', nan, False, 0, 0, True, 12, False, False, 4, False, 9, 0, False, 21716, 0, True, False, False, False, False, False, False, False, 4, 0, 0, 0),
       (b'X', 1499999, b'.', b'C', b'A', b'C', b'T', nan, False, 0, 0, True, 12, False, False, 4, False, 11, 0, False, 21698, 0, True, False, False, False, False, False, False, False, 4, 0, 0, 0),
       (b

In [4]:
import allel
allel.VariantTable(ra)

Unnamed: 0,CHROM,POS,ID,REF,ALT_1,ALT_2,ALT_3,QUAL,RefMasked,LowMQ,HighMQ0,Accessible,CoverageMQ0,RepeatDUST,RepeatMasker,LowPairing,RepeatTRF,LowCoverage,NoCoverage,RefN,Coverage,HighCoverage,FILTER_PASS,FILTER_LowMQ,FILTER_HighMQ0,FILTER_NoCoverage,FILTER_RefN,FILTER_RepeatDUST,FILTER_LowCoverage,FILTER_HighCoverage,numalt,svlen_1,svlen_2,svlen_3,Unnamed: 35
0,b'X',1000000,b'.',b'T',b'A',b'C',b'T',,False,0,0,True,2,False,False,2,False,1,0,False,25835,3,True,False,False,False,False,False,False,False,4,0,0,0,
1,b'X',1000001,b'.',b'C',b'A',b'C',b'T',,False,0,0,True,2,False,False,2,False,1,0,False,25854,2,True,False,False,False,False,False,False,False,4,0,0,0,
2,b'X',1000002,b'.',b'A',b'A',b'C',b'T',,False,0,0,True,2,False,False,2,False,1,0,False,25708,2,True,False,False,False,False,False,False,False,4,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499998,b'X',1499998,b'.',b'G',b'A',b'C',b'T',,False,0,0,True,12,False,False,4,False,9,0,False,21716,0,True,False,False,False,False,False,False,False,4,0,0,0,
499999,b'X',1499999,b'.',b'C',b'A',b'C',b'T',,False,0,0,True,12,False,False,4,False,11,0,False,21698,0,True,False,False,False,False,False,False,False,4,0,0,0,
500000,b'X',1500000,b'.',b'G',b'A',b'C',b'T',,False,0,0,True,12,False,False,4,False,12,0,False,21756,0,True,False,False,False,False,False,False,False,4,0,0,0,


In [2]:
read_vcf('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', fields='ANN', types={'ANN': 'S200'}, chunk_length=10000, log=sys.stderr)

[read_vcf] 10000 rows in 2.54s; chunk in 2.54s (3934 rows/s); 2L :98451
[read_vcf] 20000 rows in 5.02s; chunk in 2.48s (4031 rows/s); 2L :196622
[read_vcf] 30000 rows in 7.44s; chunk in 2.41s (4144 rows/s); 2L :301246
[read_vcf] 39894 rows in 10.02s; chunk in 2.58s (3835 rows/s); :0
[read_vcf] all done (3983 rows/s)


{'variants/ANN': array([ b'T|intergenic_region|MODIFIER|AGAP004677|AGAP004677|intergenic_region|AGAP004677|||||||||',
        b'A|intergenic_region|MODIFIER|AGAP004677|AGAP004677|intergenic_region|AGAP004677|||||||||',
        b'A|intergenic_region|MODIFIER|AGAP004677|AGAP004677|intergenic_region|AGAP004677|||||||||',
        ...,
        b'A|intergenic_region|MODIFIER|AGAP004681-AGAP004682|AGAP004681-AGAP004682|intergenic_region|AGAP004681-AGAP004682|||||||||',
        b'A|intergenic_region|MODIFIER|AGAP004681-AGAP004682|AGAP004681-AGAP004682|intergenic_region|AGAP004681-AGAP004682|||||||||',
        b'G|intergenic_region|MODIFIER|AGAP004681-AGAP004682|AGAP004681-AGAP004682|intergenic_region|AGAP004681-AGAP004682|||||||||'], 
       dtype='|S200')}

In [3]:
read_vcf('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', fields='ANN', transformers=[ANNTransformer()], chunk_length=10000, log=sys.stderr)

[read_vcf] 10000 rows in 2.55s; chunk in 2.55s (3915 rows/s); 2L :98451
[read_vcf] 20000 rows in 5.05s; chunk in 2.50s (4002 rows/s); 2L :196622
[read_vcf] 30000 rows in 7.48s; chunk in 2.43s (4112 rows/s); 2L :301246
[read_vcf] 39894 rows in 10.07s; chunk in 2.59s (3821 rows/s); :0
[read_vcf] all done (3960 rows/s)


{'variants/ANN_AA': array([[-1, -1],
        [-1, -1],
        [-1, -1],
        ..., 
        [-1, -1],
        [-1, -1],
        [-1, -1]], dtype=int32),
 'variants/ANN_Allele': array([b'T', b'A', b'A', ..., b'A', b'A', b'G'], 
       dtype='|S1'),
 'variants/ANN_Annotation': array([b'intergenic_region', b'intergenic_region', b'intergenic_region',
        ..., b'intergenic_region', b'intergenic_region',
        b'intergenic_region'], 
       dtype='|S34'),
 'variants/ANN_Annotation_Impact': array([b'MODIFIER', b'MODIFIER', b'MODIFIER', ..., b'MODIFIER',
        b'MODIFIER', b'MODIFIER'], 
       dtype='|S8'),
 'variants/ANN_CDS': array([[-1, -1],
        [-1, -1],
        [-1, -1],
        ..., 
        [-1, -1],
        [-1, -1],
        [-1, -1]], dtype=int32),
 'variants/ANN_Distance': array([-1, -1, -1, ..., -1, -1, -1], dtype=int32),
 'variants/ANN_Feature_ID': array([b'AGAP004677', b'AGAP004677', b'AGAP004677', ..., b'AGAP004681-AGA',
        b'AGAP004681-AGA', b'AGAP004681-AGA

In [41]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='*', buffer_size=2**15, chunk_length=10000, block_length=1000, n_threads=4, overwrite=True)

CPU times: user 24.8 s, sys: 248 ms, total: 25.1 s
Wall time: 14.5 s


In [17]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields=['CHROM', 'POS'], buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 10.5 s, sys: 32 ms, total: 10.5 s
Wall time: 10.5 s


In [2]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='calldata/*', buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 19.6 s, sys: 1.09 s, total: 20.7 s
Wall time: 18.8 s


In [3]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='calldata/GT', buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 12.5 s, sys: 60 ms, total: 12.6 s
Wall time: 12.5 s


In [4]:
%time vcf_to_zarr('../../profdata/ag1000g.phase1.ar3.2L.partial.vcf.gz', '../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', fields='*', buffer_size=2**15, chunk_length=50000, overwrite=True)

CPU times: user 20.5 s, sys: 1 s, total: 21.5 s
Wall time: 19.6 s


In [15]:
import zarr

# Open the store produced by the vcf_to_zarr cells above purely for inspection.
# mode='r' (instead of open_group's default 'a') means a wrong or missing path
# raises immediately rather than silently creating an empty group, and the
# converted data cannot be modified by accident from later cells.
callset = zarr.open_group('../../profdata/ag1000g.phase1.ar3.2L.partial.zarr', mode='r')
callset

Group(/, 2)
  groups: 2; calldata, variants
  store: DirectoryStore

In [18]:
callset['variants']['ABHet'][:]

array([        nan,         nan,  0.667     , ...,         nan,
        0.64999998,  0.755     ], dtype=float32)

In [19]:
callset['calldata']

Group(/calldata, 7)
  arrays: 7; AB, AD, DP, GQ, GT, MQ0, PL
  store: DirectoryStore

In [20]:
callset['calldata/GT']

Array(/calldata/GT, (39894, 765, 2), int8, chunks=(50000, 64, 2), order=C)
  nbytes: 58.2M; nbytes_stored: 7.2M; ratio: 8.1; initialized: 12/12
  compressor: Blosc(cname='lz4', clevel=5, shuffle=1)
  store: DirectoryStore

In [21]:
callset['calldata/GT'][:]

array([[[-1, -1],
        [-1, -1],
        [-1, -1],
        ..., 
        [ 0,  0],
        [-1, -1],
        [-1, -1]],

       [[-1, -1],
        [-1, -1],
        [-1, -1],
        ..., 
        [-1, -1],
        [-1, -1],
        [-1, -1]],

       [[-1, -1],
        [-1, -1],
        [-1, -1],
        ..., 
        [-1, -1],
        [-1, -1],
        [-1, -1]],

       ..., 
       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ..., 
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]]], dtype=int8)

In [15]:
# Back-of-envelope arithmetic: 20,000,000 divided by 20,000, then by 60.
# NOTE(review): presumably (total rows) / (rows per second) / 60, i.e. an
# estimated run time in minutes for a 20M-row file - confirm the intended units.
(20000000/20000)/60

16.666666666666668

In [16]:
# Profile a single vcf_to_zarr conversion of the small profiling VCF and order
# the report by internal time (sort='time').
# The statement is passed as a string, so vcf_to_zarr and prof_vcf_fn are looked
# up in the notebook's global namespace when it runs; the setup cell must have
# been executed first. Output goes to 'prof.zarr' in the current working
# directory and is overwritten on every run (overwrite=True).
cProfile.run("vcf_to_zarr(prof_vcf_fn, 'prof.zarr', buffer_size=2**15, chunk_length=1000, overwrite=True)", sort='time')

         71850 function calls (70986 primitive calls) in 0.191 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.040    0.040    0.191    0.191 io_vcf_read.py:360(vcf_to_zarr)
       76    0.033    0.000    0.056    0.001 {built-in method builtins.next}
      377    0.023    0.000    0.023    0.000 {method 'decompress' of 'zlib.Decompress' objects}
      473    0.010    0.000    0.010    0.000 {built-in method zlib.crc32}
8157/7773    0.010    0.000    0.052    0.000 {method 'read' of '_io.BufferedReader' objects}
     7744    0.009    0.000    0.066    0.000 gzip.py:269(read)
       12    0.007    0.001    0.031    0.003 io_vcf_read.py:829(_binary_readline)
     7744    0.004    0.000    0.006    0.000 _compression.py:12(_check_not_closed)
      649    0.004    0.000    0.004    0.000 {built-in method posix.stat}
      219    0.002    0.000    0.002    0.000 {built-in method posix.unlink}
      378    0.002   

In [103]:
%prun vcf_to_hdf5(prof_vcf_fn, 'prof.h5', buffer_size=2**15, chunk_length=1000, overwrite=True)

 

In [11]:
import line_profiler
# The setup cell's import of the Cython-level parser functions is commented
# out, so bring the two that are profiled here into scope locally; without
# this the cell raises NameError on a fresh kernel (Restart & Run All).
from allel.opt.io_vcf_read import iter_vcf, GenotypeInt8Parser_parse

l = line_profiler.LineProfiler()
# Functions to time line by line. The commented-out entries are further
# candidates that are currently not registered with the profiler.
# l.add_function(_read_vcf)
l.add_function(iter_vcf)
# l.add_function(CalldataParser_parse)
l.add_function(GenotypeInt8Parser_parse)
# l.add_function(ParserContext_next)
# l.add_function(BufferedReader_read)
# Run one read of the profiling VCF under the profiler, then print the
# per-line timing report for the registered functions.
l.runcall(read_vcf, prof_vcf_fn, buffer_size=2**15, chunk_length=1000)
l.print_stats()

Timer unit: 1e-06 s

Total time: 23.9421 s
File: /home/aliman/src/github/cggh/scikit-allel/allel/opt/io_vcf_read.pyx
Function: iter_vcf at line 71

Line #      Hits         Time  Per Hit   % Time  Line Contents
    71                                           def iter_vcf(binary_file, int buffer_size, int chunk_length, int temp_max_size, headers, fields,
    72                                                        types, numbers):
    73                                               cdef:
    74                                                   ParserContext context
    75                                                   Parser chrom_parser
    76                                                   Parser pos_parser
    77                                                   Parser id_parser
    78                                                   Parser ref_parser
    79                                                   Parser alt_parser
    80                                            

In [26]:
import vcfnp

In [28]:
%time vcfnp.calldata(prof_vcf_fn, fields=('genotype',))

[vcfnp] 2017-05-24 14:49:30.651145 :: caching is disabled
[vcfnp] 2017-05-24 14:49:30.651788 :: building array


CPU times: user 4.31 s, sys: 0 ns, total: 4.31 s
Wall time: 4.3 s


array([ (([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), ([0, 0],), (

In [29]:
# Ratio of the vcfnp.calldata wall time reported in the cell above (4.3 s) to 0.09.
# NOTE(review): 0.09 is presumably the read_vcf time in seconds on the same
# file, making this a speed-up factor - that timing is not shown nearby; confirm.
4.3 / 0.09

47.77777777777778

## Legacy

In [None]:
vcf_block_read(vcf_fn, buffer_size=2**15, block_size=2**25)

In [3]:
%time spike_read_len(vcf_fn, buffer_size=10)

CPU times: user 700 ms, sys: 0 ns, total: 700 ms
Wall time: 697 ms


6140661

In [4]:
%timeit spike_read_len(vcf_fn, buffer_size=100)

10 loops, best of 3: 105 ms per loop


In [5]:
%timeit spike_read_len(vcf_fn, buffer_size=1000)

10 loops, best of 3: 50 ms per loop


In [6]:
%timeit spike_read_len(vcf_fn, buffer_size=2**15)

10 loops, best of 3: 39 ms per loop


In [7]:
%timeit spike_read_len(vcf_fn, buffer_size=2**12)

10 loops, best of 3: 45.3 ms per loop


In [8]:
import cProfile

In [9]:
cProfile.run('spike_read_len(vcf_fn, buffer_size=2**15)', sort='time')

         6146762 function calls (6146566 primitive calls) in 0.941 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.584    0.584    0.941    0.941 io_vcf.pyx:90(spike_read_len)
  6140662    0.322    0.000    0.356    0.000 io_vcf.pyx:74(BufferedInputStream_next)
      189    0.019    0.000    0.019    0.000 {method 'decompress' of 'zlib.Decompress' objects}
      285    0.008    0.000    0.008    0.000 {built-in method zlib.crc32}
  385/189    0.001    0.000    0.033    0.000 {method 'read' of '_io.BufferedReader' objects}
      190    0.001    0.000    0.031    0.000 gzip.py:436(read)
      190    0.001    0.000    0.033    0.000 _compression.py:66(readinto)
      761    0.001    0.000    0.001    0.000 gzip.py:80(read)
      189    0.001    0.000    0.034    0.000 gzip.py:269(read)
      189    0.000    0.000    0.035    0.000 io_vcf.pyx:56(BufferedInputStream_fill_buffer)
       97    0.000    0.000    0.00

In [10]:
import line_profiler

# Line-by-line timing of spike_read_len together with the two
# BufferedInputStream helpers that appear in the cProfile listing above
# (all three are reported there under io_vcf.pyx).
# NOTE(review): spike_read_len, the BufferedInputStream_* functions and vcf_fn
# are not defined in the visible part of this notebook - presumably they come
# from the legacy io_vcf module and an earlier setup cell; confirm before re-running.
l = line_profiler.LineProfiler()
l.add_function(spike_read_len)
l.add_function(BufferedInputStream_next)
l.add_function(BufferedInputStream_fill_buffer)
# Uses a 2**14 buffer here, whereas the cProfile run above used 2**15.
l.runcall(spike_read_len, vcf_fn, buffer_size=2**14)
l.print_stats()

## Legacy

In [None]:
l = line_profiler.CLineProfiler

In [3]:
2**15

32768

In [None]:
blocks = io_vcf.vcf_block_read(vcf_fn, buffer_size=2**16, block_size=1000)

HeaderParser_parse 37873392 35
20
b'##fileformat=VCFv4.1'
HeaderParser_parse 37873413 35
60
b'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'
HeaderParser_parse 37873474 35
124
b'##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele countin genotypes, for each ALT allele, in the same order aslisted">'
HeaderParser_parse 37873599 35
32
b'##contig=<ID=2L,length=49364325>'
HeaderParser_parse 37873632 35
32
b'##contig=<ID=2R,length=61545105>'
HeaderParser_parse 37873665 35
32
b'##contig=<ID=3L,length=41963435>'
HeaderParser_parse 37873698 35
32
b'##contig=<ID=3R,length=53200684>'
HeaderParser_parse 37873731 35
34
b'##contig=<ID=UNKN,length=42389979>'
HeaderParser_parse 37873766 35
31
b'##contig=<ID=X,length=24393108>'
HeaderParser_parse 37873798 35
38
b'##contig=<ID=Y_unplaced,length=237045>'
HeaderParser_parse 37873837 35
106
b'##reference=file:///data/anopheles/ag1000g/data/genome/AgamP3/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa'
HeaderParser_parse 37873944 35
7002
