In [1]:
#!wget -c ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.3L.h5


In [2]:
from math import ceil

import numpy as np

import h5py

import dask.array as da

  from ._conv import register_converters as _register_converters


In [3]:
h5_3L = h5py.File('ag1000g.phase1.ar3.pass.3L.h5')
samples = h5_3L['/3L/samples']
calldata_genotype = h5_3L['/3L/calldata/genotype']
positions = h5_3L['/3L/variants/POS']

In [4]:
calldata_genotype

<HDF5 dataset "genotype": shape (9643193, 765, 2), type "|i1">

In [5]:
samples

<HDF5 dataset "samples": shape (765,), type "|S8">

In [6]:
last_position = positions[-1]

In [7]:
window_size = 50000

In [8]:
num_windows = ceil(last_position / window_size)

In [9]:
limits = np.full((num_windows, 2), -1)

In [10]:
curr_window = positions[0] // window_size
limits[curr_window, 0] = 0

In [25]:
for index, position in enumerate(positions):
    my_window = position // window_size
    if index % 1000000 == 0:
        print(index, position)
    if my_window != curr_window:
        limits[my_window, 0] = index
        limits[curr_window, 1] = index - 1
        curr_window = my_window
limits[num_windows - 1, 1] = len(positions)

0 9790
1000000 11842226
2000000 16486667
3000000 19736472
4000000 23745887
5000000 27394730
6000000 30373865
7000000 33609094
8000000 36894938
9000000 40144443


In [17]:
limits[0], limits[-1]

(array([ 0, 43]), array([     -1, 9643193]))

In [24]:
positions[-1] // window_size, num_windows

(839, 840)

In [51]:
def calc_statistics(v):
    print(v)
    return v[1] - v[0] + 1

In [52]:
calc_statistics_v = np.vectorize(calc_statistics, signature='(m)->()')

In [53]:
%time calc_statistics_v(limits)

[ 0 43]
[ 44 965]
[ 966 1912]
[1913 3420]
[3421 3436]
[3437 3803]
[3804 5038]
[5039 6608]
[6609 6801]
[6802 7056]
[7057 7527]
[-1 -1]
[7528 8135]
[8136 8979]
[ 8980 10035]
[10036 11654]
[11655 13523]
[13524 15684]
[15685 16197]
[16198 16987]
[16988 17733]
[17734 17855]
[17856 19218]
[19219 19778]
[19779 20896]
[20897 23642]
[23643 24606]
[24607 25212]
[-1 -1]
[25213 26007]
[26008 26564]
[26565 29529]
[29530 32647]
[32648 33988]
[33989 36228]
[36229 39103]
[39104 41131]
[41132 44302]
[44303 47250]
[47251 51721]
[51722 57082]
[57083 61307]
[61308 65457]
[65458 69848]
[69849 74369]
[74370 78494]
[78495 80187]
[80188 84162]
[84163 88148]
[88149 91890]
[91891 94756]
[94757 98578]
[ 98579 103286]
[103287 106658]
[106659 109187]
[109188 112293]
[112294 116933]
[116934 121643]
[121644 128315]
[128316 131181]
[131182 134910]
[134911 139026]
[139027 143012]
[143013 147393]
[147394 152103]
[152104 156018]
[156019 160019]
[160020 164125]
[164126 169844]
[169845 175005]
[175006 180016]
[180017 1855

array([   44,   922,   947,  1508,    16,   367,  1235,  1570,   193,
         255,   471,     1,   608,   844,  1056,  1619,  1869,  2161,
         513,   790,   746,   122,  1363,   560,  1118,  2746,   964,
         606,     1,   795,   557,  2965,  3118,  1341,  2240,  2875,
        2028,  3171,  2948,  4471,  5361,  4225,  4150,  4391,  4521,
        4125,  1693,  3975,  3986,  3742,  2866,  3822,  4708,  3372,
        2529,  3106,  4640,  4710,  6672,  2866,  3729,  4116,  3986,
        4381,  4710,  3915,  4001,  4106,  5719,  5161,  5011,  5495,
        4527,  4518,  4488,  3837,  4584,  4038,  5364,  2917,  3237,
        3837,  3691,  2930,  3083,   917,   368,   156,   705,    83,
           2,   112,   126,     1,    37,   223,  3992,  3508,  2452,
         405,  3070,  3722,  4343,  5154,  4086,  3949,  5600,  5617,
        5373,  5003,  4531,  3858,  5463,  5895,  5544,  4980,  5603,
        3133,  4859,  6090,  4047,  5047,  3224,  3732,  4010,  6464,
        3667,  4161,