In [1]:
#!wget -c ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/main/hdf5/ag1000g.phase1.ar3.pass.3L.h5


In [2]:
from multiprocessing.pool import Pool
from math import ceil

import numpy as np

import h5py

import dask
import dask.array as da
import dask.multiprocessing
dask.__version__

  from ._conv import register_converters as _register_converters


'0.19.2'

In [3]:
h5_3L = h5py.File('ag1000g.phase1.ar3.pass.3L.h5', 'r')
samples = h5_3L['/3L/samples']
#calldata_genotype = h5_3L['/3L/calldata/genotype']
positions = h5_3L['/3L/variants/POS']
#alt_alleles = h5_3L['/3L/variants/ALT']
#is_snp = h5_3L['/3L/variants/is_snp']
num_samples = len(samples)
del samples

In [4]:
last_position = positions[-1]

In [5]:
window_size = 50000

In [6]:
num_windows = ceil(last_position / window_size)

In [7]:
limits = np.full((num_windows, 2), -1)

In [8]:
curr_window = positions[0] // window_size
limits[curr_window][0] = 0

In [9]:
for index, position in enumerate(positions):
    my_window = position // window_size
    if index % 1000000 == 0:
        print(index, position)
    if my_window != curr_window:
        limits[my_window, 0] = index
        limits[curr_window, 1] = index - 1
        curr_window = my_window
limits[num_windows - 1, 1] = len(positions)

0 9790
1000000 11842226
2000000 16486667
3000000 19736472
4000000 23745887
5000000 27394730
6000000 30373865
7000000 33609094
8000000 36894938
9000000 40144443


In [10]:
limits = da.from_array(limits, chunks=(60, 2))

In [11]:
limits[0], limits[-1]

(dask.array<getitem, shape=(2,), dtype=int64, chunksize=(2,)>,
 dask.array<getitem, shape=(2,), dtype=int64, chunksize=(2,)>)

In [12]:
positions[-1] // window_size, num_windows

(839, 840)

In [13]:
h5_3L.close()
del h5_3L, positions

In [None]:
def get_hdf5():
    global calldata_genotype, alt_alleles, is_snp, num_samples
    
    try:
        calldata_genotype
    except NameError:
        import os
        print('Open', os.getpid())
        h5_3L = h5py.File('ag1000g.phase1.ar3.pass.3L.h5', 'r')
        samples = h5_3L['/3L/samples']
        calldata_genotype = h5_3L['/3L/calldata/genotype']
        alt_alleles = h5_3L['/3L/variants/ALT']
        is_snp = h5_3L['/3L/variants/is_snp']
    
    return calldata_genotype, is_snp, alt_alleles, num_samples
    
@da.as_gufunc(signature="(i)->()", output_dtypes=dict, vectorize=True)
def calc_statistics(v):
    calldata_genotype, is_snp, alt_alleles, num_samples = get_hdf5()
    print(v)
    start, end = v[0], v[1]
    min_maf = 0.5
    max_maf = 0.0
    non_bi = 0
    non_snp = 0
    #num_samples = len(samples)
    for pos in range(start, end + 1):
        if not is_snp[pos]:
            non_snp += 1
            continue
        if len(set(alt_alleles[pos])) > 2:  #There is empty
               non_bi += 1
               continue
        num_alt = np.sum(calldata_genotype[pos])  # Because they are coded as 1
        num_ref = num_samples * 2 - num_alt  # (because all are called)
        min_called = min(num_ref, num_alt)
        maf = min_called / (2 * num_samples)
        if maf < min_maf:
            min_maf = maf
        if maf > max_maf:
            max_maf = maf
    return {'total': end - start + 1,
            'non_snp': non_snp, 'non_bi': non_bi,
            'min_maf': min_maf, 'max_maf': max_maf}


In [19]:
#Check persist
#stats = calc_statistics_v(limits[:100])
stats = None
with dask.config.set(scheduler='multiprocessing'):
    stats = calc_statistics(limits[:10]).compute()
#dask.config.set(scheduler='synchronous')
#dask.config.set(scheduler='threads')


Open 7859
999 [ 0 43]
[ 0 43]
999 [ 44 965]
[ 44 965]
999 [ 966 1912]
[ 966 1912]
999 [1913 3420]
[1913 3420]
999 [3421 3436]
[3421 3436]
999 [3437 3803]
[3437 3803]
999 [3804 5038]
[3804 5038]
999 [5039 6608]
[5039 6608]
999 [6609 6801]
[6609 6801]
999 [6802 7056]
[6802 7056]


In [None]:
limits[0,0].compute()

In [None]:
limits[0]

In [43]:
#persist

In [20]:
stats

array([{'total': 44, 'non_snp': 0, 'non_bi': 4, 'min_maf': -0.00065359477124183, 'max_maf': 0.44575163398692813},
       {'total': 922, 'non_snp': 0, 'non_bi': 32, 'min_maf': -0.00196078431372549, 'max_maf': 0.4869281045751634},
       {'total': 947, 'non_snp': 0, 'non_bi': 37, 'min_maf': -0.00065359477124183, 'max_maf': 0.4516339869281046},
       {'total': 1508, 'non_snp': 0, 'non_bi': 49, 'min_maf': -0.00065359477124183, 'max_maf': 0.39477124183006534},
       {'total': 16, 'non_snp': 0, 'non_bi': 2, 'min_maf': 0.00065359477124183, 'max_maf': 0.3235294117647059},
       {'total': 367, 'non_snp': 0, 'non_bi': 10, 'min_maf': -0.01241830065359477, 'max_maf': 0.4503267973856209},
       {'total': 1235, 'non_snp': 0, 'non_bi': 54, 'min_maf': -0.00196078431372549, 'max_maf': 0.3954248366013072},
       {'total': 1570, 'non_snp': 0, 'non_bi': 68, 'min_maf': -0.00065359477124183, 'max_maf': 0.3934640522875817},
       {'total': 193, 'non_snp': 0, 'non_bi': 4, 'min_maf': 0.00065359477124183,