In [1]:
!cat ../run_ukb_hap_bgen_to_hdf5.py

import argparse
parser = argparse.ArgumentParser(prog='run_ukb_hap_bgen_to_hdf5.py', description='''
    Convert BGEN into HDF5. 
    Work with ukb_hap_v2 BGEN.
''')

parser.add_argument('--bgen', help='''
    BGEN file path
''')
parser.add_argument('--bgi', help='''
    BGEN BGI file path
''')
parser.add_argument('--sample', help='''
    BGEN SAMPLE file path
''')
parser.add_argument('--output-hdf5', help='''
    HDF5 file name of output (if not exists, it will be created)
''')
parser.add_argument('--snp-chunk-size', type=int, default=100, help='''
    Number of SNPs to process at a time
''')
parser.add_argument('--bgen-writing-cache-size', type=int, default=50, help='''
    BGEN reading cache size in MB. (It should be set carefully: 
    pre-factor x nvariant x size of dtype x snp chunk size / 1024 ** 2
    where pre-factor ~ 10)
''')
parser.add_argument('--max-sample-chunk-size', type=int, default=10000, help='''
    Maximum size of chunk on sample axis.


In [2]:
import sys
sys.path.insert(0, '../../prs/')
import ukb_hap_reader
import pandas as pd
import numpy as np
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
# Enable rpy2's pandas conversion layer before any R call is made.
# NOTE(review): query() builds a pandas DataFrame and passes it to an R function,
# which presumably relies on this activation - confirm against the installed rpy2 version.
pandas2ri.activate()
# Handle to the R package 'rbgen'; query() calls rbgen.bgen_load through it.
rbgen = importr('rbgen')

In [3]:
# Chromosome number interpolated into the BGEN and BGI file names below.
CHR = 16
# Input file locations.
# NOTE(review): these are absolute, machine-specific paths; the notebook can only
# be re-run where /vol/bmd/meliao/data/haplotype is mounted.
BGEN = f'/vol/bmd/meliao/data/haplotype/hap/ukb_hap_chr{CHR}_v2.bgen'
BGI = f'/vol/bmd/meliao/data/haplotype/hap_bgi/ukb_hap_chr{CHR}_v2.bgen.bgi'
SAMPLE = '/vol/bmd/meliao/data/haplotype/link_files/ukb1952_v2_s487398.sample'
# Reader from the project-local ukb_hap_reader module. In this notebook it is
# only used for its `variant_index` mapping, whose keys are parsed for variant
# positions and alleles.
reader = ukb_hap_reader.UKBhapReader(
    bgen_path=BGEN,
    bgen_bgi_path=BGI,
    sample_path=SAMPLE
)

In [4]:
# Number of leading variants used for this spot check.
nsnp_ = 27
# Keep only the first nsnp_ variant IDs and split each of them exactly once.
# (Previously every ID in the index was split four separate times and the
# resulting full-length lists were sliced afterwards.)
# IDs are ':'-separated: field 1 is read as the position, field 2 as the
# non-effect allele and field 3 as the effect allele.
_id_fields = [key.split(':') for key in list(reader.variant_index.keys())[:nsnp_]]
pos = [int(fields[1]) for fields in _id_fields]
non_effect_allele = [fields[2] for fields in _id_fields]
effect_allele = [fields[3] for fields in _id_fields]
# One empty chromosome label per selected variant, matching the empty
# 'chromosome' value that query() uses when building its range.
chrom = ['' for _ in _id_fields]

In [5]:
def query(pos):
    """Load the single variant located at position ``pos`` from the BGEN file.

    A one-row range table (empty chromosome, start == end == int(pos)) is passed
    to ``rbgen.bgen_load`` together with the module-level ``BGEN`` and ``BGI``
    paths.

    Parameters
    ----------
    pos : value convertible with ``int()``
        Position to query.

    Returns
    -------
    tuple
        ``(variants, probs)``: elements 0 and 4 of the ``bgen_load`` result,
        each converted with ``pandas2ri.ri2py``.

    Raises
    ------
    ValueError
        If the converted variant table does not have exactly one row.
    """
    position = int(pos)
    ranges_df = pd.DataFrame({
        'chromosome': [''],
        'start': [position],
        'end': [position],
    })
    loaded = rbgen.bgen_load(
        BGEN,
        index_filename=BGI,
        ranges=ranges_df,
        max_entries_per_sample=4,
    )
    variants = pandas2ri.ri2py(loaded[0])
    n_variants = variants.shape[0]
    if n_variants != 1:
        raise ValueError('Extract no or more than 1 variant. Cannot handle.')
    return variants, pandas2ri.ri2py(loaded[4])
def get_haplotype(probs):
    """Return columns 1 and 3 of a 2-D ``probs`` array as a pair.

    NOTE(review): with ``max_entries_per_sample=4`` in query(), these columns
    presumably hold the second-allele probability of the first and second
    haplotype respectively - confirm against the rbgen documentation.
    """
    second_column = probs[:, 1]
    fourth_column = probs[:, 3]
    return second_column, fourth_column

In [6]:
# Query the first variant once up front: the second axis of its probability
# array gives the number of samples, and the same result is reused for row 0
# of the loop instead of fetching that variant a second time.
_, p = query(pos[0])
nsample = p.shape[1]
# out_mat[h, i, :] holds the value taken for haplotype h of variant i across
# all samples. The dtype is int, so assigned values are stored as integers.
out_mat = np.zeros((2, nsnp_, nsample), dtype=int)
for i, pp in enumerate(pos):
    print(f'Processing {i}')
    if i == 0:
        probs = p
    else:
        _, probs = query(pp)
    # query() guarantees exactly one variant, so index 0 on the first axis
    # selects that variant's 2-D slice.
    h1, h2 = get_haplotype(probs[0, :, :])
    out_mat[0, i, :] = h1
    out_mat[1, i, :] = h2

  res = PandasDataFrame.from_items(items)


Processing 0
Processing 1
Processing 2
Processing 3
Processing 4
Processing 5
Processing 6
Processing 7
Processing 8
Processing 9
Processing 10
Processing 11
Processing 12
Processing 13
Processing 14
Processing 15
Processing 16
Processing 17
Processing 18
Processing 19
Processing 20
Processing 21
Processing 22
Processing 23
Processing 24
Processing 25
Processing 26


In [7]:
import h5py
# Read both datasets of the converter output fully into memory; they are
# compared against the values recomputed in this notebook.
with h5py.File('test_out.h5', mode='r') as h5_file:
    pos_ = h5_file['position'][:]
    genotype = h5_file['genotype'][:]

In [8]:
# Shape of the 'genotype' dataset read from test_out.h5.
genotype.shape

(2, 27, 487409)

In [9]:
# Shape of the matrix rebuilt through rbgen; expected to equal genotype.shape.
out_mat.shape

(2, 27, 487409)

In [10]:
# Positions as stored in the 'position' dataset of test_out.h5.
pos_

array([b'85629', b'85667', b'89659', b'92224', b'92370', b'92391',
       b'92688', b'97610', b'101277', b'105325', b'106596', b'107211',
       b'111247', b'112593', b'114123', b'114535', b'115072', b'118809',
       b'119006', b'123220', b'125462', b'126588', b'128054', b'129025',
       b'129223', b'131565', b'133946'], dtype=object)

In [11]:
# Positions parsed from the reader's variant IDs, for comparison with pos_.
pos

[85629,
 85667,
 89659,
 92224,
 92370,
 92391,
 92688,
 97610,
 101277,
 105325,
 106596,
 107211,
 111247,
 112593,
 114123,
 114535,
 115072,
 118809,
 119006,
 123220,
 125462,
 126588,
 128054,
 129025,
 129223,
 131565,
 133946]

In [12]:
# The haplotype matrix rebuilt through rbgen must match the HDF5 contents exactly.
np.testing.assert_array_equal(out_mat, genotype)
# Also assert the positions instead of comparing the two printed arrays by eye.
# The HDF5 file stores them as byte strings, so convert each to int first.
np.testing.assert_array_equal([int(stored) for stored in pos_], pos)