In [3]:
import math
import numpy as np

def extract_snps_hdf5(h5, ids_ref, markers, diploid=False):
        """Extract genotypes from h5 on ids and markers.
        If diploid, concatenate haplotypes along 0 axis.
        Extract indivuals first, and then subset to SNPs.
        Return 2D array [# haplotypes, # markers]"""
        # Important: Swap of Dimensions [loci<->individuals]
        nblocks = math.ceil(len(markers)/8)
        nsample = len(ids_ref)
        ploidy = 2 if diploid else 1
        gts = np.zeros((ploidy*nsample, nblocks), dtype=np.uint8)
        haplotype_id_with_missing_data = set() # maintain a list of haplotype ids with missing data at any site of interest
        for i in range(nblocks):
            j = min((i+1)*8, len(markers))
            raw_gt = h5["calldata/GT"][markers[i*8:j], :, :ploidy] # can only indexing one dimension at a time, so need to split this into two lines of code
            raw_gt = raw_gt[:, ids_ref, :].reshape((-1, ploidy*nsample)).T
            haplotype_id_with_missing_data.update(np.where(raw_gt == -1)[0])
            gts[:, i] = np.packbits(raw_gt, axis=1).flatten()

        # get rid of haplotypes with missing data
        # eg. male samples only have one chrX
        gts = gts[np.setdiff1d(np.arange(ploidy*nsample), list(haplotype_id_with_missing_data)), :]
        return gts, markers%8

In [3]:
import h5py
import numpy as np

path2h5 = '/mnt/archgen/users/yilei/Data/1000G/1000g1240khdf5/all1240/maf5_auto/maf5_chr1.hdf5'
f = h5py.File(path2h5, 'r')
nloci, nsample, ploidy = f['calldata/GT'].shape
print(f['calldata/GT'].shape)
ids_ref = np.array([5,10])
markers = np.sort(np.random.choice(nloci, 11, replace=False))

gts_compact = extract_snps_hdf5(f, ids_ref, markers, diploid=True)
print(f['calldata/GT'].shape)
gts_raw = f['calldata/GT'][markers, :, :]
gts_raw = gts_raw[:, ids_ref, :].reshape((-1, 4)).T
print(gts_compact)
print(gts_raw)
print(np.packbits(gts_raw, axis=1))

(530434, 2504, 2)
(530434, 2504, 2)
(array([[ 56,   0],
       [176,   0],
       [ 24,   0],
       [220,   0]], dtype=uint8), array([5, 3, 7, 2, 5, 7, 7, 2, 5, 4, 0]))
[[0 0 1 1 1 0 0 0 0 0 0]
 [1 0 1 1 0 0 0 0 0 0 0]
 [0 0 0 1 1 0 0 0 0 0 0]
 [1 1 0 1 1 1 0 0 0 0 0]]
[[ 56   0]
 [176   0]
 [ 24   0]
 [220   0]]


In [2]:
def get_ith_bit(number, i):
    # Shift the number i positions to the right and perform a bitwise AND with 1
    # This will isolate the i-th bit
    return (number >> i) & 1

# Example usage:
binary_number = 0b101011   # Example binary number
i = 3                       # Index of the bit to extract (0-based indexing)
result = get_ith_bit(binary_number, i)
print(f"The {i}-th bit of {bin(binary_number)} is: {result}")


The 3-th bit of 0b101011 is: 1


# check my bit-wise operation for allele frequency calculation

In [None]:
import h5py
import numpy as np

path2h5 = '/mnt/archgen/users/yilei/Data/1000G/1000g1240khdf5/all1240/maf5_auto/maf5_chr18.hdf5'
f = h5py.File(path2h5, 'r')
nloci, nsample, ploidy = f['calldata/GT'].shape
print(f['calldata/GT'].shape)
ids_ref = np.arange(nsample)
markers = np.arange(nloci)

blocksize = 8
gts_compact, overhang = extract_snps_hdf5(f, ids_ref, markers, diploid=True)


In [17]:
def allele_freq_per_block(gts_one_block, blocksize):
    """Calculate allele frequency per block.
    Return 1D array of length blocksize"""
    freq = np.zeros(blocksize)
    for i in range(blocksize):
        freq[i] = np.mean(gts_one_block >> (blocksize -1 - i) & 1)
    return freq

allele_freq_per_block(gts_compact[:, -1], blocksize)

array([0.21345847, 0.09704473, 0.10023962, 0.19968051, 0.24001597,
       0.20507188, 0.06908946, 0.12679712])

In [29]:
import time

allelefreq = np.zeros(nloci)
print(gts_compact.shape)
t1 = time.time()
for i in range(nloci):
    index = i//blocksize
    offset = i%blocksize
    allelefreq[i] = np.sum(gts_compact[:, index] >> (blocksize - 1 - offset) & 1)/gts_compact.shape[0]
print(f'for loop takes {time.time()-t1} seconds')

gts = f['calldata/GT'][:, :, :]
gts = gts.reshape((nloci, nsample*2)).T
t1 = time.time()
allelefreq_normal_calculation = np.mean(gts, axis=0)
print(f'normal calculation takes {time.time()-t1} seconds')

# blockwise computation
t1 = time.time()
allelefreq_blockwise = np.concatenate(np.apply_along_axis(allele_freq_per_block, 0, gts_compact, blocksize).T, axis=0)
print(f'blockwise calculation takes {time.time()-t1} seconds')

print(allelefreq)
print(allelefreq_normal_calculation)
print(allelefreq_blockwise)

(5008, 12152)
for loop takes 1.5874099731445312 seconds
normal calculation takes 0.30373620986938477 seconds
blockwise calculation takes 1.7471349239349365 seconds
[0.11242013 0.14117412 0.09384984 ... 0.20507188 0.06908946 0.12679712]
[0.11242013 0.14117412 0.09384984 ... 0.20507188 0.06908946 0.12679712]
[0.11242013 0.14117412 0.09384984 ... 0.20507188 0.06908946 0.12679712]


In [6]:
gts = f['calldata/GT'][:, :, :]
gts = gts.reshape((nloci, nsample*2)).T
allelefreq_normal_calculation = np.mean(gts, axis=0)
print(allelefreq_normal_calculation)

[0.51517572 0.22364217 0.07268371 ... 0.11980831 0.12000799 0.16972843]


In [18]:
print(np.isclose(allelefreq - allelefreq_normal_calculation, 0, atol=1e-6).all())

True


In [41]:
import sys
print(sys.getsizeof(gts_compact))
print(sys.getsizeof(gts))
gts_bool = gts.astype(bool)
print(sys.getsizeof(gts_bool))

60857344
128
486857856


In [40]:
ploidy=2
raw_gt = f["calldata/GT"][np.arange(nloci), :, :ploidy]
raw_gt = raw_gt[:, np.arange(nsample), :].reshape((-1, ploidy*nsample)).T
print(sys.getsizeof(raw_gt))
print(raw_gt)

128
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 1 0 0]
 [1 0 1 ... 1 0 0]
 ...
 [1 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
import numpy as np
from scipy.stats import binom

DTYPE = np.float64
e_rate_ref = 0.001
e_rate_read = 0.01
rc = np.random.randint(0, 100, (2, 1000))
print(rc)

p_derived_read = np.empty(3, dtype=DTYPE)
p_derived_read[0] = e_rate_read
p_derived_read[1] = 0.5
p_derived_read[2] = 1 - e_rate_read

### precompute one component of the emission probability for the ROH state
# aka, the binomial pmf at each marker for each of the three possible underlying genotype 00,01,11
genotype_prob = np.empty((2, 3), dtype=DTYPE)
genotype_prob[0,0] = 1-e_rate_ref
genotype_prob[0,1] = e_rate_ref/2
genotype_prob[0,2] = e_rate_ref/2
genotype_prob[1,0] = e_rate_ref/2
genotype_prob[1,1] = e_rate_ref/2
genotype_prob[1,2] = 1-e_rate_ref
binom_pmf = binom.pmf(rc[1, :], rc[0, :] + rc[1, :], p_derived_read[:, None])
print(binom_pmf.shape)
print(p_derived_read[:, None])

[[31 46 37 ... 23 68 20]
 [62 37 54 ... 31 19 76]]
(3, 1000)
[[0.01]
 [0.5 ]
 [0.99]]


# test directly loading binary genotype array from hdf5 file

In [7]:
import math
import numpy as np
import time


def extract_snps_hdf5(h5, ids_ref, markers, diploid=False):
        """Extract genotypes from h5 on ids and markers.
        If diploid, concatenate haplotypes along 0 axis.
        Extract indivuals first, and then subset to SNPs.
        Return 2D array [# haplotypes, # markers]"""
        # Important: Swap of Dimensions [loci<->individuals]
        nblocks = math.ceil(len(markers)/8)
        nsample = len(ids_ref)
        ploidy = 2 if diploid else 1
        gts = np.zeros((ploidy*nsample, nblocks), dtype=np.uint8)
        t1 = time.time()
        for i in range(nblocks):
            j = min((i+1)*8, len(markers))
            raw_gt = h5["calldata/GT"][markers[i*8:j], :, :ploidy] # can only indexing one dimension at a time, so need to split this into two lines of code
            raw_gt = raw_gt[:, ids_ref, :].reshape((-1, ploidy*nsample)).T
            gts[:, i] = np.packbits(raw_gt, axis=1).flatten()
        print(f'extracting genotypes from hdf5 calldata/GT took {time.time()-t1:.2f} seconds')

        time1 = time.time()
        raw_gt = h5["calldata/GTbinary"][:, ids_ref, :ploidy]
        indices = markers // 8
        offset = markers % 8
        # get the offset bit from the uint8 integer at indices
        gts2 = np.packbits((raw_gt[indices, :, :] >> (7 - offset[:, None, None])) & 1, axis=0)
        gts2 = gts2.reshape((-1, ploidy*nsample)).T
        print(f'extracting genotypes from hdf5 calldata/GTbinary took {time.time()-time1:.2f} seconds')
        print(f'gts and gts2 are equal: {np.array_equal(gts, gts2)}')

        return gts, markers%8

In [8]:
import h5py
import numpy as np

path2h5 = '/mnt/archgen/users/yilei/Data/1000G/1000g1240khdf5/all1240/maf5_auto/binary/maf5_chr20_binary.hdf5'
f = h5py.File(path2h5, 'r')
nloci, nsample, ploidy = f['calldata/GT'].shape
print(f['calldata/GT'].shape)
ids_ref = np.sort(np.random.choice(nsample, 100, replace=False))
markers = np.sort(np.random.choice(nloci, 207, replace=False))

gts_compact, overhang = extract_snps_hdf5(f, ids_ref, markers, diploid=True)


(150277, 2504, 2)
extracting genotypes from hdf5 calldata/GT took 2.04 seconds
extracting genotypes from hdf5 calldata/GTbinary took 0.06 seconds
gts and gts2 are equal: True
