In [7]:
import h5py
import numpy as np

def get_individual_idx(f, iid="", f_col="samples"):
        """Return index of individual iid"""
        samples = f[f_col].asstr()[:]
        idx = (samples == iid)
        if np.sum(idx)!=1:
            raise RuntimeWarning(f"{np.sum(idx)} entries found for {iid}")
        assert(np.sum(idx)>0) # Sanity Check
        idx=np.where(idx)[0][0]
        return idx  

def computeHetRate4TransitionSNPs(path2hdf5, sampleID):
    f = h5py.File(path2hdf5)
    idx = get_individual_idx(f, iid=sampleID)
    gt = np.sum(f["calldata/GT"][:,idx,:], axis=1)
    ref, alt = f['variants/REF'].asstr()[:], f['variants/ALT'].asstr()[:]
    CT = np.logical_or(np.logical_and(ref == 'C', alt == 'T'), np.logical_and(ref == 'T', alt == 'C'))
    GA = np.logical_or(np.logical_and(ref == 'G', alt == 'A'), np.logical_and(ref == 'A', alt == 'G'))
    transition = np.logical_or(CT, GA)
    gt_transition = gt[transition]
    gt_transversion = gt[~transition]
    #print(f'number of transition site: {len(gt_transition)}')
    #print(f'number of transversion site: {len(gt_transversion)}')
    return np.sum(gt_transition == 1)/len(gt_transition), np.sum(gt_transversion == 1)/len(gt_transversion)

def computeHetsOneBatch(basepath, sampleID):
    hets_transition = []
    hets_transversion = []
    for b in np.arange(1,51):
        het1, het2 = computeHetRate4TransitionSNPs(f'{basepath}/batch{b}/processed_1KG_MAF5/ch3.h5', sampleID)
        hets_transition.append(het1)
        hets_transversion.append(het2)
    return hets_transition, hets_transversion

In [8]:
covs = ['cov5', 'cov2', 'cov1', 'cov3over4', 'cov1over2', 'cov1over4', 'cov1over10']
sampleID = 'I5279'


for cov in covs:
    hets_transition, hets_transversion = computeHetsOneBatch(f'/mnt/archgen/users/yilei/IBDsim/downsample/callIBD/wgs/{cov}', sampleID)
    print(f'{cov}: {np.mean(hets_transition)}, {np.mean(hets_transversion)}')

for cov in covs[1:5]:
    hets = computeHetsOneBatch(f'/mnt/archgen/users/yilei/IBDsim/downsample/callIBD/1240k/{cov}', sampleID)
    print(f'{cov}: {np.mean(hets_transition)}, {np.mean(hets_transversion)}')


cov5: 0.2850984281323389, 0.27711847398339107
cov2: 0.2817506668802018, 0.27631901222911187
cov1: 0.27803894944438834, 0.27402974585207013
cov3over4: 0.2761233155652371, 0.2725157864499784
cov1over2: 0.273185393632403, 0.26978129822402364
cov1over4: 0.26534991573371297, 0.26252431493788325
cov1over10: 0.24336701002750558, 0.24154925561731277
cov2: 0.24336701002750558, 0.24154925561731277
cov1: 0.24336701002750558, 0.24154925561731277
cov3over4: 0.24336701002750558, 0.24154925561731277
cov1over2: 0.24336701002750558, 0.24154925561731277
