Note:

1. Run `bash test_inputs/test_run.sh` from the parent directory (`../`).
2. Then repeat the same PRS calculation in plain Python code (with shortcuts) and check that both results agree.

In [1]:
# Delete a leftover test_out.h5 (if one exists) before running the test script.
! bash -c "if [[ -f test_out.h5 ]]; then rm test_out.h5; fi"
# Run the test script from the parent directory; its log is shown below.
! cd ../; bash test_inputs/test_run.sh

2020-04-24 10:06:37 PM  Loading GWAS
2020-04-24 10:06:38 PM  gwas_reader: processing 20002_1262.gwas.imputed_v3.both_sexes, 1/2
2020-04-24 10:06:38 PM  gwas_reader: processing 20002_1262.gwas.imputed_v3.male, 2/2
2020-04-24 10:06:39 PM  Generating variant list
2020-04-24 10:06:39 PM  build_var_df: processing 20002_1262.gwas.imputed_v3.both_sexes
2020-04-24 10:06:39 PM  build_var_df: processing 20002_1262.gwas.imputed_v3.male
2020-04-24 10:06:39 PM  Build BGEN reader
2020-04-24 10:06:39 PM  Initialize PRS matrix
2020-04-24 10:06:39 PM  Update PRS
  res = PandasDataFrame.from_items(items)
100%|█████████████████████████████████████████████| 2/2 [00:02<00:00,  1.00s/it]
2020-04-24 10:06:41 PM  Save PRS
2020-04-24 10:07:48 PM  2020-04-24 22:07:48.030905 PRS file complete!


In [2]:
import pandas as pd
import numpy as np
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
# Activate rpy2's pandas conversion support (pandas2ri is used below to convert R results).
pandas2ri.activate()
# Load the R package `rbgen`; its bgen_load function is used below to read BGEN data.
rbgen = importr('rbgen')

In [3]:
# Chromosome whose haplotype files are queried.
CHR = 16
# P-value thresholds; the PRS is accumulated once per threshold.
pvals = [1e-5, 0.001, 0.1, 1]
# BGEN file and its .bgi index for the chromosome above.
bgen_path = f'/vol/bmd/meliao/data/haplotype/hap/ukb_hap_chr{CHR}_v2.bgen'
bgi_path = f'/vol/bmd/meliao/data/haplotype/hap_bgi/ukb_hap_chr{CHR}_v2.bgen.bgi'

def query(pos):
    """Load the single variant found at position ``pos`` from the BGEN file.

    The lookup is a one-position range (``start == end == int(pos)``) passed to
    ``rbgen.bgen_load`` together with the module-level ``bgen_path`` and
    ``bgi_path``.

    Parameters
    ----------
    pos : value accepted by ``int()``
        Position of the variant to extract.

    Returns
    -------
    tuple
        ``(all_variants, all_probs)``: elements 0 and 4 of the ``bgen_load``
        result, each converted with ``pandas2ri.ri2py``.
        (Presumably the variant table and the probability array - confirm
        against the rbgen documentation.)

    Raises
    ------
    ValueError
        If the range does not match exactly one variant.
    """
    position = int(pos)
    # NOTE(review): the chromosome field is left blank - presumably the
    # per-chromosome file stores no chromosome label; confirm.
    region = pd.DataFrame({
        'chromosome': [''],
        'start': [position],
        'end': [position],
    })
    loaded = rbgen.bgen_load(
        bgen_path,
        index_filename=bgi_path,
        ranges=region,
        max_entries_per_sample=4
    )
    all_variants = pandas2ri.ri2py(loaded[0])
    n_found = all_variants.shape[0]
    if n_found != 1:
        raise ValueError('Extract no or more than 1 variant. Cannot handle.')
    return all_variants, pandas2ri.ri2py(loaded[4])

def compute_prs(gwas, clump, pval_thresholds=None):
    """Compute per-haplotype polygenic risk scores for the clumped variants.

    Parameters
    ----------
    gwas : str
        Path to a space-separated GWAS table with a header row. The columns
        ``variant``, ``beta`` and ``pval`` are read; ``variant`` is split on
        ``:`` and fields 1, 2 and 3 are used as position, non-effect allele
        and effect allele.
    clump : str
        Path to a header-less file whose first column lists the variant IDs
        to keep.
    pval_thresholds : sequence of float, optional
        P-value thresholds. Defaults to the module-level ``pvals``.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(n_samples, n_thresholds, 2)``; the last axis holds
        the two haplotype scores. ``None`` if no GWAS variant is in the clump
        list (the sample count is only known after the first genotype query).

    Raises
    ------
    ValueError
        If a variant's alleles match the genotype file in neither orientation
        (or if ``query`` does not find exactly one variant).
    """
    thresholds = pvals if pval_thresholds is None else pval_thresholds

    gwas_df = pd.read_csv(gwas, header=0, sep=' ')
    gwas_clump = pd.read_csv(clump, header=None)
    kept = gwas_df[gwas_df['variant'].isin(gwas_clump[0])]

    prs_mat = None
    for variant, beta, pval in zip(kept['variant'], kept['beta'], kept['pval']):
        # Split the variant ID once; fields 1-3 are position, non-effect
        # allele and effect allele.
        fields = variant.split(':')
        pos, nea, ea = fields[1], fields[2], fields[3]

        all_variants, all_probs = query(pos)
        allele0 = all_variants.allele0[0]
        allele1 = all_variants.allele1[0]
        if allele0 == nea and allele1 == ea:
            dosage = all_probs[0, :, :]
        elif allele1 == nea and allele0 == ea:
            # Alleles are swapped relative to the GWAS: complement the
            # probabilities so the columns read below refer to the effect allele.
            print('flip')
            dosage = 1 - all_probs[0, :, :]
        else:
            raise ValueError('Alleles does not match.')

        if prs_mat is None:
            # Allocated lazily because the number of samples comes from the data.
            prs_mat = np.zeros((dosage.shape[0], len(thresholds), 2))

        # Columns 1 and 3 are presumably the allele1 probabilities of the
        # first and second haplotype (4 entries per sample) - TODO confirm.
        # The weighted dosages do not depend on the threshold, so compute once.
        weighted_h1 = dosage[:, 1] * beta
        weighted_h2 = dosage[:, 3] * beta
        for k, threshold in enumerate(thresholds):
            # Strict inequality: a variant whose p-value equals the threshold
            # is excluded.
            if threshold > pval:
                prs_mat[:, k, 0] += weighted_h1
                prs_mat[:, k, 1] += weighted_h2
    return prs_mat

In [4]:
# Recompute the PRS in plain Python for each of the two test GWAS / clump file pairs.
prs1 = compute_prs('test_gwas1.txt', 'test_gwas1.clump')
prs2 = compute_prs('test_gwas2.txt', 'test_gwas2.clump')

  res = PandasDataFrame.from_items(items)


In [5]:
import h5py

# Read the datasets fully into memory; the context manager closes the file
# even if one of the reads raises.
with h5py.File('test_out.h5', 'r') as f:
    prs = f['prs'][:]
    traits = f['traits'][:]

# Select one entry along axis 1 (presumably the trait axis - confirm) and
# reverse the remaining three axes so the layout matches what compute_prs returns.
oprs1 = np.transpose(prs[:, 0, :, :], (2, 1, 0))
oprs2 = np.transpose(prs[:, 1, :, :], (2, 1, 0))


In [6]:
# Display the trait labels stored in the output file.
traits

array([b'20002_1262.gwas.imputed_v3.both_sexes',
       b'20002_1262.gwas.imputed_v3.male'], dtype=object)

In [7]:
# The PRS read from test_out.h5 must match the plain-Python recomputation to 6 decimals.
# NOTE(review): assumes index 0 / 1 on the file's axis 1 correspond to test_gwas1 / test_gwas2 - confirm.
np.testing.assert_almost_equal(oprs1, prs1, decimal=6)
np.testing.assert_almost_equal(oprs2, prs2, decimal=6)