In [None]:
!pip install icd10-cm bgen_reader pydantic

Collecting icd10-cm
[?25l  Downloading https://files.pythonhosted.org/packages/6b/a5/3059308d94513845e78d701b71a60c55ee4c37fab4b6442e4c58cdb70da1/icd10_cm-0.0.4-py2.py3-none-any.whl (675kB)
[K     |████████████████████████████████| 675kB 9.0MB/s 
[?25hCollecting bgen_reader
[?25l  Downloading https://files.pythonhosted.org/packages/a1/69/3643bab7ecc96f2b40b4120b316e95a51622a4c044c834937de5c1cad78a/bgen_reader-4.0.7-py3-none-any.whl (44kB)
[K     |████████████████████████████████| 51kB 5.5MB/s 
[?25hCollecting pydantic
[?25l  Downloading https://files.pythonhosted.org/packages/2b/a3/0ffdb6c63f45f10d19b8e8b32670b22ed089cafb29732f6bf8ce518821fb/pydantic-1.8.1-cp37-cp37m-manylinux2014_x86_64.whl (10.1MB)
[K     |████████████████████████████████| 10.1MB 17.8MB/s 
Collecting xarray>=0.16.0
[?25l  Downloading https://files.pythonhosted.org/packages/a5/19/debc1f470b8b9e2949da221663c8102ed6728f4d38dc964085ca43de1428/xarray-0.17.0-py3-none-any.whl (759kB)
[K     |██████████████████████

In [None]:
from typing import List, Union

import dask.dataframe as dd
import numpy as np
import pandas as pd
from bgen_reader import read_bgen, open_bgen
from dask.delayed import Delayed
from pydantic import BaseModel


In [None]:
#utils functions

def get_geno_one_snp(row,  high_lim=0.9, low_lim=0.3, NA_val = np.nan):
    """Turn one triple of genotype probabilities into a hard genotype call.

    ``row`` must unpack into exactly three values, taken in order as the
    probabilities of the homozygous-reference, heterozygous and
    homozygous-alternate genotypes.  A genotype is called only when its own
    probability is at least ``high_lim`` *and* both other probabilities are
    strictly below ``low_lim``.

    Returns the result of ``np.select``: 0.0 (hom-ref), 1.0 (het),
    2.0 (hom-alt), or ``NA_val`` when no genotype satisfies the rule
    (NaN probabilities never satisfy a comparison, so they fall through
    to ``NA_val`` as well).
    """
    p_ref, p_het, p_alt = row

    # "Clearly not this genotype" flags, shared between the three rules.
    ref_low = p_ref < low_lim
    het_low = p_het < low_lim
    alt_low = p_alt < low_lim

    # (dosage, condition) pairs; np.select takes the first condition that holds.
    rules = [
        (0., (p_ref >= high_lim) & het_low & alt_low),
        (1., (p_het >= high_lim) & ref_low & alt_low),
        (2., (p_alt >= high_lim) & ref_low & het_low),
    ]
    dosages, conditions = zip(*rules)
    return np.select(list(conditions), list(dosages), default=NA_val)


In [None]:
class BgenFileObject(BaseModel):
    """Bundle the objects ``bgen_reader`` produces for one BGEN file.

    In this notebook ``variants``, ``samples`` and ``genotype`` are filled from
    the mapping returned by ``read_bgen`` and ``bgen_reader_obj`` is the
    matching ``open_bgen`` handle, which the methods below use for look-ups
    (``rsids``, ``allele_ids``) and for reading probabilities (``read``).
    """

    # Variant metadata table (the ``variants`` entry of ``read_bgen``).
    variants: dd.DataFrame
    # Sample identifiers (dtype ``object`` in this notebook's output).
    samples: pd.Series
    # List of dask ``Delayed`` objects (the ``genotype`` entry of ``read_bgen``).
    genotype: List[Delayed]
    # Open reader handle used for every look-up and read below.
    bgen_reader_obj: open_bgen

    class Config:
        # The field types above are not pydantic-native, so allow them as-is.
        arbitrary_types_allowed = True

    def __repr__(self):
        """Return the class name followed by the number of samples."""
        return str(self.__class__) + f" {self.samples.shape[0]} samples"

    def get_variant_index(self, rsids=None):
        """Return the positions of ``rsids`` in ``bgen_reader_obj.rsids``.

        Positions are returned in ascending file order, not in the order of
        ``rsids``; ids that do not occur in the file are silently dropped.
        Returns ``None`` when ``rsids`` is ``None`` (meaning "no selection").
        """
        if rsids is None:
            return None
        return np.argwhere(np.isin(self.bgen_reader_obj.rsids, rsids)).reshape(-1,)

    def get_sample_index(self, sample_ids=None):
        """Return the positions of ``sample_ids`` in ``self.samples``.

        Same conventions as ``get_variant_index``: ascending positions,
        unknown ids dropped, ``None`` in -> ``None`` out.  Matching is done by
        ``np.isin`` on ``self.samples.values``, so ``sample_ids`` must use the
        same type as the stored ids (strings in this notebook).
        """
        if sample_ids is None:
            return None
        return np.argwhere(np.isin(self.samples.values, sample_ids)).reshape(-1,)

    def get_probs(self, sample_ids=None, rsids=None):
        """Read genotype probabilities for the selected samples and variants.

        ``None`` for either argument passes ``None`` to the reader for that
        axis.  Returns whatever ``bgen_reader_obj.read`` returns; in this
        notebook that is an array of shape (n_samples, n_variants, 3).
        NOTE(review): because the index helpers return ascending positions,
        rows/columns presumably follow file order rather than the order of
        the requested ids - confirm against the bgen_reader docs.
        """
        variant_index = self.get_variant_index(rsids)
        sample_index = self.get_sample_index(sample_ids)
        return self.bgen_reader_obj.read((sample_index, variant_index))

    def get_geno_each_sample(self, probs, prob_to_geno_func: str = "stringent", high_lim=0.9, low_lim=0.3, NA_val=np.nan):
        """Convert genotype probabilities into hard genotype calls.

        Parameters
        ----------
        probs : array-like whose axis 2 holds the three genotype probabilities
            (e.g. the (n_samples, n_variants, 3) array from ``get_probs``).
        prob_to_geno_func : ``"max"`` or ``"stringent"``
            ``"max"`` picks the most probable genotype.  ``"stringent"``
            applies ``get_geno_one_snp`` with ``high_lim`` / ``low_lim``.
        high_lim, low_lim : thresholds forwarded to ``get_geno_one_snp``
            (only used by ``"stringent"``).
        NA_val : value stored where no call can be made.

        Returns
        -------
        Float array with axis 2 removed, holding 0., 1., 2. or ``NA_val``.

        Raises
        ------
        ValueError
            If ``prob_to_geno_func`` is not one of the two supported names.
        """
        probs = np.asarray(probs)

        if prob_to_geno_func == "max":
            nan_mask = np.isnan(probs)
            # A slice that is NaN for all three genotypes has no "most
            # probable" genotype; np.nanargmax would raise on it.
            missing = nan_mask.all(axis=2)
            # -inf can never win the argmax unless the whole slice is NaN,
            # which is exactly the case patched up right below.
            geno = np.argmax(np.where(nan_mask, -np.inf, probs), axis=2).astype(float)
            if missing.any():
                geno[missing] = NA_val

        elif prob_to_geno_func == "stringent":
            # get_geno_one_snp unpacks its argument along the first axis and is
            # built from element-wise operations, so moving the probability
            # axis to the front evaluates every (sample, variant) pair in one
            # vectorised call instead of one Python call per pair.
            geno = get_geno_one_snp(np.moveaxis(probs, 2, 0), high_lim=high_lim, low_lim=low_lim, NA_val=NA_val)

        else:
            raise ValueError(
                f"prob_to_geno_func must be 'max' or 'stringent', got {prob_to_geno_func!r}"
            )

        return geno

    def get_allele_ids(self, rsids = None, variant_index = None):
        """Return the two alleles of each selected variant as a DataFrame.

        Selection is by ``variant_index`` when given, otherwise by ``rsids``;
        with neither, every variant in the file is returned.  Columns are
        ``allele_1`` and ``allele_2`` (each ``allele_ids`` entry is split on
        ","; entries that do not split into exactly two alleles make the
        DataFrame constructor fail).  When ``rsids`` is given, the rows are
        labelled with the rsids actually found at the selected positions, so
        labels stay correct even if ``rsids`` is not in file order or
        contains ids missing from the file.
        """
        if variant_index is None:
            variant_index = self.get_variant_index(rsids)
        if variant_index is None:
            # Indexing with None would insert a new axis rather than select
            # everything, so use an explicit full slice.
            variant_index = slice(None)

        allele_strs = self.bgen_reader_obj.allele_ids[variant_index]
        df = pd.DataFrame([allele_str.split(",") for allele_str in allele_strs], columns = ["allele_1", "allele_2"])

        if rsids is not None:
            # Label from the file, not from the caller's list: rows follow
            # file order and only include ids that were actually found.
            df.index = np.asarray(self.bgen_reader_obj.rsids[variant_index])
        return df

    def get_variant_combinations(self, rsids = None, variant_index = None):
        """Return the genotype combinations for each selected variant.

        Looks up the alleles with ``get_allele_ids`` and passes the resulting
        frame, together with the two allele column names, to
        ``get_possible_geno_combinations``.
        NOTE(review): ``get_possible_geno_combinations`` is not defined in the
        visible part of this notebook - confirm it is defined or imported
        before this method is called.
        """
        if variant_index is None:
            variant_index = self.get_variant_index(rsids)
        geno_df = self.get_allele_ids(rsids, variant_index)
        geno_df = get_possible_geno_combinations(geno_df, "allele_1", "allele_2")
        return geno_df

In [None]:
# On the first read of a BGEN file the metafile is created automatically,
# so it does not have to be passed as an argument here.
# Adjust both paths below to wherever the data lives in your own Drive.
filepath = "/content/drive/MyDrive/6.874 project/Data/ukb_imp_chr21_v3.bgen"
samples_filepath = "/content/drive/MyDrive/6.874 project/Data/ukb45624_imp_chr21_v3_s487275.sample"

# read_bgen supplies the variants / samples / genotype fields;
# open_bgen supplies the reader handle used by the BgenFileObject methods.
test = BgenFileObject(
    **read_bgen(filepath=filepath, samples_filepath=samples_filepath),
    bgen_reader_obj=open_bgen(filepath=filepath, samples_filepath=samples_filepath),
)


Sample IDs are read from /content/drive/MyDrive/6.874 project/Data/ukb45624_imp_chr21_v3_s487275.sample.


Mapping genotypes: 100%|██████████| 1261158/1261158 [01:33<00:00, 13546.64it/s]



Sample IDs are read from '/content/drive/MyDrive/6.874 project/Data/ukb45624_imp_chr21_v3_s487275.sample''.




In [None]:
test.samples

0         5542886
1         5137974
2         3758348
3         1391800
4         3165331
           ...   
487404    5512806
487405    5548469
487406    2956972
487407    5229561
487408    3665101
Name: id, Length: 487409, dtype: object

In [None]:
test.bgen_reader_obj.rsids

memmap(['rs559462325', 'rs181691356', 'rs548263598', ..., 'rs555877612',
        'rs574115117', 'rs541185110'], dtype='<U115')

In [None]:
test_probs = test.get_probs(sample_ids = ['5542886', '5137974', '3758348', '1391800'], rsids = ['rs559462325', 'rs181691356', 'rs548263598', 'rs555877612', 'rs574115117', 'rs541185110'])
test_probs

reading -- time=0:00:00.10, thread 1 of 2, part 3 of 3


array([[[1.        , 0.        , 0.        ],
        [0.99215686, 0.00784314, 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ]],

       [[1.        , 0.        , 0.        ],
        [0.99215686, 0.00784314, 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ]],

       [[1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ]],

       [[1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.        , 0.        ],
        [1.        , 0.     

In [None]:
test_probs.shape

(4, 6, 3)

In [None]:
test.get_geno_each_sample(test_probs)

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [None]:
test.get_geno_each_sample(test_probs).shape

(4, 6)

In [None]:
np.identity(3)[test.get_geno_each_sample(test_probs).astype(int)]

array([[[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]]])

In [None]:
np.identity(3)[test.get_geno_each_sample(test_probs).astype(int)].shape

(4, 6, 3)

In [None]:
# output: binary matrix of (n samples, p phenotypes)

In [None]:
import pandas as pd

In [None]:
pheno = pd.read_csv("/content/drive/MyDrive/6.874 project/Data/ukb42682_icd10.txt", sep="\t")
pheno

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Unnamed: 1,eid,41202-0.0,41202-0.1,41202-0.2,41202-0.3,41202-0.4,41202-0.5,41202-0.6,41202-0.7,41202-0.8,41202-0.9,41202-0.10,41202-0.11,41202-0.12,41202-0.13,41202-0.14,41202-0.15,41202-0.16,41202-0.17,41202-0.18,41202-0.19,41202-0.20,41202-0.21,41202-0.22,41202-0.23,41202-0.24,41202-0.25,41202-0.26,41202-0.27,41202-0.28,41202-0.29,41202-0.30,41202-0.31,41202-0.32,41202-0.33,41202-0.34,41202-0.35,41202-0.36,41202-0.37,41202-0.38,...,41204-0.144,41204-0.145,41204-0.146,41204-0.147,41204-0.148,41204-0.149,41204-0.150,41204-0.151,41204-0.152,41204-0.153,41204-0.154,41204-0.155,41204-0.156,41204-0.157,41204-0.158,41204-0.159,41204-0.160,41204-0.161,41204-0.162,41204-0.163,41204-0.164,41204-0.165,41204-0.166,41204-0.167,41204-0.168,41204-0.169,41204-0.170,41204-0.171,41204-0.172,41204-0.173,41204-0.174,41204-0.175,41204-0.176,41204-0.177,41204-0.178,41204-0.179,41204-0.180,41204-0.181,41204-0.182,41204-0.183
,6026424,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,1000011,E831,H251,I249,R790,S0220,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,1000026,M179,S8250,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,1000032,H269,H449,R073,R32,R55,T784,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,1000044,H333,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,6026376,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,6026389,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,6026393,D361,M161,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,6026407,K047,N939,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
import icd10

code = icd10.find("E.83.1")
code.description

'Disorders of iron metabolism'