In [None]:
#default_exp genotype

In [None]:
from combinatorial_gwas.data_catalog import get_catalog, get_config, get_parameters
from pydantic import BaseModel
import dask.dataframe as dd
from dask.delayed import Delayed
from bgen_reader import open_bgen, read_bgen
import numpy as np
import pandas as pd
from typing import List, Union

In [None]:
# List the entries available in the data catalog returned by get_catalog().
get_catalog().list()

['example_iris_data',
 'globals_test',
 'ICD10_pheno_matrix',
 'phenotypes_info_Neale',
 'GWAS_results_links_df',
 'heritability_trait_level_summary',
 'heritability_trait_level_partitioned_z4_sig']

In [None]:
#export
def get_geno_one_snp(row, high_lim=0.9, low_lim=0.3, NA_val=np.nan):
    """Turn one (hom-ref, het, hom-alt) probability triple into a hard genotype call.

    A genotype is called only when its own probability is at least
    `high_lim` and both of the other two probabilities are below `low_lim`.

    Parameters
    ----------
    row : sequence of three values
        Probabilities unpacked as (hom-ref, het, hom-alt).
    high_lim : float
        Minimum probability the called genotype must reach.
    low_lim : float
        Upper bound (exclusive) for the two non-called genotypes.
    NA_val : scalar
        Value returned when no genotype satisfies the thresholds.

    Returns
    -------
    The result of `np.select`: 0. (hom-ref), 1. (het), 2. (hom-alt) or `NA_val`.
    """
    p_hom_ref, p_het, p_hom_alt = row
    triple = (p_hom_ref, p_het, p_hom_alt)

    # Evaluate each threshold once per probability, then combine them per genotype.
    confident = [p >= high_lim for p in triple]
    unlikely = [p < low_lim for p in triple]

    call_conditions = [
        confident[0] & unlikely[1] & unlikely[2],  # hom-ref
        confident[1] & unlikely[0] & unlikely[2],  # het
        confident[2] & unlikely[0] & unlikely[1],  # hom-alt
    ]
    return np.select(call_conditions, [0., 1., 2.], default=NA_val)

In [None]:
#export
class BgenFileObject(BaseModel):
    """Bundle of the objects used to query a single BGEN file.

    Instances are built by `load_genetic_file`, which passes the dictionary
    returned by `read_bgen` (fields `variants`, `samples`, `genotype`) plus an
    `open_bgen` reader. All index lookups and probability reads in the methods
    below go through `bgen_reader_obj`.
    """
    variants: dd.DataFrame
    samples: pd.Series
    genotype: List[Delayed]
    bgen_reader_obj: open_bgen

    class Config:
        # dask / pandas / bgen_reader objects are not pydantic-native types.
        arbitrary_types_allowed = True

    def __repr__(self):
        return str(self.__class__) + f" {self.samples.shape[0]} samples"

    def get_variant_index(self, ids=None):
        """Return the positions of `ids` within `bgen_reader_obj.ids`.

        Returns None when `ids` is None. Positions come back in file order,
        not in the order of `ids`; identifiers that are not found are
        silently dropped.
        """
        # The original condition tested an undefined name (`rsids`), which raised
        # NameError or silently picked up a leaked notebook global.
        if ids is None:
            return None
        return np.flatnonzero(np.isin(self.bgen_reader_obj.ids, ids))

    def get_sample_index(self, sample_ids=None):
        """Return the positions of `sample_ids` within `self.samples`, or None when `sample_ids` is None."""
        if sample_ids is None:
            return None
        return np.flatnonzero(np.isin(self.samples.values, sample_ids))

    def get_probs(self, sample_ids=None, rsids=None):
        """Read genotype probabilities for the selected samples and variants.

        `rsids` are resolved through `get_variant_index`, i.e. matched against
        `bgen_reader_obj.ids`. A None selection is passed to the reader
        unchanged (presumably meaning "all" — confirm against bgen_reader docs).
        """
        variant_index = self.get_variant_index(rsids)
        sample_index = self.get_sample_index(sample_ids)
        return self.bgen_reader_obj.read((sample_index, variant_index))

    def get_geno_each_sample(self, probs, prob_to_geno_func: str = "stringent", high_lim=0.9, low_lim=0.3, NA_val=np.nan):
        """Convert genotype probabilities into hard calls along axis 2.

        Parameters
        ----------
        probs : array with the per-genotype probabilities on axis 2.
        prob_to_geno_func : "max" or "stringent"
            "max" picks the most probable genotype (NaNs ignored);
            "stringent" applies `get_geno_one_snp` with `high_lim` / `low_lim`.
        high_lim, low_lim : thresholds forwarded to `get_geno_one_snp`
            (only used in "stringent" mode).
        NA_val : value used where no genotype can be called.

        Raises
        ------
        ValueError
            If `prob_to_geno_func` is not one of the two supported modes.
        """
        if prob_to_geno_func == "max":
            # np.nanargmax raises ValueError on a slice that is entirely NaN, so
            # such slices are filled before the argmax and set to NA_val afterwards.
            all_missing = np.isnan(probs).all(axis=2)
            if all_missing.any():
                filled = np.where(np.expand_dims(all_missing, 2), 0.0, probs)
                geno = np.nanargmax(filled, axis=2).astype(float)
                geno[all_missing] = NA_val
            else:
                geno = np.nanargmax(probs, axis=2).astype(float)
        elif prob_to_geno_func == "stringent":
            geno = np.apply_along_axis(get_geno_one_snp, axis=2, arr=probs, high_lim=high_lim, low_lim=low_lim, NA_val=NA_val)
        else:
            # Previously fell through to `return geno` with `geno` unbound.
            raise ValueError(f"prob_to_geno_func must be 'max' or 'stringent', got {prob_to_geno_func!r}")

        return geno

    def get_allele_ids(self, rsids=None, variant_index=None):
        """Return a DataFrame with columns `allele_1` / `allele_2` for the selected variants.

        `variant_index` takes precedence; otherwise it is derived from `rsids`
        via `get_variant_index`. With neither given, every variant in the file
        is returned. Each allele string is split on "," and must yield exactly
        two alleles.
        """
        if variant_index is None:
            variant_index = self.get_variant_index(rsids)

        allele_ids = self.bgen_reader_obj.allele_ids
        # Indexing with None would add an axis instead of selecting everything.
        if variant_index is not None:
            allele_ids = allele_ids[variant_index]

        df = pd.DataFrame([allele_str.split(",") for allele_str in allele_ids], columns=["allele_1", "allele_2"])

        if rsids is not None:
            # NOTE(review): rows are in file order, so this labelling is only correct
            # when `rsids` is unique, fully matched and already in file order — confirm.
            df.index = rsids
        return df

    def get_variant_combinations(self, rsids=None, variant_index=None):
        """Return the possible genotype combinations for the selected variants.

        The allele table from `get_allele_ids` is passed to
        `get_possible_geno_combinations`, which is not defined in the visible
        part of this file.
        """
        if variant_index is None and rsids is not None:
            # NOTE(review): this matches against `bgen_reader_obj.rsids`, whereas
            # get_variant_index matches against `bgen_reader_obj.ids` — confirm
            # which identifier is intended before unifying the two lookups.
            variant_index = np.flatnonzero(np.isin(self.bgen_reader_obj.rsids, rsids))
        geno_df = self.get_allele_ids(rsids, variant_index)
        geno_df = get_possible_geno_combinations(geno_df, "allele_1", "allele_2")
        return geno_df

In [None]:
#export 
# Project parameters, loaded once when this module is imported;
# get_genetic_file_path reads its path templates from this mapping.
parameters = get_parameters()

def get_genetic_file_path(chrom):
    """Return `(bgen_path, sample_path)` for chromosome `chrom`.

    Both paths are produced by formatting the "genetic_file_path_template"
    and "sample_file_template" entries of the module-level `parameters`
    with `chrom_number=chrom`.
    """
    template_keys = ("genetic_file_path_template", "sample_file_template")
    return tuple(parameters[key].format(chrom_number=chrom) for key in template_keys)

def load_genetic_file(chrom):
    """Build a `BgenFileObject` for chromosome `chrom`.

    The file is opened twice: `read_bgen` supplies the model fields and
    `open_bgen` supplies the reader stored as `bgen_reader_obj`.
    """
    bgen_path, sample_path = get_genetic_file_path(chrom)
    reader_kwargs = dict(filepath=bgen_path, samples_filepath=sample_path)

    # Same call order as before: read_bgen first, then open_bgen.
    bgen_fields = read_bgen(**reader_kwargs)
    reader = open_bgen(**reader_kwargs)
    return BgenFileObject(**bgen_fields, bgen_reader_obj=reader)

In [None]:
# Sanity check: resolve the BGEN and sample file paths for chromosome 22.
# NOTE(review): the saved output shows a chr21 sample file for chromosome 22 —
# presumably "sample_file_template" has no {chrom_number} placeholder; confirm this is intended.
get_genetic_file_path(22)

('/lab/corradin_biobank/Raw_UKB_downloads/BGEN/ukb_imp_chr22_v3.bgen',
 '/lab/corradin_biobank/Raw_UKB_downloads/sample_files/ukb45624_imp_chr21_v3_s487275.sample')

In [None]:
# Load chromosome 21 for the exploratory cells below
# (the saved progress bar reports roughly one minute of genotype mapping).
test_bgen = load_genetic_file(21)

Sample IDs are read from /lab/corradin_biobank/Raw_UKB_downloads/sample_files/ukb45624_imp_chr21_v3_s487275.sample.


Mapping genotypes: 100%|██████████| 1261158/1261158 [01:06<00:00, 18944.65it/s]


In [None]:
# Sample IDs loaded from the .sample file.
# NOTE(review): the saved output contains what look like real sample identifiers —
# consider clearing it before sharing the notebook.
test_bgen.samples

0         5542886
1         5137974
2         3758348
3         1391800
4         3165331
           ...   
487404    5512806
487405    5548469
487406    2956972
487407    5229561
487408    3665101
Name: id, Length: 487409, dtype: object

In [None]:
# Variant identifiers exposed by the open_bgen reader; get_variant_index matches against these.
test_bgen.bgen_reader_obj.ids

memmap(['21:9411239_G_A', '21:9411245_C_A', '21:9411264_A_C', ...,
        '21:48119697_T_G', '21:48119700_A_G', '21:48119740_C_G'],
       dtype='<U115')