In [None]:
from combinatorial_gwas.genotype import load_genetic_file
from combinatorial_gwas.data_catalog import get_catalog, get_config, get_parameters
from combinatorial_gwas.phenotypes import get_phenotype, get_GWAS_snps_for_trait
import numpy as np



# Demoing how to get phenotype and genotype data

In [None]:
# Load the genetic file object used throughout the rest of this notebook.
# NOTE(review): the argument 21 is presumably the chromosome number (the log output
# below references a chr21 sample file) — confirm against load_genetic_file.
genetic_file = load_genetic_file(21)


Sample IDs are read from /lab/corradin_biobank/Raw_UKB_downloads/sample_files/ukb45624_imp_chr21_v3_s487275.sample.


Mapping genotypes: 100%|██████████| 1261158/1261158 [00:55<00:00, 22745.71it/s]


### Getting multiple ICD codes

In [None]:
# get_phenotype returns a pair; element [1] is the phenotype table shown below,
# with one column per requested ICD code and one row per sample ID (f.eid).
get_phenotype(["I84", "R07"])[1]

Unnamed: 0_level_0,I84,R07
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1
1000025,0,0
1000038,0,0
1000042,0,0
1000056,0,0
1000061,0,0
...,...,...
5873158,0,0
5873167,0,0
5873175,0,0
5873180,0,0


### Getting multiple ICD codes arranged by samples in the genetic file

This is helpful when you want to subset samples and need the phenotype DataFrame to follow exactly the same sample order as the genotype data.

In [None]:
# Sample IDs held by the genetic file object, displayed as a raw array.
# These are passed to get_phenotype in the next cell to fix the row order.
genetic_file.samples.values

array(['5542886', '5137974', '3758348', ..., '2956972', '5229561',
       '3665101'], dtype=object)

In [None]:
# Request the same two ICD codes, this time passing the genetic file's samples so the
# phenotype rows follow that order. The call returns a pair: `sample_index` (used further
# down to select samples when reading genotype data) and the ordered phenotype table.
sample_index, pheno_df_ordered = get_phenotype(
    ["I84", "R07"],
    samples=genetic_file.samples,
)
pheno_df_ordered

Unnamed: 0_level_0,I84,R07
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1
5542886,0,0
5137974,0,0
3758348,0,0
1391800,0,0
3165331,0,0
...,...,...
3783812,0,0
2440570,0,0
1213317,0,0
1354423,0,1


In [None]:
# Sanity check: the sample index and the ordered phenotype table must have the same
# number of rows, otherwise genotype and phenotype rows cannot be paired one-to-one.
n_indexed_samples = sample_index.shape[0]
n_phenotype_rows = pheno_df_ordered.shape[0]
assert n_indexed_samples == n_phenotype_rows

---

### Get SNPs for a phenotype

Get the SNPs that were used for a GWAS of the ICD10 code. You can output either the entire dataframe or just the array of SNP IDs. To find which `phenotype_code` to provide as the function argument, refer to the column of the same name [here](https://docs.google.com/spreadsheets/d/1kvPoupSzsSFBNSztMzl04xMoSC3Kcx3CrjVf4yBmESU/edit?ts=5b5f17db#gid=178908679). For ICD-code phenotypes, the `phenotype_code` values are the same as the ICD codes, but this might not be true for your phenotype of interest.

In [None]:
# With default arguments only the array of SNP IDs is returned
# (pass id_only=False, as done further down, to get the full table instead).
get_GWAS_snps_for_trait("I84")


array(['21:23683958_G_A', '21:44501642_C_T', '21:25570916_T_C', ...,
       '21:46876064_G_C', '21:47773893_C_G', '21:47831648_C_T'],
      dtype=object)

The SNPs are sorted by p-value by default. This is so that you can select **the lowest p-value SNPs** or **the highest beta SNPs**, but you can also sort them by position by changing the `sort_val_cols_list` argument.

In [None]:
# Ask for the full GWAS table (id_only=False) instead of just the SNP IDs,
# sorted by the `pval` column in ascending order.
GWAS_snps_df = get_GWAS_snps_for_trait(
    "I84",
    id_only=False,
    sort_val_cols_list=["pval"],
    ascending_bool_list=[True],
)
GWAS_snps_df

Unnamed: 0,position_rank,variant,minor_allele,minor_AF,expected_case_minor_AC,low_confidence_variant,n_complete_samples,AC,ytx,beta,se,tstat,pval,chr,position,major_allele,full_id
170594,170594,21:46875774:C:T,T,0.000007,0.172184,True,361194,4.90588,2.117650,0.667492,0.107762,6.19411,5.867920e-10,21,46875774,C,21:46875774_C_T
84094,84094,21:30968890:C:T,T,0.000002,0.043356,True,361194,1.23529,1.000000,0.994117,0.186704,5.32456,1.012560e-07,21,30968890,C,21:30968890_C_T
151278,151278,21:43767602:A:G,G,0.000002,0.053403,True,361194,1.52157,0.831373,1.189730,0.226778,5.24623,1.553320e-07,21,43767602,A,21:43767602_A_G
88826,88826,21:31869421:T:G,G,0.000002,0.041291,True,361194,1.17647,0.952941,0.990411,0.192736,5.13868,2.768140e-07,21,31869421,T,21:31869421_T_G
162176,162176,21:45587940:C:T,T,0.000003,0.068543,True,361194,1.95294,0.933333,0.871227,0.184111,4.73207,2.223280e-06,21,45587940,C,21:45587940_C_T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161896,161896,21:45545974:C:T,T,0.000000,0.000000,True,361194,0.00000,0.000000,,,,,21,45545974,C,21:45545974_C_T
165184,165184,21:46067181:G:C,C,0.000000,0.000000,True,361194,0.00000,0.000000,,,,,21,46067181,G,21:46067181_G_C
170601,170601,21:46876064:G:C,C,0.000000,0.000000,True,361194,0.00000,0.000000,,,,,21,46876064,G,21:46876064_G_C
176072,176072,21:47773893:C:G,G,0.000000,0.000000,True,361194,0.00000,0.000000,,,,,21,47773893,C,21:47773893_C_G


Here we demonstrate these steps: 

1) keep the sort by `pval` or beta
2) return the dataframe 
3) subset to the SNPs you want (here, SNPs with p-value < 1e-5)
4) and then sort again by position (to put into a CNN for example)

In [None]:
# Step 1: keep only the SNPs whose p-value is below the 1e-5 threshold.
significant_snps = GWAS_snps_df.query("pval < 1e-5")
# Step 2: re-order the survivors by genomic position (the table arrives sorted by p-value).
significant_by_position = significant_snps.sort_values("position")
# Step 3: keep just the SNP identifiers as a plain array.
subset_snps = significant_by_position["full_id"].values
subset_snps

array(['21:30411367_T_G', '21:30968890_C_T', '21:31812778_T_A',
       '21:31869421_T_G', '21:34889665_T_C', '21:43767602_A_G',
       '21:45587940_C_T', '21:46875774_C_T', '21:47269877_C_T',
       '21:47419606_C_T'], dtype=object)

In [None]:
# Number of SNPs that passed the p-value filter.
subset_snps.shape

(10,)

#### Now we get the index of the chosen snps

In [None]:
# Locate each selected SNP ID inside the genetic file's variant-ID array.
# Read the ID array once and reuse it instead of fetching the attribute twice.
variant_ids = np.asarray(genetic_file.bgen_reader_obj.ids)
wanted_ids = np.asarray(subset_snps)

# Permutation that sorts the IDs; np.searchsorted needs it because the IDs are
# not stored in sorted order.
sorter = np.argsort(variant_ids)
insert_pos = np.searchsorted(variant_ids, wanted_ids, sorter=sorter)

# searchsorted returns *insertion points*, not matches: an ID that is absent from the
# file maps to a neighbouring variant, or to len(variant_ids), which is out of bounds
# for `sorter`. Clip so the lookup is always in bounds, then verify every hit is exact.
variants_index = sorter[np.clip(insert_pos, 0, len(sorter) - 1)]
not_found = wanted_ids[variant_ids[variants_index] != wanted_ids]
assert not_found.size == 0, (
    f"SNP IDs not present in the genetic file: {not_found.tolist()}"
)
variants_index

array([ 612848,  630868,  662162,  664130,  766757, 1084605, 1154997,
       1211179, 1226583, 1232900])

In [None]:
# Read the genotype data for every chosen SNP, restricted to the samples selected
# by `sample_index` (the samples matched to the phenotype table).
selection = (sample_index, variants_index)  # (samples, variants) index pair
probs = genetic_file.bgen_reader_obj.read(selection)
probs.shape


reading -- time=0:00:00.00, thread 1 of 10, part 1 of 1


(357643, 10, 3)

#### Turn the probabilities into one-hot encoded values

In [None]:
# Reduce the per-genotype probabilities to a single genotype code per sample and SNP,
# using the "max" mode of the project helper.
geno_calls = genetic_file.get_geno_each_sample(probs, "max")

# The codes are cast to int so they can be used as indices; the helper's output can
# apparently contain NaNs.
# NOTE(review): casting an actual NaN to int is not well-defined in NumPy — confirm
# that no NaNs are present for the chosen SNPs/samples before relying on this.
geno_codes = geno_calls.astype(int)

# Row k of the 3x3 identity matrix is the one-hot vector for genotype code k,
# so indexing by the code array appends a one-hot axis of length 3.
one_hot_rows = np.identity(3)
ohe_genetic_info = one_hot_rows[geno_codes]
ohe_genetic_info

array([[[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       ...,

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]],

       [[1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.],
        ...,
        [1., 0., 0.],
        [1., 0., 0.],
        [1., 0., 0.]]])