In [None]:
#default_exp phenotypes

In [None]:
#export
from combinatorial_GWAS.data_catalog import get_catalog, get_parameters
import combinatorial_GWAS
from pathlib import Path
import pandas as pd
from dataclasses import dataclass
from functools import partial
import numpy as np
from typing import List
from fastcore.utils import partialler
import logging

In [None]:
#export

@pd.api.extensions.register_dataframe_accessor("pheno")
@dataclass
class QueryDataframe():
    """DataFrame accessor (``df.pheno``) that filters rows via keyword conditions.

    Each keyword names a column and its value is the right-hand side of a
    ``DataFrame.query`` condition on that column, for example
    ``df.pheno.query(h2_sig="in ['z7', 'z4']", source="== 'icd10'")``.
    """
    # The DataFrame this accessor is attached to.
    df: pd.DataFrame

    def query(self, **column_dict: str) -> pd.DataFrame:
        """Return the rows of ``self.df`` that satisfy every given condition.

        Parameters
        ----------
        **column_dict : str
            ``column="<operator and value>"`` pairs. Each pair is rendered as
            ``(column <operator and value>)`` and all pairs are joined with
            ``and`` before being handed to ``DataFrame.query``.

        Returns
        -------
        pd.DataFrame
            The filtered frame. With no conditions, a copy of the whole frame.
        """
        if not column_dict:
            # An empty expression makes DataFrame.query raise ValueError;
            # "no conditions" should simply mean "no filtering".
            return self.df.copy()
        query_str = " and ".join(f"({col} {cond})" for col, cond in column_dict.items())
        return self.df.query(query_str)

In [None]:
# Fetch the project parameters and display them.
parameters = get_parameters()
parameters

{'example_test_data_ratio': 0.2,
 'example_num_train_iter': 10000,
 'example_learning_rate': 0.01,
 'template_gwas_result_file_link': 'https://broad-ukb-sumstats-us-east-1.s3.amazonaws.com/round2/additive-tsvs/{phenotype_code}.gwas.imputed_v3.both_sexes.tsv.bgz'}

In [None]:
# Build the data catalog, reload it, and show the names of the datasets it offers.
catalog_all = get_catalog().reload()
catalog_all.list()

['example_iris_data',
 'ICD10_pheno_matrix',
 'phenotypes_info_Neale',
 'GWAS_results_links_df',
 'heritability_trait_level_summary',
 'heritability_trait_level_partitioned_z4_sig']

In [None]:
#export
# Reader for tab-separated files using gzip decompression (the linked GWAS result files end in .tsv.bgz).
read_csv_compressed = partialler(pd.read_csv, sep="\t", compression="gzip")
# Builds a GWAS result URL from the template; call as get_GWAS_result_link(phenotype_code="I84").
# The template is fetched through get_parameters() here because the `parameters` variable is assigned
# in a cell without the #export flag, so it would be undefined when the exported module is imported.
get_GWAS_result_link = partialler(get_parameters()['template_gwas_result_file_link'].format)


---

### Find high heritability traits

In [None]:
# Load the "heritability_trait_level_summary" dataset from the catalog and preview the first rows.
heritability_Neale = catalog_all.load("heritability_trait_level_summary")
heritability_Neale.head()

  return pd.read_csv(fs_file, **self._load_args)


Unnamed: 0,phenotype,description,h2_liability,h2_liability_se,h2_observed,h2_observed_se,h2_z,h2_p,h2_sig,confidence,...,sex,isNotPrimary,isBadPower,isLowNeff,isMidNeff,isExtremeSE,isHighSE,isSexBias,isBadOrdinal,isNumericOrdinal
0,100001_irnt,Food weight,0.068818,0.016857,0.068818,0.016857,4.082528,2.2e-05,z4,high,...,both_sexes,False,False,False,False,False,False,False,False,False
1,100002_irnt,Energy,0.064783,0.015784,0.064783,0.015784,4.104249,2e-05,z4,high,...,both_sexes,False,False,False,False,False,False,False,False,False
2,100003_irnt,Protein,0.034543,0.014619,0.034543,0.014619,2.362892,0.009066,nominal,high,...,both_sexes,False,False,False,False,False,False,False,False,False
3,100004_irnt,Fat,0.055197,0.018265,0.055197,0.018265,3.021937,0.001256,nominal,high,...,both_sexes,False,False,False,False,False,False,False,False,False
4,100005_irnt,Carbohydrate,0.051744,0.015357,0.051744,0.015357,3.369359,0.000377,nominal,high,...,both_sexes,False,False,False,False,False,False,False,False,False


In [None]:
# List every column available in the heritability summary table.
heritability_Neale.columns

Index(['phenotype', 'description', 'h2_liability', 'h2_liability_se',
       'h2_observed', 'h2_observed_se', 'h2_z', 'h2_p', 'h2_sig', 'confidence',
       'notes', 'intercept', 'intercept_se', 'intercept_z', 'intercept_p',
       'lambdaGC', 'mean_chi2', 'ratio', 'ratio_se', 'n', 'Neff',
       'variable_type', 'isBinary', 'n_cases', 'n_controls', 'prevalence',
       'source', 'sex', 'isNotPrimary', 'isBadPower', 'isLowNeff', 'isMidNeff',
       'isExtremeSE', 'isHighSE', 'isSexBias', 'isBadOrdinal',
       'isNumericOrdinal'],
      dtype='object')

In [None]:
# Subset of columns shown when previewing the heritability table.
display_cols = [
    'description',
    'h2_liability',
    'h2_sig',
    'confidence',
    'n_cases',
    'n_controls',
    'prevalence',
]

In [None]:
# Keep ICD10 phenotypes whose h2_sig is z7 or z4 and whose confidence is medium or high,
# order them from highest to lowest h2_liability, and index the result by phenotype code.
quality_heritability_phenos = (
    heritability_Neale.pheno.query(
        h2_sig="in ['z7', 'z4']",
        source=" == 'icd10'",
        confidence="in ['medium', 'high']",
    )
    .sort_values("h2_liability", ascending=False)
    .set_index("phenotype")
)
quality_heritability_phenos.head()[display_cols]

Unnamed: 0_level_0,description,h2_liability,h2_sig,confidence,n_cases,n_controls,prevalence
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
I83,Diagnoses - main ICD10: I83 Varicose veins of ...,0.189299,z7,medium,8763.0,352431.0,0.024261
I25,Diagnoses - main ICD10: I25 Chronic ischaemic ...,0.163823,z7,medium,12769.0,348425.0,0.035352
G56,Diagnoses - main ICD10: G56 Mononeuropathies o...,0.157519,z7,medium,8130.0,353064.0,0.022509
N81,Diagnoses - main ICD10: N81 Female genital pro...,0.152223,z4,medium,7511.0,186663.0,0.038682
M16,Diagnoses - main ICD10: M16 Coxarthrosis [arth...,0.151297,z4,medium,9136.0,352058.0,0.025294


In [None]:
# Show the display columns for all of the filtered phenotypes.
quality_heritability_phenos[display_cols]

Unnamed: 0_level_0,description,h2_liability,h2_sig,confidence,n_cases,n_controls,prevalence
phenotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
I83,Diagnoses - main ICD10: I83 Varicose veins of ...,0.189299,z7,medium,8763.0,352431.0,0.024261
I25,Diagnoses - main ICD10: I25 Chronic ischaemic ...,0.163823,z7,medium,12769.0,348425.0,0.035352
G56,Diagnoses - main ICD10: G56 Mononeuropathies o...,0.157519,z7,medium,8130.0,353064.0,0.022509
N81,Diagnoses - main ICD10: N81 Female genital pro...,0.152223,z4,medium,7511.0,186663.0,0.038682
M16,Diagnoses - main ICD10: M16 Coxarthrosis [arth...,0.151297,z4,medium,9136.0,352058.0,0.025294
M17,Diagnoses - main ICD10: M17 Gonarthrosis [arth...,0.14436,z7,high,11497.0,349697.0,0.031831
I48,Diagnoses - main ICD10: I48 Atrial fibrillatio...,0.144326,z4,medium,6356.0,354838.0,0.017597
C44,Diagnoses - main ICD10: C44 Other malignant ne...,0.141495,z4,medium,9086.0,352108.0,0.025155
K80,Diagnoses - main ICD10: K80 Cholelithiasis,0.137236,z4,high,10520.0,350674.0,0.029126
I21,Diagnoses - main ICD10: I21 Acute myocardial i...,0.136628,z4,medium,5948.0,355246.0,0.016468


In [None]:
# (rows, columns) of the filtered phenotype table.
quality_heritability_phenos.shape

In [None]:
# Look up the row for phenotype code "I84" (the table is indexed by phenotype) and display it.
pheno = quality_heritability_phenos.loc["I84"]
pheno

description         Diagnoses - main ICD10: I84 Haemorrhoids
h2_liability                                        0.057475
h2_liability_se                                     0.013482
h2_observed                                         0.009864
h2_observed_se                                      0.002314
h2_z                                                4.263044
h2_p                                                 0.00001
h2_sig                                                    z4
confidence                                              high
notes                                                    NaN
intercept                                           1.018834
intercept_se                                        0.008843
intercept_z                                         2.129842
intercept_p                                         0.016592
lambdaGC                                            1.074945
mean_chi2                                           1.085477
ratio                   

---

### Output phenotype matrix

In [None]:
#export
# Build the catalog from the imported get_catalog() instead of using `catalog_all`: that variable is
# created in a cell without the #export flag, so it would be undefined in the exported module.
icd10_pheno_matrix = get_catalog().reload().load("ICD10_pheno_matrix")

# Columns whose name contains "primary".
icd10_primary_cols = icd10_pheno_matrix.columns[icd10_pheno_matrix.columns.str.contains("primary")]
# Cast every value to str and keep only its first 3 characters (the 3-character ICD code).
# Note that missing values become the literal string "nan" after astype(str).
icd10_pheno_matrix = icd10_pheno_matrix.astype(str).apply(lambda x: x.str.slice(0,3))



  result = func()


In [None]:
#export
def get_phenotype(icd10_codes="I84"):
    """Build a 0/1 phenotype matrix for one or more 3-character ICD10 codes.

    Parameters
    ----------
    icd10_codes : str or iterable of str
        A single code (e.g. ``"I84"``) or several codes (e.g. ``["I84", "R07"]``).

    Returns
    -------
    pd.DataFrame
        One column per requested code, indexed like ``icd10_pheno_matrix``.
        A cell is 1 when any of the ``icd10_primary_cols`` columns of that row
        equals the code, otherwise 0. An empty ``icd10_codes`` gives a frame
        with the same index and no columns.
    """
    if isinstance(icd10_codes, str):
        # A bare string must be wrapped, not iterated: list("I84") is ['I', '8', '4'].
        icd10_codes = [icd10_codes]
    else:
        icd10_codes = list(icd10_codes)

    # Slice the primary-diagnosis columns once rather than once per code.
    primary_diagnoses = icd10_pheno_matrix[icd10_primary_cols]
    if not icd10_codes:
        # pd.concat raises on an empty list; return an empty (column-less) result instead.
        return pd.DataFrame(index=primary_diagnoses.index)

    pheno_df_list = [primary_diagnoses.isin([icd10_code]).any(axis=1).astype(int) for icd10_code in icd10_codes]
    pheno_df = pd.concat(pheno_df_list, axis=1)
    pheno_df.columns = icd10_codes
    return pheno_df

In [None]:
# Example: phenotype matrix with one column for each of the two requested codes.
get_phenotype(icd10_codes = ["I84", "R07"])

Unnamed: 0_level_0,I84,R07
f.eid,Unnamed: 1_level_1,Unnamed: 2_level_1
1000025,0,0
1000038,0,0
1000042,0,0
1000056,0,0
1000061,0,0
...,...,...
5873158,0,0
5873167,0,0
5873175,0,0
5873180,0,0


---
### Get SNPs used in GWAS for trait

In [None]:
#export

def get_GWAS_snps_for_trait(phenotype_code= "I84", chromosome:int = 21, sort_val_cols_list: List[str] = ["pval"], ascending_bool_list: List[bool] = [False], id_only= True):
    """Load the GWAS result table for a phenotype and return its variants on one chromosome.

    The table is read with ``read_csv_compressed`` from the link produced by
    ``get_GWAS_result_link``. Its ``variant`` column is expected to hold four
    ``:``-separated fields, e.g. ``21:23683958:G:A``.

    Parameters
    ----------
    phenotype_code : str
        Code substituted into the GWAS result link.
    chromosome : int
        Only variants whose ID starts with ``"<chromosome>:"`` are kept.
    sort_val_cols_list : list of str
        Columns to sort by (default: ``pval``).
    ascending_bool_list : list of bool
        Sort direction for each sort column (default: descending).
    id_only : bool
        When True return only the array of ``full_id`` values, otherwise the
        whole annotated DataFrame.

    Returns
    -------
    numpy.ndarray or pd.DataFrame
        The sorted ``full_id`` values (``<f1>:<f2>_<f3>_<f4>`` built from the four
        variant fields), or the sorted table with the added columns
        ``position_rank`` (row order within the chromosome before sorting),
        ``chr``, ``position`` (int), ``major_allele`` and ``full_id``.
        Both are empty when no variant matches the chromosome.
    """
    chromosome_str = f"{chromosome}:"
    gwas_result_df = read_csv_compressed(get_GWAS_result_link(phenotype_code=phenotype_code))

    # A boolean mask avoids `.str` inside a query string, which depends on the query engine in use.
    on_chromosome = gwas_result_df["variant"].str.startswith(chromosome_str, na=False)
    gwas_result_df = (
        gwas_result_df[on_chromosome]
        .reset_index(drop=True)
        .reset_index()
        .rename(columns={"index": "position_rank"})
        .sort_values(sort_val_cols_list, ascending=ascending_bool_list)
    )

    if gwas_result_df.empty:
        # str.split(expand=True) produces no columns for an empty Series, so the
        # field lookups below would raise KeyError; add empty columns instead.
        gwas_result_df = gwas_result_df.assign(
            chr=pd.Series(dtype=object),
            position=pd.Series(dtype=int),
            major_allele=pd.Series(dtype=object),
            full_id=pd.Series(dtype=object),
        )
    else:
        variant_id_df = gwas_result_df["variant"].str.split(":", expand=True)
        gwas_result_df["chr"] = variant_id_df[0]
        gwas_result_df["position"] = variant_id_df[1].astype(int)
        gwas_result_df["major_allele"] = variant_id_df[2]
        # Vectorised string concatenation instead of a row-wise apply("_".join).
        gwas_result_df["full_id"] = (
            variant_id_df[0] + ":" + variant_id_df[1] + "_" + variant_id_df[2] + "_" + variant_id_df[3]
        )

    if id_only:
        return gwas_result_df["full_id"].values
    return gwas_result_df


In [None]:
# Example with the defaults (phenotype "I84", chromosome 21): returns only the variant IDs.
get_GWAS_snps_for_trait()

array(['21:23683958_G_A', '21:44501642_C_T', '21:25570916_T_C', ...,
       '21:46876064_G_C', '21:47773893_C_G', '21:47831648_C_T'],
      dtype=object)

In [None]:
# Same call, but returning the full annotated table rather than only the IDs.
get_GWAS_snps_for_trait(id_only=False)

Unnamed: 0,position_rank,variant,minor_allele,minor_AF,expected_case_minor_AC,low_confidence_variant,n_complete_samples,AC,ytx,beta,se,tstat,pval,chr,position,major_allele,full_id
46739,46739,21:23683958:G:A,A,0.448599,11373.8000,False,361194,324062.000,11381.3000,1.745250e-09,0.000435,0.000004,0.999997,21,23683958,G,21:23683958_G_A
156679,156679,21:44501642:C:T,T,0.012528,317.6430,False,361194,9050.310,315.2900,-1.938640e-08,0.002030,-0.000010,0.999992,21,44501642,C,21:44501642_C_T
57576,57576,21:25570916:T:C,C,0.006228,157.8990,False,361194,4498.860,158.5530,-5.141850e-08,0.002875,-0.000018,0.999986,21,25570916,T,21:25570916_T_C
17572,17572,21:18598893:A:C,C,0.001108,28.0995,False,361194,800.612,28.0941,1.672180e-07,0.006937,0.000024,0.999981,21,18598893,A,21:18598893_A_C
89085,89085,21:31934274:T:C,C,0.011455,290.4320,False,361194,8275.000,288.0000,7.702210e-08,0.002036,0.000038,0.999970,21,31934274,T,21:31934274_T_C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161896,161896,21:45545974:C:T,T,0.000000,0.0000,True,361194,0.000,0.0000,,,,,21,45545974,C,21:45545974_C_T
165184,165184,21:46067181:G:C,C,0.000000,0.0000,True,361194,0.000,0.0000,,,,,21,46067181,G,21:46067181_G_C
170601,170601,21:46876064:G:C,C,0.000000,0.0000,True,361194,0.000,0.0000,,,,,21,46876064,G,21:46876064_G_C
176072,176072,21:47773893:C:G,G,0.000000,0.0000,True,361194,0.000,0.0000,,,,,21,47773893,C,21:47773893_C_G
