In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle

import numpy as np
import pandas as pd

import settings as conf
import metadata

# Traits info file

In [3]:
from entity import Trait

In [4]:
pheno_codes = metadata.RAPID_GWAS_PHENO_INFO.index
pheno_codes = pheno_codes.union(metadata.GTEX_GWAS_PHENO_INFO.index)

In [5]:
assert len(pheno_codes) == len(metadata.RAPID_GWAS_PHENO_INFO.index) + len(metadata.GTEX_GWAS_PHENO_INFO.index)

In [6]:
pheno_codes

Index(['100001_irnt', '100001_raw', '100002_irnt', '100002_raw', '100003_irnt',
       '100003_raw', '100004_irnt', '100004_raw', '100005_irnt', '100005_raw',
       ...
       'Z53', 'Z71', 'Z76', 'Z80', 'Z85', 'Z87', 'age', 'is_female',
       'pgc.scz2', 'tag.evrsmk.tbl'],
      dtype='object', length=4473)

In [7]:
pheno_traits = [Trait(code) for code in pheno_codes]

In [8]:
assert len(pheno_traits) == len(pheno_codes)

In [9]:
pheno_traits[:5]

[100001_irnt-Food_weight,
 100001_raw-Food_weight,
 100002_irnt-Energy,
 100002_raw-Energy,
 100003_irnt-Protein]

In [10]:
pheno_traits[-5:]

[Z87-Diagnoses_main_ICD10_Z87_Personal_history_of_other_diseases_and_conditions,
 age-Age_at_recruitment,
 is_female-is_female_based_on_inferred_genetic_sex,
 pgc.scz2,
 tag.evrsmk.tbl]

In [11]:
pheno_data_tuples = [
    (t.get_plain_name(), t.code, t.description, t.type, t.n, t.n_cases, t.n_controls, t.source)
    for t in pheno_traits
]

In [12]:
pheno_info_df = pd.DataFrame.from_records(
    pheno_data_tuples,
    index='full_code',
    columns='full_code short_code description type n n_cases n_controls source'.split()
)

In [13]:
pheno_info_df.head()

Unnamed: 0_level_0,short_code,description,type,n,n_cases,n_controls,source
full_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100001_irnt-Food_weight,100001_irnt,Food weight,continuous_irnt,51453,,,UK Biobank
100001_raw-Food_weight,100001_raw,Food weight,continuous_raw,51453,,,UK Biobank
100002_irnt-Energy,100002_irnt,Energy,continuous_irnt,51453,,,UK Biobank
100002_raw-Energy,100002_raw,Energy,continuous_raw,51453,,,UK Biobank
100003_irnt-Protein,100003_irnt,Protein,continuous_irnt,51453,,,UK Biobank


In [14]:
pheno_info_df.tail()

Unnamed: 0_level_0,short_code,description,type,n,n_cases,n_controls,source
full_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Z87-Diagnoses_main_ICD10_Z87_Personal_history_of_other_diseases_and_conditions,Z87,Diagnoses - main ICD10: Z87 Personal history o...,categorical,361194,234.0,360960.0,UK Biobank
age-Age_at_recruitment,age,Age at recruitment,continuous_raw,361194,,,UK Biobank
is_female-is_female_based_on_inferred_genetic_sex,is_female,"is_female, based on inferred genetic sex",categorical,361194,194174.0,167020.0,UK Biobank
pgc.scz2,pgc.scz2,Schizophrenia,binary,150064,36989.0,113075.0,PGC
tag.evrsmk.tbl,tag.evrsmk.tbl,Tobacco Ever Smoker vs Never Smoker,binary,74035,41969.0,32066.0,TAG


## Load S-MultiXcan results

In [15]:
smultixcan_pvalues_file = os.path.join(conf.GENE_ASSOC_DIR, f'smultixcan-mashr-pvalues.pkl.xz')
display(smultixcan_pvalues_file)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.pkl.xz'

In [16]:
smultixcan_gene_associations = pd.read_pickle(smultixcan_pvalues_file)

In [17]:
smultixcan_gene_associations.shape

(22515, 4091)

In [18]:
smultixcan_gene_associations.head(5)

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.952828,0.827955,0.976745,0.916643,0.580172,0.270858,0.779171,0.597882,0.531657,0.534696,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.646365,0.299606,0.477472,0.00357,0.905365,0.845711,0.13743,0.868056,0.028947,0.392273,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.896074,0.094218,0.507646,0.166442,0.974545,0.62891,0.425188,0.551723,0.059456,0.295833,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.576924,0.107121,0.439276,0.567582,0.768334,0.203873,0.231449,0.106706,0.595655,0.953718,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.95639,0.492012,0.510924,0.532389,0.555313,0.993563,0.807439,0.948366,0.774694,0.490962,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


## Select traits from S-MultiXcan results

In [19]:
phenos_used_df = pheno_info_df.loc[smultixcan_gene_associations.columns.tolist()]

In [20]:
phenos_used_df.shape

(4091, 7)

In [21]:
assert phenos_used_df.shape[0] == smultixcan_gene_associations.shape[1]

In [22]:
phenos_used_df.head()

Unnamed: 0_level_0,short_code,description,type,n,n_cases,n_controls,source
full_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4270-Volume_level_set_by_participant_left,4270,Volume level set by participant (left),ordinal,116790,,,UK Biobank
S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,S05,Diagnoses - main ICD10: S05 Injury of eye and ...,categorical,361194,159.0,361035.0,UK Biobank
20003_1141157402-Treatmentmedication_code_prednisolone_product,20003_1141157402,Treatment/medication code: prednisolone product,binary,361141,139.0,361002.0,UK Biobank
20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,20002_1427,"Non-cancer illness code, self-reported: polycy...",binary,361141,251.0,360890.0,UK Biobank
110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,110001,Invitation to complete online 24-hour recall d...,binary,218234,148169.0,70065.0,UK Biobank


## Save

In [23]:
phenos_used_df.to_csv(os.path.join(conf.DELIVERABLES_DIR, 'phenotypes_info.tsv.gz'), sep='\t')

# Genes info file

In [24]:
genes_info = metadata.GENES_MAPPINGS.set_index('gene_id').loc[smultixcan_gene_associations.index]

In [25]:
smultixcan_gene_associations.shape

(22515, 4091)

In [26]:
assert genes_info.shape[0] == smultixcan_gene_associations.shape[0]

In [27]:
genes_info.head()

Unnamed: 0_level_0,gene,gene_name,gene_type,band
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,20q13.13
ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,1q24.2
ENSG00000000460,ENSG00000000460.16,C1orf112,protein_coding,1q24.2
ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,1p35.3
ENSG00000000971,ENSG00000000971.15,CFH,protein_coding,1q31.3


## Save

In [28]:
genes_info.to_csv(os.path.join(conf.DELIVERABLES_DIR, 'genes_info.tsv.gz'), sep='\t')