In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
import metadata
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

# Gene mappings

In [3]:
metadata.GENES_MAPPINGS.head()

Unnamed: 0,gene,gene_name,gene_type,gene_id,band
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457,1q24.2
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460,1q24.2
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938,1p35.3
3,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971,1q31.3
4,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036,6q24.2


In [4]:
# Lookup table: gene_name -> value of the 'gene' column (the versioned ID shown above).
gene_name_to_id_long = metadata.GENES_MAPPINGS.set_index('gene_name')['gene'].to_dict()

In [5]:
gene_name_to_id_long['A2M']

'ENSG00000175899.14'

In [6]:
# Reverse lookup table: 'gene' column value -> gene_name.
gene_id_long_to_name = metadata.GENES_MAPPINGS.set_index('gene')['gene_name'].to_dict()

In [7]:
gene_id_long_to_name['ENSG00000175899.14']

'A2M'

In [8]:
# Lookup table: 'gene' column value -> cytogenetic band string from the 'band' column.
gene_id_long_to_band = metadata.GENES_MAPPINGS.set_index('gene')['band'].to_dict()

In [9]:
gene_id_long_to_band['ENSG00000175899.14']

'12p13.31'

# Load UKB x ClinVar z2

In [10]:
ukb_clinvar_z2_filename = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan_and_clinvar-z2.pkl.xz')
display(ukb_clinvar_z2_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan_and_clinvar-z2.pkl.xz'

In [11]:
# NOTE: read_pickle unpickles arbitrary Python objects; only load files from a trusted source.
ukb_clinvar_z2 = pd.read_pickle(ukb_clinvar_z2_filename)

In [12]:
ukb_clinvar_z2.shape

(4091, 5106)

In [13]:
ukb_clinvar_z2.head()

Unnamed: 0_level_0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,Retinitis pigmentosa 58,"Myopia 21, autosomal dominant",Paget disease of bone 6,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
phenomexcan_traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4270-Volume_level_set_by_participant_left,0.512739,0.342467,1.070237,1.800286,1.631911,0.698835,0.074415,0.074415,1.38765,1.38765,...,1.017634,1.224636,0.009039,0.385532,3.521638,0.223988,0.001329,0.425832,0.011,0.011
S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,0.217049,0.001955,0.631155,1.087293,0.007369,2.805816,0.075216,0.075216,0.093814,0.093814,...,0.501215,1.852592,1.481549,0.04681,0.619792,0.164473,2.656553,3.644131,0.321142,0.321142
20003_1141157402-Treatmentmedication_code_prednisolone_product,0.343953,0.676443,0.034698,0.203671,0.004971,1.390301,1.228719,1.228719,2.967204,2.967204,...,0.360941,11.078803,0.831347,0.92459,0.223633,1.503475,1.331688,2.692623,0.000386,0.000386
20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,0.444715,0.231927,0.288159,3.099246,0.446979,1.205976,0.114788,0.114788,0.686694,0.686694,...,1.772919,1.062408,0.254789,1.11964,1.354223,0.02573,0.680319,1.682058,0.579198,0.579198
110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,1.229807,2.597971,6.239911,1.639243,1.395789,0.051032,1.20218,1.20218,3.93413,3.93413,...,2.650158,0.972351,0.661263,0.021415,0.685834,1.089133,2.498117,2.86172,0.095069,0.095069


# Load S-MultiXcan gene associations

In [14]:
# Path of the pickled S-MultiXcan p-values table. The file name is a plain
# literal: it has no placeholders, so no f-string prefix is needed.
smultixcan_pvalues_file = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-pvalues.pkl.xz')
display(smultixcan_pvalues_file)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.pkl.xz'

In [15]:
# NOTE: read_pickle unpickles arbitrary Python objects; only load files from a trusted source.
smultixcan_gene_associations = pd.read_pickle(smultixcan_pvalues_file)

In [16]:
smultixcan_gene_associations.shape

(22515, 4091)

In [17]:
smultixcan_gene_associations.head(5)

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.952828,0.827955,0.976745,0.916643,0.580172,0.270858,0.779171,0.597882,0.531657,0.534696,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.646365,0.299606,0.477472,0.00357,0.905365,0.845711,0.13743,0.868056,0.028947,0.392273,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.896074,0.094218,0.507646,0.166442,0.974545,0.62891,0.425188,0.551723,0.059456,0.295833,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.576924,0.107121,0.439276,0.567582,0.768334,0.203873,0.231449,0.106706,0.595655,0.953718,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.95639,0.492012,0.510924,0.532389,0.555313,0.993563,0.807439,0.948366,0.774694,0.490962,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


# Load ClinVar gene associations

In [26]:
clinvar_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, 'clinvar-gene_associations.pkl.xz')
display(clinvar_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/clinvar-gene_associations.pkl.xz'

In [27]:
# NOTE: read_pickle unpickles arbitrary Python objects; only load files from a trusted source.
clinvar_genes_associations = pd.read_pickle(clinvar_genes_associations_filename)

In [28]:
clinvar_genes_associations.shape

(4194, 5586)

In [29]:
clinvar_genes_associations.head(5)

Unnamed: 0_level_0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,"Myopia 21, autosomal dominant",Paget disease of bone 6,ZNF711-Related X-linked Mental Retardation,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A2M,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A2ML1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A4GALT,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAAS,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AAGAB,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Compute unique list of genes and non-empty diseases

In [30]:
# Translate the ClinVar row labels (gene names) to gene IDs; names that are
# absent from GENE_NAME_TO_ID_MAP are skipped.
clinvar_genes_to_id = pd.Index(
    [
        metadata.GENE_NAME_TO_ID_MAP[gene_name]
        for gene_name in clinvar_genes_associations.index
        if gene_name in metadata.GENE_NAME_TO_ID_MAP
    ]
)
display(len(clinvar_genes_to_id))

3790

In [31]:
shared_gene_ids = clinvar_genes_to_id.intersection(smultixcan_gene_associations.index)
display(len(shared_gene_ids))

3789

In [32]:
shared_gene_names = pd.Index([metadata.GENE_ID_TO_NAME_MAP[x] for x in shared_gene_ids])
display(len(shared_gene_names))

3789

In [33]:
assert len(shared_gene_ids) == len(shared_gene_names)

In [34]:
smultixcan_gene_associations = smultixcan_gene_associations.loc[shared_gene_ids]
clinvar_genes_associations = clinvar_genes_associations.loc[shared_gene_names]

In [35]:
# Column sums count the associated genes per disease; a sum of zero marks a
# disease with no gene left in the table.
_tmp = clinvar_genes_associations.sum()
_clinvar_diseases_to_remove = _tmp.loc[_tmp.eq(0)].index
display(_clinvar_diseases_to_remove.shape)

(480,)

In [36]:
# Drop the diseases left with no associated genes once only the shared genes are kept
clinvar_genes_associations = clinvar_genes_associations.drop(columns=_clinvar_diseases_to_remove)

In [37]:
# After the drop, every remaining disease column must sum to something other than zero.
_tmp = clinvar_genes_associations.sum()
assert not (_tmp == 0).any()

In [38]:
display(smultixcan_gene_associations.shape)
display(clinvar_genes_associations.shape)

(3789, 4091)

(3789, 5106)

# ClinVar: rename genes

In [39]:
# Relabel the rows through gene_name_to_id_long.
# NOTE(review): rename() leaves labels that are missing from the mapping
# unchanged, so an unmapped gene name would pass through this cell silently.
clinvar_genes_associations = clinvar_genes_associations.rename(index=gene_name_to_id_long)

In [40]:
clinvar_genes_associations.shape

(3789, 5106)

In [41]:
clinvar_genes_associations.head()

Unnamed: 0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,Retinitis pigmentosa 58,"Myopia 21, autosomal dominant",Paget disease of bone 6,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
ENSG00000175899.14,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000166535.19,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000128274.15,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000094914.12,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000103591.12,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
assert clinvar_genes_associations.index.is_unique
assert clinvar_genes_associations.columns.is_unique

# ClinVar genes per trait

In [43]:
# Flatten the gene x trait table into one row per (trait, gene) pair, keeping
# the table's value for that pair in a single 'associated' column.
clinvar_genes = clinvar_genes_associations.unstack().to_frame('associated')

In [44]:
clinvar_genes.shape

(19346634, 1)

In [45]:
assert clinvar_genes.shape[0] == clinvar_genes_associations.size

In [46]:
clinvar_genes.head()

Unnamed: 0,Unnamed: 1,associated
Alzheimer's disease,ENSG00000175899.14,1
Alzheimer's disease,ENSG00000166535.19,0
Alzheimer's disease,ENSG00000128274.15,0
Alzheimer's disease,ENSG00000094914.12,0
Alzheimer's disease,ENSG00000103591.12,0


In [47]:
# Keep only the (trait, gene) pairs flagged as associated and turn the index
# into regular columns. The index levels are named explicitly instead of
# relying on the 'level_0'/'level_1' column names that reset_index() only
# generates when both levels happen to be unnamed.
clinvar_genes = (
    clinvar_genes
    .rename_axis(['clinvar_trait', 'gene'])
    .loc[lambda pairs: pairs['associated'] == 1]
    .reset_index()
    .drop(columns=['associated'])
)

In [48]:
clinvar_genes.shape

(5921, 2)

In [49]:
clinvar_genes.head()

Unnamed: 0,clinvar_trait,gene
0,Alzheimer's disease,ENSG00000175899.14
1,Alzheimer's disease,ENSG00000142192.20
2,Alzheimer's disease,ENSG00000010704.18
3,Alzheimer's disease,ENSG00000005381.7
4,Alzheimer's disease,ENSG00000164867.10


In [50]:
# Add the gene name for every row; an ID missing from the lookup raises KeyError.
clinvar_genes = clinvar_genes.assign(
    gene_name=clinvar_genes['gene'].map(lambda gene_id: gene_id_long_to_name[gene_id])
)

In [51]:
# Add the band for every row; an ID missing from the lookup raises KeyError.
clinvar_genes = clinvar_genes.assign(
    gene_band=clinvar_genes['gene'].map(lambda gene_id: gene_id_long_to_band[gene_id])
)

In [52]:
clinvar_genes.head()

Unnamed: 0,clinvar_trait,gene,gene_name,gene_band
0,Alzheimer's disease,ENSG00000175899.14,A2M,12p13.31
1,Alzheimer's disease,ENSG00000142192.20,APP,21q21.3
2,Alzheimer's disease,ENSG00000010704.18,HFE,6p22.2
3,Alzheimer's disease,ENSG00000005381.7,MPO,17q22
4,Alzheimer's disease,ENSG00000164867.10,NOS3,7q36.1


In [53]:
# No row may appear more than once.
assert not clinvar_genes.duplicated().any()

In [54]:
def _format_gene(x):
    return pd.Series([
        ', '.join(x.gene.values),
        ', '.join([f'{g} ({b})' for g, b in zip(x.gene_name.values, x.gene_band.values)]),
    ], index=['gene_ids', 'gene_names'])

In [55]:
# One row per ClinVar trait, with that trait's genes collapsed into strings by _format_gene.
clinvar_genes_grp = clinvar_genes.groupby('clinvar_trait').apply(_format_gene)

In [56]:
clinvar_genes_grp.shape

(5106, 2)

In [57]:
clinvar_genes_grp.head()

Unnamed: 0_level_0,gene_ids,gene_names
clinvar_trait,Unnamed: 1_level_1,Unnamed: 2_level_1
15q13.3 microdeletion syndrome,"ENSG00000175344.17, ENSG00000169926.10","CHRNA7 (15q13.3), KLF13 (15q13.3)"
1q21.1 recurrent microdeletion,"ENSG00000265107.2, ENSG00000121634.5","GJA5 (1q21.2), GJA8 (1q21.2)"
"2,4-Dienoyl-CoA reductase deficiency",ENSG00000152620.12,NADK2 (5p13.2)
2-aminoadipic 2-oxoadipic aciduria,ENSG00000181192.11,DHTKD1 (10p14)
22q13.3 deletion syndrome,ENSG00000251322.7,SHANK3 (22q13.33)


In [58]:
assert clinvar_genes_grp.index.is_unique

# Load phenotype information

In [59]:
os.path.join(conf.DELIVERABLES_DIR, 'phenotypes_info.tsv.gz')

'/mnt/phenomexcan_base/deliverables/phenotypes_info.tsv.gz'

In [60]:
pheno_info = pd.read_csv(os.path.join(conf.DELIVERABLES_DIR, 'phenotypes_info.tsv.gz'), sep='\t')

In [61]:
pheno_info.shape

(4091, 9)

In [62]:
pheno_info.head()

Unnamed: 0,full_code,short_code,description,unique_description,type,n,n_cases,n_controls,source
0,100001_raw-Food_weight,100001_raw,Food weight,Food weight,continuous_raw,51453,,,UK Biobank
1,100002_raw-Energy,100002_raw,Energy,Energy,continuous_raw,51453,,,UK Biobank
2,100003_raw-Protein,100003_raw,Protein,Protein,continuous_raw,51453,,,UK Biobank
3,100004_raw-Fat,100004_raw,Fat,Fat,continuous_raw,51453,,,UK Biobank
4,100005_raw-Carbohydrate,100005_raw,Carbohydrate,Carbohydrate,continuous_raw,51453,,,UK Biobank


In [63]:
# Lookup table: full_code -> unique_description.
pheno_full_id_to_uniq_desc = pheno_info.set_index('full_code')['unique_description'].to_dict()

In [64]:
pheno_full_id_to_uniq_desc['100001_raw-Food_weight']

'Food weight'

### Obtain z2

In [65]:
# Relabel the rows with the unique descriptions, then flatten the table so
# each row/column pair becomes one row holding its value in 'z2_avg'.
clinvar_unstacked = (
    ukb_clinvar_z2
    .rename(index=pheno_full_id_to_uniq_desc)
    .T
    .unstack()
    .to_frame('z2_avg')
)

In [66]:
assert clinvar_unstacked.index.is_unique

In [67]:
assert ukb_clinvar_z2.size == clinvar_unstacked.shape[0]

In [68]:
# Name the two index levels. The frame is rebound to a renamed copy rather
# than mutating its index in place, so the cell does not change an object
# that earlier cells already produced.
clinvar_unstacked = clinvar_unstacked.rename_axis(['ukb_trait', 'clinvar_trait'])

In [69]:
# Add the square root of z2_avg as a separate column.
clinvar_unstacked = clinvar_unstacked.assign(
    sqrt_z2_avg=lambda frame: np.sqrt(frame['z2_avg'])
)

In [70]:
assert clinvar_unstacked.index.is_unique

In [71]:
clinvar_unstacked.shape

(20888646, 2)

In [72]:
clinvar_unstacked.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,z2_avg,sqrt_z2_avg
ukb_trait,clinvar_trait,Unnamed: 2_level_1,Unnamed: 3_level_1
Volume level set by participant (left),Alzheimer's disease,0.512739,0.716058
Volume level set by participant (left),Alpha-2-macroglobulin deficiency,0.342467,0.585207
Volume level set by participant (left),"Otitis media, susceptibility to",1.070237,1.034523
Volume level set by participant (left),p phenotype,1.800286,1.341748
Volume level set by participant (left),Glucocorticoid deficiency with achalasia,1.631911,1.277463


### Complete version

In [73]:
# Attach the per-trait gene strings to every (ukb_trait, clinvar_trait) row.
clinvar_unstacked_complete = clinvar_unstacked.reset_index().merge(
    clinvar_genes_grp,
    on='clinvar_trait',
    how='inner',
)

In [74]:
assert clinvar_unstacked_complete.shape[0] == clinvar_unstacked.shape[0]

In [75]:
clinvar_unstacked_complete.shape

(20888646, 6)

In [76]:
clinvar_unstacked_complete.head()

Unnamed: 0,ukb_trait,clinvar_trait,z2_avg,sqrt_z2_avg,gene_ids,gene_names
0,Volume level set by participant (left),Alzheimer's disease,0.512739,0.716058,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
1,Diagnoses - main ICD10: S05 Injury of eye and ...,Alzheimer's disease,0.217049,0.465886,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
2,Treatment/medication code: prednisolone produc...,Alzheimer's disease,0.343953,0.586475,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
3,"Non-cancer illness code, self-reported: polycy...",Alzheimer's disease,0.444715,0.666869,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
4,Invitation to complete online 24-hour recall d...,Alzheimer's disease,1.229807,1.108967,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."


In [77]:
clinvar_unstacked_complete = clinvar_unstacked_complete.set_index(['ukb_trait', 'clinvar_trait'])

In [78]:
assert clinvar_unstacked_complete.index.is_unique

In [79]:
clinvar_unstacked_complete.shape

(20888646, 4)

In [80]:
clinvar_unstacked_complete.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,z2_avg,sqrt_z2_avg,gene_ids,gene_names
ukb_trait,clinvar_trait,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Volume level set by participant (left),Alzheimer's disease,0.512739,0.716058,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
Diagnoses - main ICD10: S05 Injury of eye and orbit,Alzheimer's disease,0.217049,0.465886,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
Treatment/medication code: prednisolone product (20003_1141157402),Alzheimer's disease,0.343953,0.586475,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
"Non-cancer illness code, self-reported: polycystic kidney",Alzheimer's disease,0.444715,0.666869,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."
"Invitation to complete online 24-hour recall dietary questionnaire, acceptance",Alzheimer's disease,1.229807,1.108967,"ENSG00000175899.14, ENSG00000142192.20, ENSG00...","A2M (12p13.31), APP (21q21.3), HFE (6p22.2), M..."


In [81]:
clinvar_unstacked_complete = clinvar_unstacked_complete.sort_index()

In [82]:
# MultiIndex.is_lexsorted() is deprecated since pandas 1.3 and removed in 2.0;
# is_monotonic_increasing verifies that the index is in sorted order after
# the sort_index() call above.
assert clinvar_unstacked_complete.index.is_monotonic_increasing

In [83]:
clinvar_unstacked_complete.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,z2_avg,sqrt_z2_avg,gene_ids,gene_names
ukb_trait,clinvar_trait,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#Arthrosis,15q13.3 microdeletion syndrome,0.041067,0.20265,"ENSG00000175344.17, ENSG00000169926.10","CHRNA7 (15q13.3), KLF13 (15q13.3)"
#Arthrosis,1q21.1 recurrent microdeletion,0.290045,0.538558,"ENSG00000265107.2, ENSG00000121634.5","GJA5 (1q21.2), GJA8 (1q21.2)"
#Arthrosis,"2,4-Dienoyl-CoA reductase deficiency",2.176716,1.47537,ENSG00000152620.12,NADK2 (5p13.2)
#Arthrosis,2-aminoadipic 2-oxoadipic aciduria,3.551778,1.884616,ENSG00000181192.11,DHTKD1 (10p14)
#Arthrosis,22q13.3 deletion syndrome,0.072909,0.270016,ENSG00000251322.7,SHANK3 (22q13.33)


# Sanity checks on known trait pairs

In [84]:
clinvar_unstacked_complete.loc['Depression possibly related to childbirth', "Keratosis palmoplantaris papulosa"]

z2_avg                  0.0809037
sqrt_z2_avg              0.284436
gene_ids       ENSG00000103591.12
gene_names          AAGAB (15q23)
Name: (Depression possibly related to childbirth, Keratosis palmoplantaris papulosa), dtype: object

In [85]:
# Spot check of a known value. The lookup addresses the row key and the column
# in a single .loc call, and the float is compared with a tight relative
# tolerance instead of exact equality.
assert np.isclose(
    clinvar_unstacked_complete.loc[('Other epidermal thickening', "Alzheimer's disease"), 'z2_avg'],
    1.3472206237157518,
    rtol=1e-9,
    atol=0,
)

In [86]:
# Spot check that sqrt_z2_avg is the square root of the known z2 value. The
# lookup uses one .loc call with (row key, column), and the float is compared
# with a tight relative tolerance instead of exact equality.
assert np.isclose(
    clinvar_unstacked_complete.loc[('Other epidermal thickening', "Alzheimer's disease"), 'sqrt_z2_avg'],
    np.sqrt(1.3472206237157518),
    rtol=1e-9,
    atol=0,
)

In [87]:
assert clinvar_unstacked_complete.loc['Depression possibly related to childbirth', "Keratosis palmoplantaris papulosa"]['z2_avg'] == 0.08090372843310314

In [88]:
clinvar_unstacked_complete[clinvar_unstacked_complete['z2_avg'] > 35].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,z2_avg,sqrt_z2_avg,gene_ids,gene_names
ukb_trait,clinvar_trait,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
#Other joint disorders,Acromesomelic dysplasia Hunter Thompson type,75.073496,8.664496,ENSG00000125965.8,GDF5 (20q11.22)
#Other joint disorders,"Brachydactyly, type a1, c",75.073496,8.664496,ENSG00000125965.8,GDF5 (20q11.22)
#Other joint disorders,Fibular hypoplasia and complex brachydactyly,75.073496,8.664496,ENSG00000125965.8,GDF5 (20q11.22)
#Other joint disorders,Grebe syndrome,75.073496,8.664496,ENSG00000125965.8,GDF5 (20q11.22)
#Other joint disorders,Multiple synostoses syndrome 2,75.073496,8.664496,ENSG00000125965.8,GDF5 (20q11.22)


### Select columns

In [89]:
# Drop the columns that are not exported. This rebinds the same name, so the
# cells above that read 'z2_avg' or 'gene_ids' no longer work once this has run.
clinvar_unstacked_complete = clinvar_unstacked_complete.drop(columns=['z2_avg', 'gene_ids'])

In [90]:
clinvar_unstacked_complete.shape

(20888646, 2)

In [91]:
clinvar_unstacked_complete.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sqrt_z2_avg,gene_names
ukb_trait,clinvar_trait,Unnamed: 2_level_1,Unnamed: 3_level_1
#Arthrosis,15q13.3 microdeletion syndrome,0.20265,"CHRNA7 (15q13.3), KLF13 (15q13.3)"
#Arthrosis,1q21.1 recurrent microdeletion,0.538558,"GJA5 (1q21.2), GJA8 (1q21.2)"
#Arthrosis,"2,4-Dienoyl-CoA reductase deficiency",1.47537,NADK2 (5p13.2)
#Arthrosis,2-aminoadipic 2-oxoadipic aciduria,1.884616,DHTKD1 (10p14)
#Arthrosis,22q13.3 deletion syndrome,0.270016,SHANK3 (22q13.33)


### Maximum string length per field

In [92]:
# Longest 'ukb_trait' label, in characters.
clinvar_unstacked_complete.index.get_level_values('ukb_trait').map(len).max()

223

In [93]:
# Longest 'clinvar_trait' label, in characters.
clinvar_unstacked_complete.index.get_level_values('clinvar_trait').map(len).max()

145

In [94]:
# Longest 'gene_names' string, in characters.
clinvar_unstacked_complete['gene_names'].map(len).max()

448

#### Save

In [95]:
clinvar_unstacked_complete_filename = os.path.join(conf.GENE_ASSOC_DIR, 'ukb_clinvar.tsv')
display(clinvar_unstacked_complete_filename)

'/mnt/phenomexcan_base/gene_assoc/ukb_clinvar.tsv'

In [96]:
# Write the table as tab-separated text; the (ukb_trait, clinvar_trait) index
# is written out as the leading columns.
clinvar_unstacked_complete.to_csv(clinvar_unstacked_complete_filename, sep='\t')