In [1]:
# Reload edited local modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import leaves_list, linkage
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns

import settings as conf
from utils import is_number, chunker
import metadata

# Load PhenomeXcan vs ClinVar associations

In [3]:
# Location of the pickled PhenomeXcan-vs-ClinVar matrix inside the configured
# gene-association directory.
phenomexcan_vs_clinvar_filename = os.path.join(
    conf.GENE_ASSOC_DIR,
    'smultixcan_and_clinvar-z2.pkl.xz',
)
display(phenomexcan_vs_clinvar_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan_and_clinvar-z2.pkl.xz'

In [4]:
# Load the matrix into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only read this file from a trusted location.
phenomexcan_vs_clinvar = pd.read_pickle(phenomexcan_vs_clinvar_filename)

In [5]:
# Sanity check: dimensions first, then the first few rows.
display(
    phenomexcan_vs_clinvar.shape,
    phenomexcan_vs_clinvar.head(),
)

(4091, 5106)

Unnamed: 0_level_0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,Retinitis pigmentosa 58,"Myopia 21, autosomal dominant",Paget disease of bone 6,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
phenomexcan_traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20096_1-Size_of_red_wine_glass_drunk_small_125ml,0.651161,0.247607,3.820126,1.075217,0.009677,0.162067,1.670074,1.670074,0.273777,0.273777,...,0.292204,1.695379,0.355525,1.101905,0.37941,1.747519,0.27292,0.006041,0.194577,0.194577
2345-Ever_had_bowel_cancer_screening,0.544953,0.055165,0.373364,0.667099,1.731603,5.829613,0.000311,0.000311,0.872317,0.872317,...,0.871863,0.772939,0.120311,0.015428,0.332303,1.911665,2.069227,0.242165,0.392449,0.392449
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,2.352883,3.096874,0.208791,1.486795,0.166468,0.268138,2.304044,2.304044,0.007125,0.007125,...,0.02007,0.137331,0.744516,0.810886,0.222463,5.448538,0.087805,0.725028,0.155316,0.155316
100011_raw-Iron,2.905654,0.049563,11.132239,0.978171,0.4289,0.39541,0.199536,0.199536,0.638303,0.638303,...,0.000193,0.32735,0.436058,0.096868,0.173245,3.459105,0.144557,0.009105,0.015222,0.015222
5221-Index_of_best_refractometry_result_right,0.836999,0.071389,0.229481,0.340287,0.546314,2.319034,4.984327,4.984327,0.177313,0.177313,...,2.239274,1.115672,0.141618,0.01261,1.253595,0.031501,0.24487,0.001039,0.028283,0.028283


# Filter out traits with low z-scores

In [None]:
# Density of the square-rooted matrix values, pooled over every cell.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases; consider
# `sns.histplot` / `sns.kdeplot` once the environment's seaborn version allows it.
flat_values = pd.Series(np.sqrt(phenomexcan_vs_clinvar.values.flatten()))

plt.figure(figsize=(10, 8))
with sns.plotting_context(context='talk'):
    sns.distplot(flat_values)

In [9]:
# Summary statistics rendered as fixed three-decimal strings (avoids scientific notation).
flat_values.describe().map(lambda stat: f'{stat:.3f}')

count    20888646.000
mean            0.873
std             0.754
min             0.000
25%             0.351
50%             0.730
75%             1.219
max            40.000
dtype: object

In [10]:
# How many pooled values lie strictly above the 99.95th percentile.
upper_tail_cutoff = flat_values.quantile(0.9995)
flat_values[flat_values > upper_tail_cutoff].shape

(10445,)

In [11]:
# Cutoff applied to the square-rooted matrix: a row or column is discarded
# when its maximum value is below this threshold.
Z_THRESHOLD = 4.00
display(Z_THRESHOLD)

4.0

In [13]:
# Element-wise square root of the loaded matrix; row and column labels are preserved.
# NOTE(review): the '-z2' suffix in the input filename suggests the stored values are
# squared z-scores, which would make this the z-score scale — confirm.
phenomexcan_vs_clinvar_sqrt = np.sqrt(phenomexcan_vs_clinvar)
display(
    phenomexcan_vs_clinvar_sqrt.shape,
    phenomexcan_vs_clinvar_sqrt.head(),
)

(4091, 5106)

Unnamed: 0_level_0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,Retinitis pigmentosa 58,"Myopia 21, autosomal dominant",Paget disease of bone 6,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 6,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
phenomexcan_traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20096_1-Size_of_red_wine_glass_drunk_small_125ml,0.806946,0.497601,1.954514,1.036927,0.098373,0.402575,1.292313,1.292313,0.523237,0.523237,...,0.540559,1.302067,0.596259,1.049716,0.615962,1.321938,0.522417,0.077722,0.441108,0.441108
2345-Ever_had_bowel_cancer_screening,0.738209,0.234871,0.611035,0.816761,1.315904,2.414459,0.017647,0.017647,0.933979,0.933979,...,0.933736,0.87917,0.346859,0.124208,0.576457,1.38263,1.438481,0.492102,0.626457,0.626457
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,1.533911,1.759794,0.456937,1.219342,0.408005,0.51782,1.517908,1.517908,0.084412,0.084412,...,0.14167,0.370583,0.862854,0.900492,0.471659,2.33421,0.296318,0.851486,0.394101,0.394101
100011_raw-Iron,1.704598,0.222627,3.336501,0.989025,0.654905,0.628817,0.446695,0.446695,0.798939,0.798939,...,0.01388,0.572145,0.660347,0.311236,0.416227,1.859867,0.380206,0.09542,0.123379,0.123379
5221-Index_of_best_refractometry_result_right,0.914876,0.267187,0.479041,0.583342,0.739131,1.522838,2.232561,2.232561,0.421086,0.421086,...,1.49642,1.056254,0.376321,0.112292,1.119641,0.177485,0.494844,0.032239,0.168177,0.168177


In [14]:
# Collect the columns and rows whose largest square-rooted value never reaches
# Z_THRESHOLD; they are dropped in the next step.
display(f'Z_THRESHOLD: {Z_THRESHOLD}')

# Vectorized per-axis maxima replace the per-column loop and `iterrows()`, which
# built a Series object for each of the thousands of rows. NaNs are skipped by
# `max`, and an all-NaN row/column compares False against the threshold, so it
# is kept -- the same outcome as the explicit loops.
col_max = phenomexcan_vs_clinvar_sqrt.max(axis=0)
cols_to_remove = col_max[col_max < Z_THRESHOLD].index.tolist()

row_max = phenomexcan_vs_clinvar_sqrt.max(axis=1)
rows_to_remove = row_max[row_max < Z_THRESHOLD].index.tolist()

display((len(rows_to_remove), len(cols_to_remove)))

'Z_THRESHOLD: 4.0'

(2142, 695)

In [15]:
# Peek at the first few row labels flagged for removal.
rows_to_remove[:5]

['20096_1-Size_of_red_wine_glass_drunk_small_125ml',
 '2345-Ever_had_bowel_cancer_screening',
 '100011_raw-Iron',
 '5221-Index_of_best_refractometry_result_right',
 '20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet']

In [16]:
# Remove the flagged rows and columns in a single pass; the unfiltered frame is left untouched.
phenomexcan_vs_clinvar_sqrt_selected = phenomexcan_vs_clinvar_sqrt.drop(
    index=rows_to_remove,
    columns=cols_to_remove,
)

In [17]:
# Dimensions and first rows of the filtered matrix.
for selected_summary in (
    phenomexcan_vs_clinvar_sqrt_selected.shape,
    phenomexcan_vs_clinvar_sqrt_selected.head(),
):
    display(selected_summary)

(1949, 4411)

Unnamed: 0_level_0,Alzheimer's disease,Alpha-2-macroglobulin deficiency,"Otitis media, susceptibility to",p phenotype,Glucocorticoid deficiency with achalasia,Keratosis palmoplantaris papulosa,"Charcot-Marie-Tooth disease, type 2N","Epileptic encephalopathy, early infantile, 29",Combined oxidative phosphorylation deficiency 8,"Leukoencephalopathy, progressive, with ovarian failure",...,"Corneal fragility keratoglobus, blue sclerae AND joint hypermobility",Retinitis pigmentosa 58,"Myopia 21, autosomal dominant",Paget disease of bone 6,Seborrhea-like dermatitis with psoriasiform elements,PEHO syndrome,Oocyte maturation defect 1,OOCYTE MATURATION DEFECT 3,Acromelic frontonasal dysostosis,"NEURODEVELOPMENTAL DISORDER WITH MOVEMENT ABNORMALITIES, ABNORMAL GAIT, AND AUTISTIC FEATURES"
phenomexcan_traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,1.533911,1.759794,0.456937,1.219342,0.408005,0.51782,1.517908,1.517908,0.084412,0.084412,...,0.788323,0.14167,0.370583,0.862854,0.900492,0.471659,2.33421,0.851486,0.394101,0.394101
20024_1136-Job_code_deduced_Information_and_communication_technology_managers,1.16717,1.781442,0.200259,0.496048,0.734752,0.346126,0.572916,0.572916,0.983466,0.983466,...,1.374289,1.551001,2.672544,1.521476,0.382859,0.7421,1.269273,0.007474,0.820057,0.820057
806-Job_involves_mainly_walking_or_standing,1.064439,0.872511,0.491002,0.744056,0.085302,0.238237,1.529063,1.529063,0.755943,0.755943,...,0.24998,0.834041,0.128594,2.06902,1.276395,3.243742,1.631854,1.194704,1.938288,1.938288
1060-Time_spent_outdoors_in_winter,2.225252,2.175974,1.074131,0.587395,0.430628,1.347906,0.305246,0.305246,0.911463,0.911463,...,0.696363,0.682483,0.605986,2.478585,1.745194,3.214646,0.743638,1.020932,0.778805,0.778805
20002_1373-Noncancer_illness_code_selfreported_connective_tissue_disorder,0.984052,0.483619,0.292449,1.596548,0.483717,0.605057,0.529684,0.529684,1.684554,1.684554,...,1.357636,0.458778,1.094217,0.406669,0.30606,0.489312,0.18634,0.185514,0.524153,0.524153


# Hierarchical clustering on rows and columns

In [18]:
# Average-linkage hierarchical clustering, computed once for the rows and once
# for the columns (via the transposed matrix).
z_rows = linkage(phenomexcan_vs_clinvar_sqrt_selected, method='average')
z_cols = linkage(phenomexcan_vs_clinvar_sqrt_selected.transpose(), method='average')

In [23]:
# Derive the figure size from the matrix shape: scale so the longest side is 200,
# then flip (rows, cols) into (width, height) order.
t = np.array(phenomexcan_vs_clinvar_sqrt_selected.shape)
display(f'Data size: {t}')

scaled_shape = (t / max(t)) * 200
fig_size = list(scaled_shape[::-1])
display(fig_size)

'Data size: [1949 4411]'

[200.0, 88.36998413058264]

In [None]:
# Heatmap ordered by the precomputed linkages, without tick labels.
# Despite its name, `ax` holds the grid object returned by `sns.clustermap`,
# not a matplotlib Axes.
ax = sns.clustermap(
    data=phenomexcan_vs_clinvar_sqrt_selected,
    row_linkage=z_rows,
    col_linkage=z_cols,
    figsize=fig_size,
    xticklabels=False,
    yticklabels=False,
)

# Hide both dendrogram panels; only the heatmap itself is shown.
for dendrogram_axes in (ax.ax_row_dendrogram, ax.ax_col_dendrogram):
    dendrogram_axes.set_visible(False)

In [31]:
# Reorder rows and columns to follow the dendrogram leaf order of the
# precomputed linkages. The order is taken directly from `z_rows` / `z_cols`
# instead of from the plotted grid object, so the saved matrix no longer depends
# on the (expensive) plotting cell having been executed in this kernel session.
# `leaves_list` returns the leaves in left-to-right tree order, which is the
# ordering the clustermap uses when given these same linkages.
row_order = leaves_list(z_rows)
col_order = leaves_list(z_cols)

clustermap_matrix = phenomexcan_vs_clinvar_sqrt_selected.iloc[row_order, col_order]
display(clustermap_matrix.shape)

(1949, 4411)

In [32]:
# First rows of the reordered matrix.
clustermap_matrix.head()

Unnamed: 0_level_0,"Symphalangism, proximal, 1b","Brachydactyly, type a1, c",Acromesomelic dysplasia Hunter Thompson type,Type C brachydactyly,Fibular hypoplasia and complex brachydactyly,Multiple synostoses syndrome 2,Osteoarthritis of hip,Grebe syndrome,"Cortical dysplasia, complex, with other brain malformations 6",Michelin-tire baby,...,Townes-Brocks syndrome 1,Parkinson disease 11,Snowflake vitreoretinal degeneration,Leber congenital amaurosis 16,"Cerebellar ataxia, nonprogressive, with mental retardation",Pulmonary veno-occlusive disease,Primary pulmonary hypertension,Pilomatrixoma,"Mental retardation, autosomal dominant 19",EXUDATIVE VITREORETINOPATHY 7
phenomexcan_traits,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30100_raw-Mean_platelet_thrombocyte_volume,4.731462,4.731462,4.731462,4.731462,4.731462,4.731462,4.731462,4.731462,3.900522,3.900522,...,1.406331,0.878565,1.273427,1.273427,8.322121,3.159152,3.159152,7.309667,0.078364,0.078364
30050_raw-Mean_corpuscular_haemoglobin,2.902513,2.902513,2.902513,2.902513,2.902513,2.902513,2.902513,2.902513,4.133993,4.133993,...,1.532716,1.549973,1.784803,1.784803,1.613504,4.122258,4.122258,0.61192,0.584763,0.584763
30040_raw-Mean_corpuscular_volume,2.058844,2.058844,2.058844,2.058844,2.058844,2.058844,2.058844,2.058844,0.761882,0.761882,...,0.449345,1.955158,2.001667,2.001667,1.681089,3.144189,3.144189,1.207961,1.111495,1.111495
30270_raw-Mean_sphered_cell_volume,1.449294,1.449294,1.449294,1.449294,1.449294,1.449294,1.449294,1.449294,6.720109,6.720109,...,0.529857,1.723171,1.751668,1.751668,1.087806,3.543794,3.543794,1.754809,1.626424,1.626424
30260_raw-Mean_reticulocyte_volume,2.053437,2.053437,2.053437,2.053437,2.053437,2.053437,2.053437,2.053437,11.985074,11.985074,...,0.024956,1.568462,1.187347,1.187347,2.218463,3.608027,3.608027,1.124968,1.251059,1.251059


# Save

In [33]:
# Output path for internal use (pickle), under the configured results directory.
clustermap_matrix_filename = os.path.join(
    conf.RESULTS_DIR,
    'phenomexcan_vs_clinvar-clustermap.pkl.xz',
)
display(clustermap_matrix_filename)

'/mnt/phenomexcan_base/results/phenomexcan_vs_clinvar-clustermap.pkl.xz'

In [34]:
# Write the reordered matrix as a pickle.
# NOTE(review): the '.xz' suffix presumably triggers pandas' extension-based compression — confirm.
clustermap_matrix.to_pickle(clustermap_matrix_filename)

In [35]:
# for publication (tab-separated values, '.tsv.gz')
# NOTE: this reuses `clustermap_matrix_filename`, overwriting the pickle path assigned earlier.
clustermap_matrix_filename = os.path.join(conf.RESULTS_DIR, 'phenomexcan_vs_clinvar-clustermap.tsv.gz')
display(clustermap_matrix_filename)

'/mnt/phenomexcan_base/results/phenomexcan_vs_clinvar-clustermap.tsv.gz'

In [36]:
# Write the reordered matrix as tab-separated text, index included.
# NOTE(review): gzip output relies on pandas inferring compression from the '.gz' suffix —
# confirm this holds for the pandas version in use.
clustermap_matrix.to_csv(clustermap_matrix_filename, sep='\t')