In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

# Load S-MultiXcan results

## From Rapid GWAS project

In [3]:
_path = os.path.join(conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject'], '*.tsv.gz')
display(_path)
all_smultixcan_results_dirs = glob(_path)
display(len(all_smultixcan_results_dirs))
assert len(all_smultixcan_results_dirs) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

'/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/*.tsv.gz'

4049

In [4]:
all_smultixcan_phenotypes = [MXPhenoResults(p) for p in all_smultixcan_results_dirs]
all_smultixcan_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_smultixcan_phenotypes])

display(len(all_smultixcan_phenotypes))
assert len(all_smultixcan_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

4049

## From GTEx GWAS manuscript

In [5]:
_path = os.path.join(conf.SMULTIXCAN_RESULTS_DIR['GTEX_GWAS'], '*_ccn30.txt.gz')
display(_path)
all_extra_results_dirs = glob(_path)
display(len(all_extra_results_dirs))
assert len(all_extra_results_dirs) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

'/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/*_ccn30.txt.gz'

42

In [6]:
_file_pattern = '(?P<code>[^/]+)_smultixcan_imputed_gwas_gtexv8mashr_ccn30\.txt'
all_extra_phenotypes = [MXPhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
assert len(all_extra_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

42

## Run loading

This read all phenotypes results (S-MultiXcan) and saves them all into a Pandas DataFrame

In [7]:
def _get_combined_results(phenos, column):
    return {
        pheno.pheno_info.get_plain_name() :
        pheno.get_data(cols=['gene_name', column], index_col='gene_simple')[column]
        for pheno in phenos
    }

In [8]:
def _run_all(column_name, phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    print(column_name, flush=True)
    
    all_results = {}
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk, column_name) for chunk in phenotype_chunks]
        for future in as_completed(tasks):
            res = future.result()
            all_results.update(res)
    
    return all_results

In [9]:
# phenotype_chunks = chunker(all_smultixcan_phenotypes[:5] + all_extra_phenotypes[:5], 200)
phenotype_chunks = chunker(all_smultixcan_phenotypes + all_extra_phenotypes, 200)

In [10]:
all_results = _run_all('pvalue', phenotype_chunks)

pvalue


## Save as DataFrame

In [11]:
_n_expected_phenos = np.sum(list(conf.SMULTIXCAN_EXPECTED_PHENOTYPES.values()))
display(_n_expected_phenos)
assert len(all_results) == _n_expected_phenos

4091

In [12]:
smultixcan_genes_associations = pd.DataFrame(all_results)
smultixcan_genes_associations.index.rename('gene_name', inplace=True)

assert smultixcan_genes_associations.index.is_unique

display(smultixcan_genes_associations.shape)
display(smultixcan_genes_associations.head())

(22518, 4091)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.865429,0.918314,0.810683,0.374671,0.189032,0.140981,0.467741,0.129427,0.19368,0.285479,...,0.41621,0.782554,0.609467,0.980281,0.666504,0.409761,0.71331,0.168319,0.460244,0.765506
ENSG00000000457,0.174192,0.064765,0.889194,0.896938,0.448596,0.269602,0.540261,0.068405,0.041813,0.313427,...,0.14936,0.512603,0.010907,0.228982,0.607081,0.812484,0.678749,0.918971,0.311187,0.344574
ENSG00000000460,0.879969,0.240715,0.238228,0.567555,0.92132,0.825036,0.78223,0.644525,0.392273,0.840014,...,0.50352,0.764147,0.587969,0.30146,0.629621,0.486664,0.736509,0.9336,0.000477,0.321223
ENSG00000000938,0.19267,0.400054,0.114353,0.4707,0.889202,1.1e-05,0.899764,0.212352,0.829671,0.372348,...,0.899212,0.961678,0.059247,0.588855,0.898525,0.135045,0.954998,0.08822,0.176497,0.304281
ENSG00000000971,0.180632,0.79306,0.490585,0.088752,0.744531,0.949639,0.253817,0.377408,0.971655,0.070266,...,0.390618,0.093824,0.020391,0.109883,0.870551,0.99545,0.00266,0.421588,0.656851,0.868416


In [13]:
# Remove genes with no results
smultixcan_genes_associations = smultixcan_genes_associations.dropna(axis=0, how='all')

In [14]:
# how many entries are nan
smultixcan_genes_associations.isna().sum().sum()

1053055

In [15]:
# each UKB trait has 260 nan entries
260 * 4049

1052740

In [16]:
display(smultixcan_genes_associations.shape)

assert smultixcan_genes_associations.shape == (22515, _n_expected_phenos)

(22515, 4091)

In [17]:
# some testing

# For FinnGen
assert smultixcan_genes_associations.loc['ENSG00000110628', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.005086576789507484
assert smultixcan_genes_associations.loc['ENSG00000169783', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.3757187601354043
assert smultixcan_genes_associations.loc['ENSG00000137959', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 5.132614371931036e-07

# For ICD10
assert smultixcan_genes_associations.loc['ENSG00000135775', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 2.795075036067939e-05
assert smultixcan_genes_associations.loc['ENSG00000169783', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 0.06668736815697908
assert smultixcan_genes_associations.loc['ENSG00000174226', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 0.02496852053808064

# For extra phenotypes
assert smultixcan_genes_associations.loc['ENSG00000135775', 'MAGIC_ln_FastingInsulin'] == 0.08712399858507687
assert smultixcan_genes_associations.loc['ENSG00000169783', 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19'] == 0.10974365378971256
assert smultixcan_genes_associations.loc['ENSG00000158691', 'pgc.scz2'] == 2.698821020217747e-28

### Remove zero pvalues

In [18]:
assert not smultixcan_genes_associations.isin([np.inf, -np.inf]).any().any()

In [19]:
smultixcan_genes_associations.isna().any().any()

True

In [20]:
all_pvals = pd.Series(smultixcan_genes_associations.values.flatten())

In [21]:
all_pvals.describe()

count    9.105581e+07
mean     4.839226e-01
std      2.934763e-01
min      0.000000e+00
25%      2.266266e-01
50%      4.794625e-01
75%      7.378647e-01
max      1.000000e+00
dtype: float64

In [22]:
_tmp = all_pvals.sort_values(ascending=True)
display(_tmp[_tmp > 0].head())

22853508    1.222488e-311
21202699    2.021948e-311
5438061     2.320165e-311
90964567    3.396667e-311
21233450    5.298136e-311
dtype: float64

In [23]:
_tmp = all_pvals.sort_values(ascending=False)
display(_tmp.head())

78657625    1.0
74083894    1.0
72766608    1.0
46494190    1.0
81914081    1.0
dtype: float64

In [24]:
all_pvals = pd.Series(smultixcan_genes_associations.values.flatten()).dropna()

In [25]:
assert (all_pvals >= 0).all()
assert (all_pvals <= 1).all()

In [26]:
# replace 0.0 pvals
smultixcan_genes_associations_zeros_removed = smultixcan_genes_associations.replace(0.0, 1e-320)

In [27]:
all_pvals = pd.Series(smultixcan_genes_associations_zeros_removed.values.flatten()).dropna()

In [28]:
assert (all_pvals > 0).all()
assert (all_pvals <= 1).all()

### Save

In [29]:
smultixcan_genes_associations_zeros_removed.shape

(22515, 4091)

In [30]:
smultixcan_genes_associations_zeros_removed.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.865429,0.918314,0.810683,0.374671,0.189032,0.140981,0.467741,0.129427,0.19368,0.285479,...,0.41621,0.782554,0.609467,0.980281,0.666504,0.409761,0.71331,0.168319,0.460244,0.765506
ENSG00000000457,0.174192,0.064765,0.889194,0.896938,0.448596,0.269602,0.540261,0.068405,0.041813,0.313427,...,0.14936,0.512603,0.010907,0.228982,0.607081,0.812484,0.678749,0.918971,0.311187,0.344574
ENSG00000000460,0.879969,0.240715,0.238228,0.567555,0.92132,0.825036,0.78223,0.644525,0.392273,0.840014,...,0.50352,0.764147,0.587969,0.30146,0.629621,0.486664,0.736509,0.9336,0.000477,0.321223
ENSG00000000938,0.19267,0.400054,0.114353,0.4707,0.889202,1.1e-05,0.899764,0.212352,0.829671,0.372348,...,0.899212,0.961678,0.059247,0.588855,0.898525,0.135045,0.954998,0.08822,0.176497,0.304281
ENSG00000000971,0.180632,0.79306,0.490585,0.088752,0.744531,0.949639,0.253817,0.377408,0.971655,0.070266,...,0.390618,0.093824,0.020391,0.109883,0.870551,0.99545,0.00266,0.421588,0.656851,0.868416


In [31]:
# Save
smultixcan_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, f'smultixcan-mashr-pvalues.pkl.xz')
display(smultixcan_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.pkl.xz'

In [32]:
smultixcan_genes_associations_zeros_removed.to_pickle(smultixcan_genes_associations_filename)

### Save for publication

In [33]:
# for publication
output_file = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-pvalues.tsv.gz')
display(output_file)

smultixcan_genes_associations_zeros_removed.to_csv(output_file, sep='\t', float_format='%.4e')

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.tsv.gz'

In [34]:
# test "for publication" file
_tmp = pd.read_csv(output_file, sep='\t', index_col='gene_name')

In [35]:
display(_tmp.shape)
assert _tmp.shape == smultixcan_genes_associations_zeros_removed.shape

(22515, 4091)

In [36]:
_tmp.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.86543,0.91831,0.81068,0.37467,0.18903,0.14098,0.46774,0.12943,0.19368,0.28548,...,0.41621,0.78255,0.60947,0.98028,0.6665,0.40976,0.71331,0.16832,0.46024,0.76551
ENSG00000000457,0.17419,0.064765,0.88919,0.89694,0.4486,0.2696,0.54026,0.068405,0.041813,0.31343,...,0.14936,0.5126,0.010907,0.22898,0.60708,0.81248,0.67875,0.91897,0.31119,0.34457
ENSG00000000460,0.87997,0.24071,0.23823,0.56755,0.92132,0.82504,0.78223,0.64453,0.39227,0.84001,...,0.50352,0.76415,0.58797,0.30146,0.62962,0.48666,0.73651,0.9336,0.000477,0.32122
ENSG00000000938,0.19267,0.40005,0.11435,0.4707,0.8892,1.1e-05,0.89976,0.21235,0.82967,0.37235,...,0.89921,0.96168,0.059247,0.58885,0.89853,0.13504,0.955,0.08822,0.1765,0.30428
ENSG00000000971,0.18063,0.79306,0.49058,0.088752,0.74453,0.94964,0.25382,0.37741,0.97165,0.070266,...,0.39062,0.093824,0.020391,0.10988,0.87055,0.99545,0.00266,0.42159,0.65685,0.86842


In [37]:
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [38]:
assert np.array_equal(smultixcan_genes_associations_zeros_removed.isna(), _tmp.isna())

In [39]:
_tmp.isna().any().any()

True

In [40]:
_tmp_flat = pd.Series(_tmp.values.flatten()).dropna()
assert ((_tmp_flat > 0) & (_tmp_flat <= 1)).all().all()

In [41]:
assert np.allclose(_tmp.values, smultixcan_genes_associations_zeros_removed.values, atol=1e-320, rtol=1e-4, equal_nan=True)

## Save zscores

In [42]:
zscores = np.abs(stats.norm.ppf(smultixcan_genes_associations / 2))

smultixcan_genes_associations_zscores = pd.DataFrame(zscores, index=smultixcan_genes_associations.index.copy(), columns=smultixcan_genes_associations.columns.copy())

display(smultixcan_genes_associations_zscores.shape)
display(smultixcan_genes_associations_zscores.head())

  cond1 = (0 < q) & (q < 1)


  cond1 = (0 < q) & (q < 1)


(22515, 4091)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


In [43]:
# some testing

# For FinnGen
assert smultixcan_genes_associations_zscores.loc['ENSG00000110628', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 2.8014991958592232
assert smultixcan_genes_associations_zscores.loc['ENSG00000169783', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.8858121525410351
assert smultixcan_genes_associations_zscores.loc['ENSG00000137959', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 5.021287959552069

# For ICD10
assert smultixcan_genes_associations_zscores.loc['ENSG00000135775', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 4.1895505583580785

# For extra phenotypes
assert smultixcan_genes_associations_zscores.loc['ENSG00000135775', 'MAGIC_ln_FastingInsulin'] == 1.71076773175347
assert smultixcan_genes_associations_zscores.loc['ENSG00000169783', 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19'] == 1.5993464052052957
assert smultixcan_genes_associations_zscores.loc['ENSG00000158691', 'pgc.scz2'] == 11.031317483379759

In [44]:
assert np.array_equal(smultixcan_genes_associations.isna(), smultixcan_genes_associations_zscores.isna())

### Remove inf values

In [45]:
smultixcan_genes_associations_zscores.isin([np.inf, -np.inf]).any().any()

True

In [46]:
max_zscores = pd.Series(smultixcan_genes_associations_zscores.values.flatten())

In [47]:
_tmp = max_zscores.sort_values(ascending=False)
display(_tmp[~np.isinf(_tmp)].head())

22853508    37.737142
21202699    37.723815
5438061     37.720171
90964567    37.710072
21233450    37.698289
dtype: float64

In [48]:
_tmp = max_zscores.sort_values(ascending=True)
display(_tmp.head())

73097956    0.0
75098455    0.0
75098462    0.0
71437010    0.0
84565057    0.0
dtype: float64

In [49]:
smultixcan_genes_associations_zscores.isna().any().any()

True

In [50]:
# replace inf
smultixcan_genes_associations_zscores = smultixcan_genes_associations_zscores.replace(np.inf, 40)
assert not smultixcan_genes_associations_zscores.isin([np.inf, -np.inf]).any().any()

### Save

In [51]:
smultixcan_genes_associations_zscores.shape

(22515, 4091)

In [52]:
smultixcan_genes_associations_zscores.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.169468,0.102558,0.239545,0.887758,1.313448,1.472148,0.72616,1.516367,1.299771,1.068093,...,0.813014,0.275993,0.510834,0.024717,0.430951,0.824314,0.367414,1.377624,0.738444,0.298259
ENSG00000000457,1.358856,1.846875,0.139324,0.12953,0.757757,1.103979,0.612418,1.822327,2.035372,1.008058,...,1.441795,0.654791,2.545653,1.202984,0.514244,0.237223,0.414171,0.101731,1.012735,0.945167
ENSG00000000460,0.151008,1.173202,1.179426,0.571656,0.098771,0.221072,0.276415,0.461381,0.855502,0.201876,...,0.668962,0.30004,0.541782,1.033308,0.482261,0.695624,0.33648,0.083316,3.493196,0.991948
ENSG00000000938,1.302722,0.841524,1.578926,0.72134,0.139314,4.387016,0.125959,1.247123,0.215124,0.892083,...,0.126657,0.048048,1.886356,0.540496,0.127524,1.494501,0.056432,1.704863,1.351619,1.027297
ENSG00000000971,1.338813,0.262339,0.689379,1.702019,0.325859,0.063161,1.141126,0.882682,0.035533,1.810191,...,0.858497,1.675562,2.319072,1.598721,0.162958,0.005703,3.004544,0.803669,0.444266,0.165671


In [53]:
# save
smultixcan_genes_associations_zscores_filename = os.path.join(conf.GENE_ASSOC_DIR, f'smultixcan-mashr-zscores.pkl.xz')
display(smultixcan_genes_associations_zscores_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.pkl.xz'

In [54]:
smultixcan_genes_associations_zscores.to_pickle(smultixcan_genes_associations_zscores_filename)

### Save for publication

In [55]:
# for publication
output_file = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-zscores.tsv.gz')
display(output_file)

smultixcan_genes_associations_zscores.to_csv(output_file, sep='\t', float_format='%.4e')

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.tsv.gz'

In [56]:
# test "for publication" file
_tmp = pd.read_csv(output_file, sep='\t', index_col='gene_name')

In [57]:
display(_tmp.shape)
assert _tmp.shape == smultixcan_genes_associations_zscores.shape

(22515, 4091)

In [58]:
_tmp.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.16947,0.10256,0.23954,0.88776,1.3134,1.4721,0.72616,1.5164,1.2998,1.0681,...,0.81301,0.27599,0.51083,0.024717,0.43095,0.82431,0.36741,1.3776,0.73844,0.29826
ENSG00000000457,1.3589,1.8469,0.13932,0.12953,0.75776,1.104,0.61242,1.8223,2.0354,1.0081,...,1.4418,0.65479,2.5457,1.203,0.51424,0.23722,0.41417,0.10173,1.0127,0.94517
ENSG00000000460,0.15101,1.1732,1.1794,0.57166,0.098771,0.22107,0.27641,0.46138,0.8555,0.20188,...,0.66896,0.30004,0.54178,1.0333,0.48226,0.69562,0.33648,0.083316,3.4932,0.99195
ENSG00000000938,1.3027,0.84152,1.5789,0.72134,0.13931,4.387,0.12596,1.2471,0.21512,0.89208,...,0.12666,0.048048,1.8864,0.5405,0.12752,1.4945,0.056432,1.7049,1.3516,1.0273
ENSG00000000971,1.3388,0.26234,0.68938,1.702,0.32586,0.063161,1.1411,0.88268,0.035533,1.8102,...,0.8585,1.6756,2.3191,1.5987,0.16296,0.005703,3.0045,0.80367,0.44427,0.16567


In [59]:
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [60]:
assert np.array_equal(smultixcan_genes_associations_zscores.isna(), _tmp.isna())

In [61]:
_tmp.isna().any().any()

True

In [62]:
_tmp_flat = pd.Series(_tmp.values.flatten()).dropna()
assert ((_tmp_flat >= 0) & (_tmp_flat <= 1600)).all().all()

In [63]:
assert np.allclose(_tmp.values, smultixcan_genes_associations_zscores.values, atol=1e-320, rtol=1e-4, equal_nan=True)