In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

# Load S-MultiXcan results

## From Rapid GWAS project

In [3]:
# Locate every Rapid GWAS Project S-MultiXcan result file and check that the
# number of files found matches the expected number of phenotypes.
_rapid_gwas_results_dir = conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject']
_path = os.path.join(_rapid_gwas_results_dir, '*.tsv.gz')
display(_path)

all_smultixcan_results_dirs = glob(_path)
_n_rapid_gwas_files = len(all_smultixcan_results_dirs)
display(_n_rapid_gwas_files)
assert _n_rapid_gwas_files == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

'/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/*.tsv.gz'

4049

In [4]:
# Wrap each result file in an MXPhenoResults object and collect the plain
# (human-readable) phenotype names in the same order.
all_smultixcan_phenotypes = list(map(MXPhenoResults, all_smultixcan_results_dirs))
all_smultixcan_phenotypes_plain_names = pd.Index(
    [pheno.pheno_info.get_plain_name() for pheno in all_smultixcan_phenotypes]
)

_n_rapid_gwas_phenotypes = len(all_smultixcan_phenotypes)
display(_n_rapid_gwas_phenotypes)
assert _n_rapid_gwas_phenotypes == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

4049

## From GTEx GWAS manuscript

In [5]:
# Locate every GTEx GWAS S-MultiXcan result file and check that the number of
# files found matches the expected number of phenotypes.
_gtex_gwas_results_dir = conf.SMULTIXCAN_RESULTS_DIR['GTEX_GWAS']
_path = os.path.join(_gtex_gwas_results_dir, '*_ccn30.txt')
display(_path)

all_extra_results_dirs = glob(_path)
_n_gtex_gwas_files = len(all_extra_results_dirs)
display(_n_gtex_gwas_files)
assert _n_gtex_gwas_files == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

'/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/*_ccn30.txt'

42

In [6]:
# Raw string: '\.' is not a valid Python string escape, so a plain literal
# triggers an "invalid escape sequence" warning. The resulting regex is
# unchanged. The named group 'code' captures the part of the file name
# (no '/' characters) that precedes the fixed suffix.
_file_pattern = r'(?P<code>[^/]+)_smultixcan_imputed_gwas_gtexv8mashr_ccn30\.txt'
all_extra_phenotypes = [MXPhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

# Every GTEx GWAS result file must have produced one phenotype object.
display(len(all_extra_phenotypes))
assert len(all_extra_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

42

## Run loading

This reads the S-MultiXcan results of all phenotypes and saves them into a single pandas DataFrame.

In [7]:
def _get_combined_results(phenos, column):
    return {
        pheno.pheno_info.get_plain_name() :
        pheno.get_data(cols=['gene_name', column], index_col='gene_simple')[column]
        for pheno in phenos
    }

In [8]:
def _run_all(column_name, phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """Extract one column from every phenotype, one worker task per chunk.

    Parameters
    ----------
    column_name : str
        Column to extract; forwarded to ``_get_combined_results``.
    phenotype_chunks : iterable
        Iterable of phenotype chunks; each chunk is submitted as one task.
    n_jobs : int
        Maximum number of worker processes.

    Returns
    -------
    dict
        Union of the per-chunk dictionaries (phenotype plain name -> values).
        Chunks are merged in completion order, so key order may vary between
        runs.
    """
    print(column_name, flush=True)

    merged = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = [
            pool.submit(_get_combined_results, chunk, column_name)
            for chunk in phenotype_chunks
        ]
        # .result() re-raises any exception raised inside the worker.
        for finished in as_completed(pending):
            merged.update(finished.result())

    return merged

In [9]:
# Split all phenotypes (Rapid GWAS Project + GTEx GWAS) into chunks; each chunk
# becomes one worker task. Presumably `chunker` yields groups of up to 200
# items -- confirm in utils.chunker.
# NOTE(review): if `chunker` returns a generator, this cell must be re-run
# before calling `_run_all` again, since the chunks would be consumed.
# Quick test run on a small subset:
# phenotype_chunks = chunker(all_smultixcan_phenotypes[:5] + all_extra_phenotypes[:5], 200)
phenotype_chunks = chunker(all_smultixcan_phenotypes + all_extra_phenotypes, 200)

In [10]:
# Read the 'pvalue' column of every phenotype in parallel; the result maps each
# phenotype's plain name to its per-gene values.
all_results = _run_all('pvalue', phenotype_chunks)

pvalue


## Save as DataFrame

In [11]:
# Total number of phenotypes expected across all result sources; every one of
# them must be present in the loaded results.
_expected_counts_per_source = list(conf.SMULTIXCAN_EXPECTED_PHENOTYPES.values())
_n_expected_phenos = np.sum(_expected_counts_per_source)
display(_n_expected_phenos)
assert _n_expected_phenos == len(all_results)

4091

In [12]:
# Assemble the gene x phenotype matrix: one column per phenotype (dict key).
# Presumably each value is a pandas Series indexed by gene, so rows are
# aligned on gene IDs -- confirm against MXPhenoResults.get_data.
smultixcan_genes_associations = pd.DataFrame(all_results)
smultixcan_genes_associations.index.name = 'gene_name'

# Each gene must appear exactly once.
assert smultixcan_genes_associations.index.is_unique

display(smultixcan_genes_associations.shape)
display(smultixcan_genes_associations.head())

(22518, 4091)

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.952828,0.827955,0.976745,0.916643,0.580172,0.270858,0.779171,0.597882,0.531657,0.534696,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.646365,0.299606,0.477472,0.00357,0.905365,0.845711,0.13743,0.868056,0.028947,0.392273,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.896074,0.094218,0.507646,0.166442,0.974545,0.62891,0.425188,0.551723,0.059456,0.295833,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.576924,0.107121,0.439276,0.567582,0.768334,0.203873,0.231449,0.106706,0.595655,0.953718,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.95639,0.492012,0.510924,0.532389,0.555313,0.993563,0.807439,0.948366,0.774694,0.490962,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


In [13]:
# Keep only genes (rows) that have a result for at least one phenotype.
_gene_has_any_result = smultixcan_genes_associations.notna().any(axis=1)
smultixcan_genes_associations = smultixcan_genes_associations.loc[_gene_has_any_result]

In [14]:
# Total number of missing (NaN) gene/phenotype entries across the whole matrix
smultixcan_genes_associations.isna().sum().sum()

1053055

In [15]:
# each UKB trait has 260 nan entries (4049 UKB traits in total)
# NOTE(review): this product (1052740) is 315 lower than the total NaN count
# shown above (1053055); presumably the remainder comes from the non-UKB
# (GTEx GWAS) traits -- confirm.
260 * 4049

1052740

In [16]:
# After dropping all-NaN genes, 22515 genes remain (22518 before the drop).
_expected_shape = (22515, _n_expected_phenos)
display(smultixcan_genes_associations.shape)

assert _expected_shape == smultixcan_genes_associations.shape

(22515, 4091)

In [17]:
# some testing
#
# Spot checks: a few (gene, phenotype) p-values are compared against known
# reference values using exact equality.
_expected_pvalues = [
    # For FinnGen
    ('ENSG00000110628', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 0.005086576789507484),
    ('ENSG00000169783', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 0.3757187601354043),
    ('ENSG00000137959', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 5.132614371931036e-07),
    # For ICD10
    ('ENSG00000135775', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 2.795075036067939e-05),
    ('ENSG00000169783', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 0.06668736815697908),
    ('ENSG00000174226', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 0.02496852053808064),
    # For extra phenotypes
    ('ENSG00000135775', 'MAGIC_ln_FastingInsulin', 0.08712399858507687),
    ('ENSG00000169783', 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19', 0.10974365378971256),
    ('ENSG00000158691', 'pgc.scz2', 2.698821020217747e-28),
]

for _gene, _pheno, _expected in _expected_pvalues:
    assert smultixcan_genes_associations.loc[_gene, _pheno] == _expected

### Remove zero pvalues

In [18]:
# p-values must never be infinite
assert not smultixcan_genes_associations.isin([np.inf, -np.inf]).any().any()

In [19]:
# Does the matrix still contain missing (NaN) entries?
smultixcan_genes_associations.isna().any().any()

True

In [20]:
# Flatten the gene x phenotype matrix into a single Series of p-values (NaNs included)
all_pvals = pd.Series(smultixcan_genes_associations.values.flatten())

In [21]:
# Summary statistics of all p-values; a minimum of 0 means exact zeros are present
all_pvals.describe()

count    9.105581e+07
mean     4.839226e-01
std      2.934763e-01
min      0.000000e+00
25%      2.266266e-01
50%      4.794625e-01
75%      7.378647e-01
max      1.000000e+00
dtype: float64

In [22]:
# Smallest strictly positive p-values (exact zeros are filtered out)
_tmp = all_pvals.sort_values(ascending=True)
display(_tmp[_tmp > 0].head())

22853308    1.222488e-311
21203099    2.021948e-311
5437861     2.320165e-311
90964367    3.396667e-311
21233250    5.298136e-311
dtype: float64

In [23]:
# Largest p-values
_tmp = all_pvals.sort_values(ascending=False)
display(_tmp.head())

9221107     1.0
75663037    1.0
79238576    1.0
57261708    1.0
71874771    1.0
dtype: float64

In [24]:
# Flattened p-values again, this time without NaN entries (comparisons with
# NaN are always False, which would break range checks on these values)
all_pvals = pd.Series(smultixcan_genes_associations.values.flatten()).dropna()

In [25]:
# Every non-missing p-value must lie in the closed interval [0, 1].
assert all_pvals.between(0, 1).all()

In [26]:
# replace 0.0 pvals
# Exact zeros are replaced by a tiny positive placeholder (1e-320), which is
# below the smallest non-zero p-value observed above (~1.2e-311). The result
# is a new DataFrame; the original matrix (with zeros) is left untouched.
smultixcan_genes_associations_zeros_removed = smultixcan_genes_associations.replace(0.0, 1e-320)

In [27]:
# Flattened, NaN-free p-values of the zero-free matrix
all_pvals = pd.Series(smultixcan_genes_associations_zeros_removed.values.flatten()).dropna()

In [28]:
# After the zero replacement, every p-value must lie in the half-open interval (0, 1].
_strictly_positive = all_pvals > 0
_at_most_one = all_pvals <= 1
assert (_strictly_positive & _at_most_one).all()

### Save

In [29]:
# Shape (genes, phenotypes) of the matrix about to be saved
smultixcan_genes_associations_zeros_removed.shape

(22515, 4091)

In [30]:
# Preview of the matrix about to be saved
smultixcan_genes_associations_zeros_removed.head()

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.952828,0.827955,0.976745,0.916643,0.580172,0.270858,0.779171,0.597882,0.531657,0.534696,...,0.980281,0.413973,0.747822,0.609467,0.019467,0.371748,0.82301,0.71331,0.858389,0.409761
ENSG00000000457,0.646365,0.299606,0.477472,0.00357,0.905365,0.845711,0.13743,0.868056,0.028947,0.392273,...,0.228982,0.63683,0.654535,0.010907,0.7263,0.210658,0.008023,0.678749,0.836679,0.812484
ENSG00000000460,0.896074,0.094218,0.507646,0.166442,0.974545,0.62891,0.425188,0.551723,0.059456,0.295833,...,0.30146,0.140522,0.646442,0.587969,0.498724,0.521805,0.004462,0.736509,0.432229,0.486664
ENSG00000000938,0.576924,0.107121,0.439276,0.567582,0.768334,0.203873,0.231449,0.106706,0.595655,0.953718,...,0.588855,0.226977,0.576593,0.059247,0.435438,0.95316,0.101875,0.954998,0.097831,0.135045
ENSG00000000971,0.95639,0.492012,0.510924,0.532389,0.555313,0.993563,0.807439,0.948366,0.774694,0.490962,...,0.109883,0.040871,0.005662,0.020391,0.439466,0.690242,0.055059,0.00266,0.331132,0.99545


In [31]:
# Save
# Output path of the pickled p-value matrix (plain string literal: there is
# nothing to interpolate, so no f-string prefix is needed).
smultixcan_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-pvalues.pkl.xz')
display(smultixcan_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.pkl.xz'

In [32]:
# Write the zero-free p-value matrix as a pickle; with pandas' default
# compression='infer', the '.xz' suffix of the file name selects xz compression.
smultixcan_genes_associations_zeros_removed.to_pickle(smultixcan_genes_associations_filename)

### Save for publication

In [33]:
# for publication
# Tab-separated text version of the zero-free p-value matrix. '%.4e' writes
# each value in scientific notation with 4 decimals (5 significant digits);
# with pandas' default compression='infer', the '.gz' suffix selects gzip.
output_file = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-pvalues.tsv.gz')
display(output_file)

smultixcan_genes_associations_zeros_removed.to_csv(output_file, sep='\t', float_format='%.4e')

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-pvalues.tsv.gz'

In [34]:
# test "for publication" file
# Read the file back from disk so that its content can be validated below.
_tmp = pd.read_csv(output_file, sep='\t', index_col='gene_name')

In [35]:
# The file read back must have the same dimensions as the matrix that was written
display(_tmp.shape)
assert _tmp.shape == smultixcan_genes_associations_zeros_removed.shape

(22515, 4091)

In [36]:
# Preview of the values as read back from the text file (rounded by '%.4e')
_tmp.head()

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.95283,0.82795,0.97674,0.91664,0.58017,0.27086,0.77917,0.59788,0.53166,0.5347,...,0.98028,0.41397,0.74782,0.60947,0.019467,0.37175,0.82301,0.71331,0.85839,0.40976
ENSG00000000457,0.64636,0.29961,0.47747,0.00357,0.90537,0.84571,0.13743,0.86806,0.028947,0.39227,...,0.22898,0.63683,0.65453,0.010907,0.7263,0.21066,0.008023,0.67875,0.83668,0.81248
ENSG00000000460,0.89607,0.094218,0.50765,0.16644,0.97455,0.62891,0.42519,0.55172,0.059456,0.29583,...,0.30146,0.14052,0.64644,0.58797,0.49872,0.52181,0.004462,0.73651,0.43223,0.48666
ENSG00000000938,0.57692,0.10712,0.43928,0.56758,0.76833,0.20387,0.23145,0.10671,0.59565,0.95372,...,0.58885,0.22698,0.57659,0.059247,0.43544,0.95316,0.10188,0.955,0.097831,0.13504
ENSG00000000971,0.95639,0.49201,0.51092,0.53239,0.55531,0.99356,0.80744,0.94837,0.77469,0.49096,...,0.10988,0.040871,0.005662,0.020391,0.43947,0.69024,0.055059,0.00266,0.33113,0.99545


In [37]:
# No infinite values in the file read back
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [38]:
# Missing values must be at exactly the same positions before and after the round trip
assert np.array_equal(smultixcan_genes_associations_zeros_removed.isna(), _tmp.isna())

In [39]:
# Does the file read back contain missing (NaN) entries?
_tmp.isna().any().any()

True

In [40]:
# Non-missing p-values read back from the file must lie in (0, 1].
_tmp_flat = pd.Series(_tmp.values.flatten()).dropna()
_in_valid_range = _tmp_flat.gt(0) & _tmp_flat.le(1)
assert _in_valid_range.all()

In [41]:
# Values read back must match the in-memory matrix up to the rounding of
# '%.4e' (5 significant digits, i.e. a relative error well below rtol=1e-4).
# atol=1e-320 is effectively zero, so the check is purely relative except
# around the 1e-320 placeholder; NaNs at the same position compare as equal.
assert np.allclose(_tmp.values, smultixcan_genes_associations_zeros_removed.values, atol=1e-320, rtol=1e-4, equal_nan=True)

## Save zscores

In [42]:
# Convert two-sided p-values into absolute z-scores: |z| = |Phi^-1(p / 2)|.
# The original matrix (which still contains exact zeros) is used on purpose,
# so p == 0 maps to an infinite z-score; NaN p-values stay NaN.
zscores = np.absolute(stats.norm.ppf(smultixcan_genes_associations.div(2)))

# stats.norm.ppf returns a plain array, so the gene/phenotype labels are re-attached.
smultixcan_genes_associations_zscores = pd.DataFrame(
    data=zscores,
    index=smultixcan_genes_associations.index.copy(),
    columns=smultixcan_genes_associations.columns.copy(),
)

display(smultixcan_genes_associations_zscores.shape)
display(smultixcan_genes_associations_zscores.head())

  cond1 = (0 < q) & (q < 1)
  cond1 = (0 < q) & (q < 1)


(22515, 4091)

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.059156,0.217326,0.02915,0.104664,0.553134,1.101089,0.280399,0.527449,0.625478,0.620854,...,0.024717,0.816922,0.321512,0.510834,2.336462,0.893203,0.223675,0.367414,0.178426,0.824314
ENSG00000000457,0.458818,1.037278,0.710374,2.913855,0.118886,0.194594,1.485428,0.166128,2.18421,0.855502,...,1.202984,0.472136,0.447472,2.545653,0.350052,1.251757,2.651112,0.414171,0.206143,0.237223
ENSG00000000460,0.130622,1.673556,0.662508,1.383726,0.031908,0.483261,0.797453,0.59518,1.884805,1.045412,...,1.033308,1.473851,0.458711,0.541782,0.676499,0.640565,2.843532,0.33648,0.785382,0.695624
ENSG00000000938,0.557883,1.611271,0.773417,0.571616,0.294555,1.270594,1.196635,1.613178,0.53066,0.058038,...,0.540496,1.208182,0.558368,1.886356,0.77992,0.058739,1.635829,0.056432,1.655463,1.494501
ENSG00000000971,0.054685,0.687113,0.6574,0.624363,0.589818,0.008068,0.243731,0.064759,0.28624,0.688779,...,1.598721,2.044839,2.766758,2.319072,0.773096,0.398527,1.918412,3.004544,0.971837,0.005703


In [43]:
# some testing
#
# Spot checks of a few z-scores against reference values. The z-scores are
# the result of floating-point computations (stats.norm.ppf), so they are
# compared with a tight relative tolerance instead of bit-for-bit equality,
# which can break across SciPy/NumPy versions or platforms.
_expected_zscores = [
    # For FinnGen
    ('ENSG00000110628', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 2.8014991958592232),
    ('ENSG00000169783', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 0.8858121525410351),
    ('ENSG00000137959', 'C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 5.021287959552069),
    # For ICD10
    ('ENSG00000135775', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 4.1895505583580785),
    # For extra phenotypes
    ('ENSG00000135775', 'MAGIC_ln_FastingInsulin', 1.71076773175347),
    ('ENSG00000169783', 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19', 1.5993464052052957),
    ('ENSG00000158691', 'pgc.scz2', 11.031317483379759),
]

for _gene, _pheno, _expected in _expected_zscores:
    _observed = smultixcan_genes_associations_zscores.loc[_gene, _pheno]
    # atol=0.0 keeps the comparison purely relative.
    assert np.isclose(_observed, _expected, rtol=1e-9, atol=0.0), (_gene, _pheno, _observed, _expected)

In [44]:
# The z-score matrix must have missing values at exactly the same positions as the p-value matrix
assert np.array_equal(smultixcan_genes_associations.isna(), smultixcan_genes_associations_zscores.isna())

### Remove inf values

In [45]:
# Are there infinite z-scores? (p-values of exactly 0 map to inf)
smultixcan_genes_associations_zscores.isin([np.inf, -np.inf]).any().any()

True

In [46]:
# Flatten all z-scores into a single Series (despite its name, this holds
# every value of the matrix, including NaN and inf, not only maxima)
max_zscores = pd.Series(smultixcan_genes_associations_zscores.values.flatten())

In [47]:
# Largest finite z-scores (infinite values are filtered out)
_tmp = max_zscores.sort_values(ascending=False)
display(_tmp[~np.isinf(_tmp)].head())

22853308    37.737142
21203099    37.723815
5437861     37.720171
90964367    37.710072
21233250    37.698289
dtype: float64

In [48]:
# Smallest z-scores
_tmp = max_zscores.sort_values(ascending=True)
display(_tmp.head())

1350027     0.0
69698348    0.0
75413475    0.0
48118334    0.0
48490615    0.0
dtype: float64

In [49]:
# Does the z-score matrix contain missing (NaN) entries?
smultixcan_genes_associations_zscores.isna().any().any()

True

In [50]:
# replace inf
# Infinite z-scores (from p-values of exactly 0) are capped at 40, which is
# above the largest finite z-score observed above (~37.74).
smultixcan_genes_associations_zscores = smultixcan_genes_associations_zscores.replace(np.inf, 40)
assert not smultixcan_genes_associations_zscores.isin([np.inf, -np.inf]).any().any()

### Save

In [51]:
# Shape (genes, phenotypes) of the z-score matrix about to be saved
smultixcan_genes_associations_zscores.shape

(22515, 4091)

In [52]:
# Preview of the z-score matrix about to be saved
smultixcan_genes_associations_zscores.head()

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.059156,0.217326,0.02915,0.104664,0.553134,1.101089,0.280399,0.527449,0.625478,0.620854,...,0.024717,0.816922,0.321512,0.510834,2.336462,0.893203,0.223675,0.367414,0.178426,0.824314
ENSG00000000457,0.458818,1.037278,0.710374,2.913855,0.118886,0.194594,1.485428,0.166128,2.18421,0.855502,...,1.202984,0.472136,0.447472,2.545653,0.350052,1.251757,2.651112,0.414171,0.206143,0.237223
ENSG00000000460,0.130622,1.673556,0.662508,1.383726,0.031908,0.483261,0.797453,0.59518,1.884805,1.045412,...,1.033308,1.473851,0.458711,0.541782,0.676499,0.640565,2.843532,0.33648,0.785382,0.695624
ENSG00000000938,0.557883,1.611271,0.773417,0.571616,0.294555,1.270594,1.196635,1.613178,0.53066,0.058038,...,0.540496,1.208182,0.558368,1.886356,0.77992,0.058739,1.635829,0.056432,1.655463,1.494501
ENSG00000000971,0.054685,0.687113,0.6574,0.624363,0.589818,0.008068,0.243731,0.064759,0.28624,0.688779,...,1.598721,2.044839,2.766758,2.319072,0.773096,0.398527,1.918412,3.004544,0.971837,0.005703


In [53]:
# save
# Output path of the pickled z-score matrix (plain string literal: there is
# nothing to interpolate, so no f-string prefix is needed).
smultixcan_genes_associations_zscores_filename = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-zscores.pkl.xz')
display(smultixcan_genes_associations_zscores_filename)

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.pkl.xz'

In [54]:
# Write the z-score matrix as a pickle; with pandas' default
# compression='infer', the '.xz' suffix of the file name selects xz compression.
smultixcan_genes_associations_zscores.to_pickle(smultixcan_genes_associations_zscores_filename)

### Save for publication

In [55]:
# for publication
# Tab-separated text version of the z-score matrix. '%.4e' writes each value
# in scientific notation with 4 decimals (5 significant digits); with pandas'
# default compression='infer', the '.gz' suffix selects gzip.
output_file = os.path.join(conf.GENE_ASSOC_DIR, 'smultixcan-mashr-zscores.tsv.gz')
display(output_file)

smultixcan_genes_associations_zscores.to_csv(output_file, sep='\t', float_format='%.4e')

'/mnt/phenomexcan_base/gene_assoc/smultixcan-mashr-zscores.tsv.gz'

In [56]:
# test "for publication" file
# Read the file back from disk so that its content can be validated below.
_tmp = pd.read_csv(output_file, sep='\t', index_col='gene_name')

In [57]:
# The file read back must have the same dimensions as the matrix that was written
display(_tmp.shape)
assert _tmp.shape == smultixcan_genes_associations_zscores.shape

(22515, 4091)

In [58]:
# Preview of the values as read back from the text file (rounded by '%.4e')
_tmp.head()

Unnamed: 0_level_0,4270-Volume_level_set_by_participant_left,S05-Diagnoses_main_ICD10_S05_Injury_of_eye_and_orbit,20003_1141157402-Treatmentmedication_code_prednisolone_product,20002_1427-Noncancer_illness_code_selfreported_polycystic_kidney,110001-Invitation_to_complete_online_24hour_recall_dietary_questionnaire_acceptance,22617_2442-Job_SOC_coding_Social_workers,J93-Diagnoses_main_ICD10_J93_Pneumothorax,22601_41133206-Job_coding_local_government_administrative_officer_or_assistant_or_clerk,6145_3-Illness_injury_bereavement_stress_in_last_2_years_Death_of_a_close_relative,20002_1597-Noncancer_illness_code_selfreported_tinnitus_tiniitis,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,0.059156,0.21733,0.02915,0.10466,0.55313,1.1011,0.2804,0.52745,0.62548,0.62085,...,0.024717,0.81692,0.32151,0.51083,2.3365,0.8932,0.22367,0.36741,0.17843,0.82431
ENSG00000000457,0.45882,1.0373,0.71037,2.9139,0.11889,0.19459,1.4854,0.16613,2.1842,0.8555,...,1.203,0.47214,0.44747,2.5457,0.35005,1.2518,2.6511,0.41417,0.20614,0.23722
ENSG00000000460,0.13062,1.6736,0.66251,1.3837,0.031908,0.48326,0.79745,0.59518,1.8848,1.0454,...,1.0333,1.4739,0.45871,0.54178,0.6765,0.64057,2.8435,0.33648,0.78538,0.69562
ENSG00000000938,0.55788,1.6113,0.77342,0.57162,0.29455,1.2706,1.1966,1.6132,0.53066,0.058038,...,0.5405,1.2082,0.55837,1.8864,0.77992,0.058739,1.6358,0.056432,1.6555,1.4945
ENSG00000000971,0.054685,0.68711,0.6574,0.62436,0.58982,0.008068,0.24373,0.064759,0.28624,0.68878,...,1.5987,2.0448,2.7668,2.3191,0.7731,0.39853,1.9184,3.0045,0.97184,0.005703


In [59]:
# No infinite values in the file read back
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [60]:
# Missing values must be at exactly the same positions before and after the round trip
assert np.array_equal(smultixcan_genes_associations_zscores.isna(), _tmp.isna())

In [61]:
# Does the file read back contain missing (NaN) entries?
_tmp.isna().any().any()

True

In [62]:
# The saved z-scores are absolute values, and infinite values were replaced
# by 40 before saving, so every non-missing entry read back must lie in
# [0, 40]. The bound is kept equal to the cap so that this check actually
# detects values that escaped the capping.
_tmp_flat = pd.Series(_tmp.values.flatten()).dropna()
assert ((_tmp_flat >= 0) & (_tmp_flat <= 40)).all()

In [63]:
# Values read back must match the in-memory z-scores up to the rounding of
# '%.4e' (5 significant digits, i.e. a relative error well below rtol=1e-4).
# atol=1e-320 is effectively zero, so the check is purely relative; NaNs at
# the same position compare as equal.
assert np.allclose(_tmp.values, smultixcan_genes_associations_zscores.values, atol=1e-320, rtol=1e-4, equal_nan=True)