In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
from data.multixcan_data import MXPhenoInfo, MXPhenoResults
from utils.utils import is_number, chunker

In [3]:
# Root folder for gene-association outputs, located under the directory given by
# constants.PREPROCESSED_BASED_DIR.
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')

In [4]:
# Tag of the prediction models; it is used as the name of the output sub-folder.
MODELS_TYPE = 'mashr'

In [5]:
# Files written by this notebook go under <genes_associations_dir>/<MODELS_TYPE>.
OUTPUT_DIR = os.path.join(genes_associations_dir, MODELS_TYPE)
display(OUTPUT_DIR)
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr'

# Load all gene mappings

In [6]:
# Load the two "simplified" gene-mapping pickles from the preprocessed metadata folder.
# NOTE(review): neither mapping is referenced again in the visible part of this
# notebook — confirm whether this cell is still needed.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by
# this project's own pipeline.
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

## S-MultiXcan

In [7]:
# Number of phenotype result files expected; later cells assert against this value.
N_EXPECTED_PHENOTYPES = 4049

In [8]:
# Pattern matching the S-MultiXcan result files (*.tsv.gz) in the results folder.
_path = os.path.join(constants.SMULTIXCAN_MASHR_RESULTS_DIR, '*.tsv.gz')
display(_path)
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the
# phenotype list (and everything derived from it) is the same on every run.
all_spredixcan_results_dirs = sorted(glob(_path))
# One file per expected phenotype.
assert len(all_spredixcan_results_dirs) == N_EXPECTED_PHENOTYPES

'/mnt/phenomexcan/results/smultixcan/mashr/*.tsv.gz'

In [9]:
# Wrap every result file in an MXPhenoResults object, then collect the plain
# phenotype names into a pandas Index.
all_spredixcan_phenotypes = list(map(MXPhenoResults, all_spredixcan_results_dirs))
all_spredixcan_phenotypes_plain_names = pd.Index(
    [pheno.pheno_info.get_plain_name() for pheno in all_spredixcan_phenotypes]
)

display(len(all_spredixcan_phenotypes))
assert len(all_spredixcan_phenotypes) == N_EXPECTED_PHENOTYPES

4049

## Compute results

This reads the S-MultiXcan results of all phenotypes and combines them into a single pandas DataFrame.

In [10]:
def _get_combined_results(phenos, column):
    return {
        pheno.pheno_info.get_plain_name() :
        pheno.get_data(cols=['gene_name', column], index_col='gene_simple')[column]
        for pheno in phenos
    }

### pvalues and z-scores

In [None]:
def _run_all(column_name, phenotype_chunks, n_jobs=20):
    """Read `column_name` for every phenotype using a pool of worker processes.

    Parameters
    ----------
    column_name : str
        Result column to extract; forwarded to `_get_combined_results`.
    phenotype_chunks : iterable
        Groups of phenotype objects; each group is submitted as one task.
    n_jobs : int
        Maximum number of worker processes.

    Returns
    -------
    dict
        Union of the dicts returned by `_get_combined_results` for all chunks.
        Chunks are merged in submission order, so the key order does not depend
        on which worker happens to finish first.

    Raises
    ------
    Any exception raised inside a worker is re-raised here by `Future.result()`.
    """
    print(column_name, flush=True)

    all_results = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk, column_name) for chunk in phenotype_chunks]
        # Every task has already been submitted, so they still run in parallel;
        # walking them in submission order (instead of completion order) only
        # makes the merge order, and thus the resulting key order, reproducible.
        for future in tasks:
            all_results.update(future.result())

    return all_results
    
#     # Save all results from S-MultiXcan
#     final_filename = os.path.join(OUTPUT_DIR, f'smultixcan-all_{column_name}.pkl.xz')
#     display(final_filename)

#     with open(final_filename, 'wb') as f:
#         pickle.dump(all_results, f)

In [None]:
# phenotype_chunks = list(chunker(all_spredixcan_phenotypes[:5], 2))
# Materialize the chunks as a list: if `chunker` yields lazily, a second pass over
# `phenotype_chunks` (e.g. re-running the cell that consumes it) would otherwise
# find an exhausted iterator and silently produce no results.
phenotype_chunks = list(chunker(all_spredixcan_phenotypes, 200))

In [None]:
# Read the 'pvalue' column of every phenotype, one task per chunk.
all_results = _run_all('pvalue', phenotype_chunks)

## Load results and save as DataFrame

In [None]:
def load_results(setname):
    """Unpickle and return the combined results stored for `setname`.

    The file read is `smultixcan-all_<setname>.pkl.xz` inside OUTPUT_DIR; it is
    opened as a plain binary file and passed to `pickle.load`.
    """
    results_path = os.path.join(OUTPUT_DIR, f'smultixcan-all_{setname}.pkl.xz')
    with open(results_path, 'rb') as results_file:
        return pickle.load(results_file)

In [None]:
setname = 'pvalue'
print(setname)

# NOTE(review): this replaces the `all_results` returned by `_run_all` earlier with the
# contents of a pickle on disk. The code that would write that pickle is commented out
# in this notebook, so the file must already exist — confirm it is up to date.
all_results = load_results(setname)

In [None]:
# One entry per phenotype is expected. The constant defined in this notebook is
# N_EXPECTED_PHENOTYPES (the previously used name was never defined -> NameError).
assert len(all_results) == N_EXPECTED_PHENOTYPES

In [None]:
# Build one DataFrame from the per-phenotype results: each dict key (phenotype
# plain name) becomes a column.
spredixcan_genes_associations = pd.DataFrame(all_results)
spredixcan_genes_associations.index.name = 'gene_name'

# Each gene must appear only once.
assert spredixcan_genes_associations.index.is_unique

display(spredixcan_genes_associations.shape, spredixcan_genes_associations.head())

In [None]:
# Remove genes with no results
spredixcan_genes_associations = spredixcan_genes_associations.dropna(axis=0, how='all')
# After dropping the all-NaN rows, no missing value may remain anywhere.
assert spredixcan_genes_associations.isna().sum().sum() == 0

display(spredixcan_genes_associations.shape)

# 22255 is the pinned number of genes expected to remain; the column count must match
# N_EXPECTED_PHENOTYPES (the previously used constant name was never defined -> NameError).
assert spredixcan_genes_associations.shape == (22255, N_EXPECTED_PHENOTYPES)

In [None]:
# some testing: spot-check pinned p-values for a few (gene, phenotype) pairs

_pvalues = spredixcan_genes_associations

# For FinnGen
_finngen_pheno = 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'
assert _pvalues.loc['ENSG00000110628', _finngen_pheno] == 0.005086576789507484
assert _pvalues.loc['ENSG00000169783', _finngen_pheno] == 0.3757187601354043
assert _pvalues.loc['ENSG00000137959', _finngen_pheno] == 5.132614371931036e-07
# For ICD10
_icd10_pheno = 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'
assert _pvalues.loc['ENSG00000135775', _icd10_pheno] == 2.795075036067939e-05
assert _pvalues.loc['ENSG00000169783', _icd10_pheno] == 0.06668736815697908
assert _pvalues.loc['ENSG00000174226', _icd10_pheno] == 0.02496852053808064

In [None]:
# Save
# Destination of the p-values DataFrame; `setname` is 'pvalue' at this point.
spredixcan_genes_associations_filename = os.path.join(OUTPUT_DIR, f'smultixcan-genes_associations-{setname}.pkl.xz')
display(spredixcan_genes_associations_filename)

In [None]:
# Write the p-values DataFrame to disk. pandas infers the compression from the file
# extension by default, so the '.xz' suffix should yield an xz-compressed pickle.
spredixcan_genes_associations.to_pickle(spredixcan_genes_associations_filename)

## Save zscores

In [None]:
setname = 'zscores'
print(setname)

# Convert p-values to absolute z-scores: |ppf(p / 2)| of the standard normal,
# i.e. the p-values are treated as two-sided.
# NOTE(review): a p-value of exactly 0 maps to an infinite z-score here.
zscores = np.abs(stats.norm.ppf(spredixcan_genes_associations / 2))

# Re-attach the gene index and phenotype columns of the p-values frame.
spredixcan_genes_associations_zscores = pd.DataFrame(
    zscores,
    index=spredixcan_genes_associations.index.copy(),
    columns=spredixcan_genes_associations.columns.copy(),
)

display(spredixcan_genes_associations_zscores.shape, spredixcan_genes_associations_zscores.head())

In [None]:
# some testing: spot-check pinned z-scores for a few (gene, phenotype) pairs

_zscores_df = spredixcan_genes_associations_zscores

# For FinnGen
_finngen_pheno = 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'
assert _zscores_df.loc['ENSG00000110628', _finngen_pheno] == 2.8014991958592232
assert _zscores_df.loc['ENSG00000169783', _finngen_pheno] == 0.8858121525410351
assert _zscores_df.loc['ENSG00000137959', _finngen_pheno] == 5.021287959552069
# For ICD10
_icd10_pheno = 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'
assert _zscores_df.loc['ENSG00000135775', _icd10_pheno] == 4.1895505583580785
assert _zscores_df.loc['ENSG00000169783', _icd10_pheno] == 1.8337752158591156
assert _zscores_df.loc['ENSG00000174226', _icd10_pheno] == 2.241889411679155

In [None]:
# save
# Destination of the z-scores DataFrame; `setname` is 'zscores' at this point.
spredixcan_genes_associations_zscores_filename = os.path.join(OUTPUT_DIR, f'smultixcan-genes_associations-{setname}.pkl.xz')
display(spredixcan_genes_associations_zscores_filename)

In [None]:
# Write the z-scores DataFrame to disk (compression is inferred from the '.xz' suffix
# by pandas' default settings).
spredixcan_genes_associations_zscores.to_pickle(spredixcan_genes_associations_zscores_filename)

In [None]:
# Final look at the saved z-scores frame: (number of genes, number of phenotypes).
spredixcan_genes_associations_zscores.shape

In [None]:
# Preview the first rows of the saved z-scores frame.
spredixcan_genes_associations_zscores.head()

Now I need to copy this file to CRI to include only these genes