In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
from data.data import PhenoInfo, PhenoResults, get_all_tissues, get_genes
from utils.utils import is_number, chunker

In [3]:
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')

In [4]:
MODELS_TYPE = 'mashr'

In [5]:
OUTPUT_DIR = os.path.join(genes_associations_dir, MODELS_TYPE, 'spredixcan')
display(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan'

# Load all S-PrediXcan gene mappings

In [6]:
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

## S-PrediXcan

In [7]:
N_EXPECTED_SPREDIXCAN_PHENOTYPES = 4083

spredixcan_results_folder = constants.SPREDIXCAN_MASHR_RESULTS_DIR
display(spredixcan_results_folder)

all_spredixcan_results_dirs = glob(os.path.join(spredixcan_results_folder, '*'))

# Select only FinnGen phenotypes here (diseases)
all_spredixcan_phenotypes = [PhenoResults(p) for p in all_spredixcan_results_dirs]
#all_spredixcan_phenotypes = [p for p in all_spredixcan_phenotypes if p.pheno_info.pheno_source in ('icd10', 'finngen', 'phesant')]
all_spredixcan_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_spredixcan_phenotypes])

display(len(all_spredixcan_phenotypes))
assert len(all_spredixcan_phenotypes) == N_EXPECTED_SPREDIXCAN_PHENOTYPES

'/mnt/phenomexcan/results/spredixcan/mashr'

4083

In [8]:
all_tissues = get_all_tissues()

[data.py - 2019-10-27 22:52:28,133] INFO: Using models dir: /mnt/phenomexcan/data/gtex_v8/mashr


## Compute results

In [9]:
def _get_combined_results(phenos, column, tissue):
    all_pheno_results = {}
    
    for pheno in phenos:
        # get results from all tissues
        all_results_dict = {
            pheno.pheno_info.get_plain_name() :
                pheno.get_tissue_data(
                    tissue,
                    cols=['gene_name', column],
                    index_col='gene_simple')[column]
        }
        all_results_df = pd.DataFrame(all_results_dict)#, index=all_spredixcan_unique_genes)
        #all_results_df.index.rename(('tissue', 'gene_name'), inplace=True)
        
        # create final pd.Series with multiindex
        all_pheno_results[pheno.pheno_info.get_plain_name()] = all_results_df.T.stack()
    
    return all_pheno_results

### pvalues and z-scores

**NEEDS FIXING**

In [10]:
def _run_all(column_name, phenotype_chunks, n_jobs=20):
    print(column_name, flush=True)
    
    for tissue in all_tissues:
        print(tissue, flush=True)
        
        all_results = {}

        with ProcessPoolExecutor(max_workers=n_jobs) as executor:
            tasks = [executor.submit(_get_combined_results, chunk, column_name, tissue) for chunk in phenotype_chunks]
            for future in as_completed(tasks):
                res = future.result()
                all_results.update(res)

        # Save all results from S-PrediXcan
        final_filename = os.path.join(OUTPUT_DIR, f'spredixcan-{tissue}-all_{column_name}.pkl')
        #display(final_filename)

        with open(final_filename, 'wb') as f:
            pickle.dump(all_results, f)

In [11]:
# def _get_combined_zscores(phenos):
#     all_pheno_results = {}
    
#     for pheno in phenos:
#         all_results_dict = {}
#         for t in all_tissues:
#             res = pheno.get_tissue_data(t, cols=['gene_name', 'zscore'], index_col='gene_simple')
#             #res = res.drop_duplicates(subset='gene_name')
#             #res = res.set_index('gene_name')
#             #all_results_df = all_results_df.assign(**{t: res['zscore']})
#             all_results_dict[t] = res['zscore']

#         all_results_df = pd.DataFrame(all_results_dict, index=all_spredixcan_unique_genes)
            
#         all_pheno_results[pheno.pheno_info.get_plain_name()] = all_results_df.apply(lambda x: max((x.min(), x.max()), key=abs), axis=1)
    
#     return all_pheno_results

In [25]:
phenotype_chunks = chunker(all_spredixcan_phenotypes[:10], 2)
#phenotype_chunks = chunker(all_spredixcan_phenotypes, 25)

In [13]:
_run_all('pvalue', phenotype_chunks, n_jobs=20)

pvalue
Brain_Putamen_basal_ganglia


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Putamen_basal_ganglia-all_pvalue.pkl'

Adipose_Subcutaneous


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Adipose_Subcutaneous-all_pvalue.pkl'

Breast_Mammary_Tissue


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Breast_Mammary_Tissue-all_pvalue.pkl'

Heart_Atrial_Appendage


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Heart_Atrial_Appendage-all_pvalue.pkl'

Muscle_Skeletal


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Muscle_Skeletal-all_pvalue.pkl'

Brain_Hypothalamus


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Hypothalamus-all_pvalue.pkl'

Brain_Substantia_nigra


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Substantia_nigra-all_pvalue.pkl'

Testis


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Testis-all_pvalue.pkl'

Prostate


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Prostate-all_pvalue.pkl'

Esophagus_Muscularis


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Esophagus_Muscularis-all_pvalue.pkl'

Brain_Cerebellar_Hemisphere


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Cerebellar_Hemisphere-all_pvalue.pkl'

Spleen


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Spleen-all_pvalue.pkl'

Cells_Cultured_fibroblasts


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Cells_Cultured_fibroblasts-all_pvalue.pkl'

Heart_Left_Ventricle


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Heart_Left_Ventricle-all_pvalue.pkl'

Thyroid


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Thyroid-all_pvalue.pkl'

Colon_Transverse


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Colon_Transverse-all_pvalue.pkl'

Lung


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Lung-all_pvalue.pkl'

Brain_Cortex


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Cortex-all_pvalue.pkl'

Nerve_Tibial


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Nerve_Tibial-all_pvalue.pkl'

Adrenal_Gland


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Adrenal_Gland-all_pvalue.pkl'

Brain_Amygdala


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Amygdala-all_pvalue.pkl'

Pancreas


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Pancreas-all_pvalue.pkl'

Brain_Nucleus_accumbens_basal_ganglia


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Nucleus_accumbens_basal_ganglia-all_pvalue.pkl'

Uterus


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Uterus-all_pvalue.pkl'

Cells_EBV-transformed_lymphocytes


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Cells_EBV-transformed_lymphocytes-all_pvalue.pkl'

Esophagus_Mucosa


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Esophagus_Mucosa-all_pvalue.pkl'

Colon_Sigmoid


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Colon_Sigmoid-all_pvalue.pkl'

Artery_Aorta


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Artery_Aorta-all_pvalue.pkl'

Esophagus_Gastroesophageal_Junction


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Esophagus_Gastroesophageal_Junction-all_pvalue.pkl'

Brain_Anterior_cingulate_cortex_BA24


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Anterior_cingulate_cortex_BA24-all_pvalue.pkl'

Brain_Cerebellum


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Cerebellum-all_pvalue.pkl'

Small_Intestine_Terminal_Ileum


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Small_Intestine_Terminal_Ileum-all_pvalue.pkl'

Skin_Sun_Exposed_Lower_leg


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Skin_Sun_Exposed_Lower_leg-all_pvalue.pkl'

Stomach


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Stomach-all_pvalue.pkl'

Brain_Frontal_Cortex_BA9


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Frontal_Cortex_BA9-all_pvalue.pkl'

Adipose_Visceral_Omentum


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Adipose_Visceral_Omentum-all_pvalue.pkl'

Brain_Spinal_cord_cervical_c-1


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Spinal_cord_cervical_c-1-all_pvalue.pkl'

Ovary


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Ovary-all_pvalue.pkl'

Liver


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Liver-all_pvalue.pkl'

Vagina


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Vagina-all_pvalue.pkl'

Kidney_Cortex


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Kidney_Cortex-all_pvalue.pkl'

Minor_Salivary_Gland


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Minor_Salivary_Gland-all_pvalue.pkl'

Artery_Tibial


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Artery_Tibial-all_pvalue.pkl'

Pituitary


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Pituitary-all_pvalue.pkl'

Brain_Hippocampus


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Hippocampus-all_pvalue.pkl'

Brain_Caudate_basal_ganglia


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Brain_Caudate_basal_ganglia-all_pvalue.pkl'

Whole_Blood


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Whole_Blood-all_pvalue.pkl'

Artery_Coronary


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Artery_Coronary-all_pvalue.pkl'

Skin_Not_Sun_Exposed_Suprapubic


'/mnt/phenomexcan/results/preprocessed_data/gene_associations/mashr/spredixcan/spredixcan-Skin_Not_Sun_Exposed_Suprapubic-all_pvalue.pkl'

In [None]:
_run_all('zscore', phenotype_chunks, n_jobs=20)

## Load results and save as DataFrame

In [15]:
def load_results(setname, tissue):
    with open(os.path.join(OUTPUT_DIR, f'spredixcan-{tissue}-all_{setname}.pkl'), 'rb') as f:
        return pickle.load(f)

In [40]:
all_results = load_results('pvalue', 'Adipose_Subcutaneous')

In [41]:
all_results.keys()

dict_keys([])

In [37]:
all_results['C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'].shape

(13322,)

In [None]:
shared_unique_genes_before = shared_unique_genes

In [None]:
for setname in ('pvalue', 'zscore'):
    print(setname)
    
    all_results = load_results(setname)
    
    assert len(all_results) == N_EXPECTED_SPREDIXCAN_PHENOTYPES
    
    spredixcan_genes_associations = pd.DataFrame(all_results)
    spredixcan_genes_associations.index.rename(('tissue', 'gene_name'), inplace=True)

    assert spredixcan_genes_associations.index.is_unique
    
    display(spredixcan_genes_associations.shape)
    display(spredixcan_genes_associations.head())
    
    # Remove genes with no results
    spredixcan_genes_associations = spredixcan_genes_associations.dropna(axis=0, how='all')
    assert spredixcan_genes_associations.isna().sum().sum() == 0
    
    assert spredixcan_genes_associations.shape == (280936, N_EXPECTED_SPREDIXCAN_PHENOTYPES)

    # Remove also these genes from shared and omim genes
    #n_genes_shared_before = len(shared_unique_genes)
    display(f'n genes shared before: {len(shared_unique_genes_before)}')

    _spredixcan_gene_name_level = spredixcan_genes_associations.index.get_level_values('gene_name')
    shared_unique_genes = _spredixcan_gene_name_level.unique().intersection(omim_unique_genes)
    display(f'n genes shared now: {len(shared_unique_genes)}')
    
    # trim spredixcan genes
    #spredixcan_genes_associations = spredixcan_genes_associations.loc[(_spredixcan_gene_name_level.isin(shared_unique_genes))]

    # trim omim genes
    omim_genes_associations = omim_genes_associations.loc[shared_unique_genes]
    display(f'OMIM shape: {omim_genes_associations.shape}')
    # and remove any OMIM disease with no genes associated (after removing those)
    _tmp = omim_genes_associations.sum()
    cols_to_remove = _tmp[_tmp == 0].index
    omim_genes_associations = omim_genes_associations.drop(columns=cols_to_remove)
    display(f'OMIM shape after removing cols: {omim_genes_associations.shape}')
    
    assert omim_genes_associations.shape == (13508, 4761)
    display(omim_genes_associations.shape)
    display(omim_genes_associations.head())
    
    # some testing
    if setname == 'zscore':
        print('Testing zscore')
        # For FinnGen
        assert spredixcan_genes_associations.loc[('Whole_Blood', 'ENSG00000110628'), 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 3.6574495536599714
        # For ICD10
        assert spredixcan_genes_associations.loc[('Artery_Tibial', 'ENSG00000169783'), 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == -2.3246862264204027
    elif setname == 'pvalue':
        print('Testing pvalue')
        # For FinnGen
        assert spredixcan_genes_associations.loc[('Whole_Blood', 'ENSG00000110628'), 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.0002547373742003848
        # For ICD10
        assert spredixcan_genes_associations.loc[('Artery_Tibial', 'ENSG00000169783'), 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 0.020088744309727864
    
    print('Saving')
    
    # spredixcan df
    spredixcan_genes_associations_filename = os.path.join(genes_associations_dir, f'spredixcan-genes_associations-{setname}.pkl.xz')
    display(spredixcan_genes_associations_filename)

    spredixcan_genes_associations.to_pickle(spredixcan_genes_associations_filename)
    
    # omim df
    if setname == 'zscore': # save only once
        omim_genes_associations_filename = os.path.join(genes_associations_dir, 'omim-genes_associations.pkl.xz')
        display(omim_genes_associations_filename)

        omim_genes_associations.to_pickle(omim_genes_associations_filename)