In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker

# Load S-PrediXcan results

## From Rapid GWAS project

In [3]:
from results.spredixcan import PhenoResults

In [4]:
# Glob pattern matching every entry inside the Rapid GWAS results directory.
# The directory and the wildcard are given to os.path.join as separate
# components, so the separator is inserted by the library instead of by
# manual string concatenation.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject'], '*')
display(_path)
all_spredixcan_results_dirs = glob(_path)
display(len(all_spredixcan_results_dirs))
# Stop here if the number of entries found differs from the configured expectation.
assert len(all_spredixcan_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject'], len(all_spredixcan_results_dirs)

'/mnt/phenomexcan_base/results/spredixcan/rapid_gwas_project/*'

4049

In [5]:
# Wrap every results directory found above in a PhenoResults object.
all_spredixcan_phenotypes = list(map(PhenoResults, all_spredixcan_results_dirs))

display(len(all_spredixcan_phenotypes))
# One PhenoResults per directory: the count must still match the configured expectation.
assert len(all_spredixcan_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

4049

## From GTEx GWAS manuscript

In [6]:
# Glob pattern matching every entry inside the GTEx GWAS results directory.
# The directory and the wildcard are given to os.path.join as separate
# components, so the separator is inserted by the library instead of by
# manual string concatenation.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
all_extra_results_dirs = glob(_path)
display(len(all_extra_results_dirs))
# Stop here if the number of entries found differs from the configured expectation.
assert len(all_extra_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS'], len(all_extra_results_dirs)

'/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/*'

42

In [7]:
# Peek at a few of the matched paths (glob does not return them in sorted order)
all_extra_results_dirs[:5]

['/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/PGC_ADHD_EUR_2017',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/CNCR_Insomnia_all',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/IMMUNOBASE_Systemic_lupus_erythematosus_hg19',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/Astle_et_al_2016_Reticulocyte_count',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/Astle_et_al_2016_Granulocyte_count']

In [8]:
# Regular expression handed to PhenoResults for the GTEx GWAS result files;
# its named groups capture a `code` and a `tissue` (presumably matched against
# each result file name -- see results.spredixcan.PhenoResults).
# Written as a raw string so that `\.` reaches the regex engine untouched
# instead of being an invalid string escape sequence.
_file_pattern = r'spredixcan_igwas_gtexmashrv8_(?P<code>[^/]+)__PM__(?P<tissue>.+)\.csv$'
all_extra_phenotypes = [PhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# These objects wrap the S-PrediXcan directories listed above, so they are
# validated against the same S-PrediXcan expectation used for that listing.
assert len(all_extra_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS'], len(all_extra_phenotypes)

42

# S-PrediXcan: direction of effect

## Effect direction: consensus

In [9]:
from results.spredixcan import PhenoResults

In [10]:
# from scipy import stats

# stats.norm.cdf(-1.96) * 2

# def _get_effect_direction(zscores):
#     pvalues = pd.Series(stats.norm.cdf(np.abs(zscores) * -1.0) * 2.0, index=zscores.index.tolist())
#     pvalues = pvalues[pvalues < pval_threshold]

#     if pvalues.shape[0] == 0:
#         return 0.0

#     zscores = zscores.loc[pvalues.index]
#     zscores_sign = np.sign(zscores)
#     return zscores_sign.value_counts().sort_values(ascending=False).index[0]

# _tmp = all_spredixcan_phenotypes[2]
# display(_tmp)

# data_dict = {t:_tmp.get_tissue_data(t, 'zscore', index_col='gene_simple') for t in _tmp.tissues}

# data = pd.DataFrame(data_dict)

# data.head()

# assert not data.isin([np.inf, -np.inf]).any().any()

# zscores = data.loc['ENSG00000137185']

# pval_threshold=1e-4

# pvalues = pd.Series(stats.norm.cdf(np.abs(zscores) * -1.0) * 2.0, index=zscores.index.tolist())

# pvalues = pvalues[pvalues < pval_threshold]

# pvalues

# zscores = zscores.loc[pvalues.index]

# zscores

# zscores_sign = np.sign(zscores)

# zscores_sign.value_counts().sort_values(ascending=False).index[0]

### Compute results

In [11]:
def _get_combined_results(phenos):
    return {
        pheno.pheno_info.get_plain_name():
            pheno.get_consensus_effect_direction()
        for pheno in phenos
    }

In [12]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """
    Run `_get_combined_results` on every chunk in a pool of worker processes.

    Args:
        phenotype_chunks: iterable of phenotype groups; each group is submitted
            to the pool as a single task.
        n_jobs: maximum number of worker processes.

    Returns:
        A single dict with the entries of all per-chunk dicts merged together.
        Chunks are merged in completion order, so on a repeated key the value
        from the chunk that finishes last is kept.

    Raises:
        Any exception raised inside a worker is re-raised here when that
        task's result is collected.
    """
    merged = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = [pool.submit(_get_combined_results, chunk) for chunk in phenotype_chunks]
        for finished in as_completed(pending):
            merged.update(finished.result())

    return merged

In [13]:
# Small-subset variant, kept for quick test runs:
# phenotype_chunks = chunker(all_spredixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)
# Full run over both sources; each chunk is later submitted as one task.
# Presumably 25 is the chunk size -- confirm against utils.chunker.
# NOTE(review): if chunker returns a one-shot iterator, this cell must be re-run before re-running the next one.
phenotype_chunks = chunker(all_spredixcan_phenotypes + all_extra_phenotypes, 25)

In [14]:
# NOTE(review): n_jobs=20 overrides _run_all's default (conf.N_JOBS_HIGH) -- confirm the hard-coded value is intended
all_results = _run_all(phenotype_chunks, n_jobs=20)

In [15]:
# Number of entries collected (the results dict is keyed by phenotype plain name)
len(all_results)

4091

### Create DataFrame

In [16]:
# Expected number of phenotypes in all_results: the S-PrediXcan expectations of
# the two sources whose results were combined (Rapid GWAS project + GTEx GWAS).
# The S-PrediXcan settings are used because they are the ones that validated
# the directory listings these results were computed from.
_n_expected_phenos = (
    conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']
    + conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']
)
display(_n_expected_phenos)
# A lower count here can also mean that two phenotypes shared the same plain
# name and one overwrote the other while the per-chunk dicts were merged.
assert len(all_results) == _n_expected_phenos, len(all_results)

4091

In [17]:
# Build the results frame with one column per key of all_results.
# Values are stored with the category dtype to make storage/loading cheaper.
spredixcan_genes_effect_directions = pd.DataFrame(all_results, dtype='category')
spredixcan_genes_effect_directions.index.name = 'gene_name'

# Each gene must appear only once in the index
assert spredixcan_genes_effect_directions.index.is_unique

display(
    spredixcan_genes_effect_directions.shape,
    spredixcan_genes_effect_directions.head(),
)

(22518, 4091)

Unnamed: 0_level_0,C_OTHER_SKIN-Other_malignant_neoplasms_of_skin,40001_C56-Underlying_primary_cause_of_death_ICD10_C56_Malignant_neoplasm_of_ovary,2237-Plays_computer_games,I9_STR-Stroke_excluding_SAH,2405-Number_of_children_fathered,2316-Wheeze_or_whistling_in_the_chest_in_last_year,22601_35623273-Job_coding_personnel_or_industrial_relations_officer_recruitment_consultant,20003_1140923670-Treatmentmedication_code_gtn_glyceryl_trinitrate,20002_1453-Noncancer_illness_code_selfreported_psoriasis,100014_raw-Folate,...,Astle_et_al_2016_Sum_eosinophil_basophil_counts,ENIGMA_Intracraneal_Volume,RA_OKADA_TRANS_ETHNIC,SSGAC_Education_Years_Pooled,GIANT_HEIGHT,EAGLE_Eczema,IGAP_Alzheimer,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,Astle_et_al_2016_Monocyte_count,IBD.EUR.Inflammatory_Bowel_Disease
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000457,,,-1.0,,,,,,,,...,,,,,,,,,,
ENSG00000000460,,,,,,,,,,,...,-1.0,,,,,,,,,
ENSG00000000938,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000971,,,,,,,,,,,...,,,,,,,,,,


In [18]:
# For this method we should not remove nans
# Remove genes with no results
#spredixcan_genes_effect_directions = spredixcan_genes_effect_directions.dropna(axis=0, how='all')

In [19]:
# Total number of missing (NaN) entries over the whole frame:
# per-column counts first, then summed across columns
spredixcan_genes_effect_directions.isna().sum().sum()

91634228

In [20]:
# Distinct non-missing values in the whole frame; they must be exactly -1, 0 and 1
_tmp = (
    pd.Series(spredixcan_genes_effect_directions.values.flatten())
    .dropna()
    .astype(float)
    .unique()
)
display(_tmp)
assert set(_tmp) == {0, 1, -1}, set(_tmp)

array([-1.,  1.,  0.])

In [21]:
display(f'Results shape: {spredixcan_genes_effect_directions.shape}')

# Rows are genes (22518 expected); columns must match the expected number of phenotypes
assert spredixcan_genes_effect_directions.shape == (22518, _n_expected_phenos), spredixcan_genes_effect_directions.shape

'Results shape: (22518, 4091)'

## Testing

In [22]:
# _tmp.loc['CNCR_Insomnia_all'].sort_values(ascending=False).head()

# _tmp = all_spredixcan_phenotypes[2]

# _tmp.pheno_info.get_plain_name()

# data_dict = {t:_tmp.get_tissue_data(t, 'zscore', index_col='gene_simple') for t in _tmp.tissues}

# data = pd.DataFrame(data_dict)

# data.max(axis=1).sort_values()

In [23]:
# Show a small gene x phenotype slice for manual inspection:
# rows are selected by gene ID, columns by phenotype plain name
spredixcan_genes_effect_directions.loc[
    [
        'ENSG00000137185',
        'ENSG00000000457',
        'ENSG00000095464',
        'ENSG00000228397',
        'ENSG00000279325',
        'ENSG00000000419',
    ],
    [
        '6157_3-Why_stopped_smoking_Health_precaution',
        'I50-Diagnoses_main_ICD10_I50_Heart_failure',
        'CNCR_Insomnia_all',
    ]
]

Unnamed: 0_level_0,6157_3-Why_stopped_smoking_Health_precaution,I50-Diagnoses_main_ICD10_I50_Heart_failure,CNCR_Insomnia_all
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000137185,-1.0,,
ENSG00000000457,,,
ENSG00000095464,,1.0,
ENSG00000228397,,,1.0
ENSG00000279325,,,-1.0
ENSG00000000419,,,


In [24]:
# Pin known values for selected gene/phenotype pairs (NaN is checked with pd.isnull)

# Phenotype 6157_3
assert spredixcan_genes_effect_directions.loc['ENSG00000137185', '6157_3-Why_stopped_smoking_Health_precaution'] == -1.0
assert pd.isnull(spredixcan_genes_effect_directions.loc['ENSG00000000457', '6157_3-Why_stopped_smoking_Health_precaution'])
assert pd.isnull(spredixcan_genes_effect_directions.loc['ENSG00000095464', '6157_3-Why_stopped_smoking_Health_precaution'])

# Phenotype I50
assert spredixcan_genes_effect_directions.loc['ENSG00000095464', 'I50-Diagnoses_main_ICD10_I50_Heart_failure'] == 1.0

# Phenotype CNCR_Insomnia_all
assert pd.isnull(spredixcan_genes_effect_directions.loc['ENSG00000137185', 'CNCR_Insomnia_all'])
assert spredixcan_genes_effect_directions.loc['ENSG00000228397', 'CNCR_Insomnia_all'] == 1.0
assert spredixcan_genes_effect_directions.loc['ENSG00000279325', 'CNCR_Insomnia_all'] == -1.0

The code below was used to write the asserts above: for each gene, check whether the first and last rows of the sorted output (i.e. the minimum and maximum values) agree with the sign asserted above.

In [25]:
# Results directories of both sources; each is passed to a %%bash cell below as its first argument ($1)
rapid_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject']
gtex_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS']

In [26]:
%%bash -s "$rapid_gwas_dir"
# $1 is the results directory passed in by the cell magic.
# The path is quoted so directories containing spaces work, and the script
# aborts if the directory is missing so *.csv is never expanded in another
# working directory.
cd "$1/6157_3" || exit 1
# For each gene: take the first three comma-separated columns of every CSV
# file, keep the rows mentioning the gene, and sort them numerically by the
# third column.
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000137185"' ::: *.csv | sort -k3 -g
echo ""
# For the next two genes only the first and last sorted rows are printed.
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000095464"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000137185.11  ZSCAN9               -5.83616067550634
ENSG00000137185.11  ZSCAN9               -5.792628572580804
ENSG00000137185.11  ZSCAN9               -5.681816314702161
ENSG00000137185.11  ZSCAN9               -5.412943710075788
ENSG00000137185.11  ZSCAN9               -5.352010726928711
ENSG00000137185.11  ZSCAN9               -5.279010199840854
ENSG00000137185.11  ZSCAN9               -5.108492106415525
ENSG00000137185.11  ZSCAN9               -5.079929626396137
ENSG00000137185.11  ZSCAN9               -5.063558049159138
ENSG00000137185.11  ZSCAN9               -5.043577015552067
ENSG00000137185.11  ZSCAN9               -5.003944335955496
ENSG00000137185.11  ZSCAN9               -5.001958915404419
ENSG00000137185.11  ZSCAN9               -4.975072786720918
ENSG00000137185.11  ZSCAN9               -4.896579024471048
ENSG00000137185.11  ZSCAN9               -4.890546437827829
ENSG00000137185.11  ZSCAN9               -4.820902875527851
ENSG00000137185.11  ZSCAN9               

In [27]:
%%bash -s "$rapid_gwas_dir"
# $1 is the results directory passed in by the cell magic.
# The path is quoted so directories containing spaces work, and the script
# aborts if the directory is missing so *.csv is never expanded in another
# working directory.
cd "$1/I50" || exit 1
# Take the first three comma-separated columns of every CSV file, keep the
# rows mentioning the gene, and sort them numerically by the third column.
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000095464"' ::: *.csv | sort -k3 -g

ENSG00000095464.9   PDE6C                NA
ENSG00000095464.9   PDE6C                -2.987583875656128
ENSG00000095464.9   PDE6C                -1.706084537916532
ENSG00000095464.9   PDE6C                -1.6929545931506047
ENSG00000095464.9   PDE6C                -1.6694166874361254
ENSG00000095464.9   PDE6C                -1.6645126380746706
ENSG00000095464.9   PDE6C                -1.658251762390137
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.6582517623901367
ENSG00000095464.9   PDE6C                -1.652789255603333
ENSG00000095464.9   PDE6C                -1.6400961218813983
ENSG00000095464.9   PDE6C                -1.6347826064398305
ENSG00000095464.9   PDE6C                -1.6275348613259335
ENSG00000095464.9   PDE6C                -1.6171702146530154
ENSG00000095464.9   PDE6C                -1.6171702146530154
ENSG00000095464.9   PDE6C                -1.6

In [28]:
%%bash -s "$gtex_gwas_dir"
# $1 is the results directory passed in by the cell magic.
# The path is quoted so directories containing spaces work, and the script
# aborts if the directory is missing so *.csv is never expanded in another
# working directory.
cd "$1/CNCR_Insomnia_all" || exit 1
# For each gene: take the first three comma-separated columns of every CSV
# file, keep the rows mentioning the gene, and sort them numerically by the
# third column. The commented-out sed filter would keep only the first and
# last sorted rows.
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000137185"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000228397"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000279325"' ::: *.csv | sort -k3 -g # | sed -e 1b -e '$!d'

ENSG00000137185.11  ZSCAN9               -2.0075128138942824
ENSG00000137185.11  ZSCAN9               -1.9716453552246094
ENSG00000137185.11  ZSCAN9               -1.9451855273280618
ENSG00000137185.11  ZSCAN9               -1.8545247091875776
ENSG00000137185.11  ZSCAN9               -1.7974615020480318
ENSG00000137185.11  ZSCAN9               -1.6697390108898214
ENSG00000137185.11  ZSCAN9               -1.637959840329144
ENSG00000137185.11  ZSCAN9               -1.5990879264599478
ENSG00000137185.11  ZSCAN9               -1.5792834503640862
ENSG00000137185.11  ZSCAN9               -1.5003131550674709
ENSG00000137185.11  ZSCAN9               -1.4885983833529386
ENSG00000137185.11  ZSCAN9               -1.4863373695166933
ENSG00000137185.11  ZSCAN9               -1.4656920469620405
ENSG00000137185.11  ZSCAN9               -1.4643227059342168
ENSG00000137185.11  ZSCAN9               -1.4559472334714858
ENSG00000137185.11  ZSCAN9               -1.4441600680748996
ENSG00000137185.11  ZSCAN

### Save

In [29]:
# Dimensions of the frame about to be saved (rows, columns)
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [30]:
# Preview the first rows of the frame about to be saved
spredixcan_genes_effect_directions.head()

Unnamed: 0_level_0,C_OTHER_SKIN-Other_malignant_neoplasms_of_skin,40001_C56-Underlying_primary_cause_of_death_ICD10_C56_Malignant_neoplasm_of_ovary,2237-Plays_computer_games,I9_STR-Stroke_excluding_SAH,2405-Number_of_children_fathered,2316-Wheeze_or_whistling_in_the_chest_in_last_year,22601_35623273-Job_coding_personnel_or_industrial_relations_officer_recruitment_consultant,20003_1140923670-Treatmentmedication_code_gtn_glyceryl_trinitrate,20002_1453-Noncancer_illness_code_selfreported_psoriasis,100014_raw-Folate,...,Astle_et_al_2016_Sum_eosinophil_basophil_counts,ENIGMA_Intracraneal_Volume,RA_OKADA_TRANS_ETHNIC,SSGAC_Education_Years_Pooled,GIANT_HEIGHT,EAGLE_Eczema,IGAP_Alzheimer,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,Astle_et_al_2016_Monocyte_count,IBD.EUR.Inflammatory_Bowel_Disease
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000457,,,-1.0,,,,,,,,...,,,,,,,,,,
ENSG00000000460,,,,,,,,,,,...,-1.0,,,,,,,,,
ENSG00000000938,,,,,,,,,,,...,,,,,,,,,,
ENSG00000000971,,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Destination of the pickled results, inside the gene associations directory.
# The file name is a plain string literal: it has no placeholders, so no
# f-string is needed.
spredixcan_genes_effect_directions_filename = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-consensus.pkl.xz')
display(spredixcan_genes_effect_directions_filename)

'/mnt/phenomexcan_base/gene_assoc/spredixcan-mashr-effect_direction-consensus.pkl.xz'

In [32]:
# Write the frame as a pickle; with pandas' default settings the compression is inferred from the '.xz' suffix
spredixcan_genes_effect_directions.to_pickle(spredixcan_genes_effect_directions_filename)

### Save in HDF5 format for webapp

In [33]:
# Read the frame back from the pickle file written earlier in this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by this pipeline.
spredixcan_genes_effect_directions = pd.read_pickle(spredixcan_genes_effect_directions_filename)

In [34]:
# Dimensions of the reloaded frame (rows, columns)
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [35]:
from utils import simplify_string_for_hdf5

In [36]:
# Path of the HDF5 copy of the results, stored next to the pickle in the gene associations directory
OUTPUT_HDF5_FILE = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-consensus.h5')
display(OUTPUT_HDF5_FILE)

'/mnt/phenomexcan_base/gene_assoc/spredixcan-mashr-effect_direction-consensus.h5'

In [37]:
# Write each column of the frame as its own entry of a fresh HDF5 file.
# mode='w' recreates the file; complevel=1 turns on light compression.
with pd.HDFStore(OUTPUT_HDF5_FILE, mode='w', complevel=1) as store:
    for col in spredixcan_genes_effect_directions.columns:
        # column names are converted into keys with the project's helper
        clean_col = simplify_string_for_hdf5(col)
        # values are stored as floats instead of the in-memory category dtype
        store.put(clean_col, spredixcan_genes_effect_directions[col].astype(float))

In [38]:
# testing: re-open the HDF5 file read-only and verify its contents
with pd.HDFStore(OUTPUT_HDF5_FILE, mode='r') as store:
    # one stored key per column of the frame; fewer keys would mean that
    # two column names were mapped to the same key
    store_keys = list(store.keys())
    assert len(store_keys) == spredixcan_genes_effect_directions.shape[1]
    display(store_keys[:5])
    
    # phenotype 6157_3: same values as asserted on the in-memory frame
    clean_col = simplify_string_for_hdf5('6157_3-Why_stopped_smoking_Health_precaution')
    data = store[clean_col]
    # one value per gene
    assert data.shape == (22518,), data.shape
    assert data.loc['ENSG00000137185'] == -1.0
    assert pd.isnull(data.loc['ENSG00000000457'])
    assert pd.isnull(data.loc['ENSG00000095464'])
    
    # phenotype I50
    clean_col = simplify_string_for_hdf5('I50-Diagnoses_main_ICD10_I50_Heart_failure')
    data = store[clean_col]
    assert data.shape == (22518,), data.shape
    assert data.loc['ENSG00000095464'] == 1.0
    
    # phenotype CNCR_Insomnia_all
    clean_col = simplify_string_for_hdf5('CNCR_Insomnia_all')
    data = store[clean_col]
    assert data.shape == (22518,), data.shape
    assert pd.isnull(data.loc['ENSG00000137185'])
    assert data.loc['ENSG00000228397'] == 1.0
    assert data.loc['ENSG00000279325'] == -1.0

['/c100001_raw_Food_weight',
 '/c100002_raw_Energy',
 '/c100003_raw_Protein',
 '/c100004_raw_Fat',
 '/c100005_raw_Carbohydrate']