In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker

# Load S-PrediXcan results

## From Rapid GWAS project

In [3]:
from results.spredixcan import PhenoResults

In [4]:
# Glob pattern matching every entry directly under the Rapid GWAS project
# results directory. The directory and the wildcard are passed to os.path.join
# as separate components so that the join actually builds the path (a single
# pre-concatenated argument makes os.path.join a no-op).
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject'], '*')
display(_path)
all_spredixcan_results_dirs = glob(_path)
display(len(all_spredixcan_results_dirs))
# Fail early if the number of entries found differs from the configured count.
assert len(all_spredixcan_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

'/mnt/phenomexcan_base/results/spredixcan/rapid_gwas_project/*'

4049

In [5]:
# Build one PhenoResults object per Rapid GWAS results directory found above.
all_spredixcan_phenotypes = list(map(PhenoResults, all_spredixcan_results_dirs))

# Show how many objects were created and check it against the configured count.
_n_rapid_gwas_phenotypes = len(all_spredixcan_phenotypes)
display(_n_rapid_gwas_phenotypes)
assert _n_rapid_gwas_phenotypes == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

4049

## From GTEx GWAS manuscript

In [6]:
# Glob pattern matching every entry directly under the GTEx GWAS results
# directory. The directory and the wildcard are passed to os.path.join as
# separate components so that the join actually builds the path (a single
# pre-concatenated argument makes os.path.join a no-op).
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
all_extra_results_dirs = glob(_path)
display(len(all_extra_results_dirs))
# Fail early if the number of entries found differs from the configured count.
assert len(all_extra_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

'/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/*'

42

In [7]:
# Peek at the first five GTEx GWAS result paths returned by glob.
all_extra_results_dirs[:5]

['/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/PGC_ADHD_EUR_2017',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/CNCR_Insomnia_all',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/IMMUNOBASE_Systemic_lupus_erythematosus_hg19',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/Astle_et_al_2016_Reticulocyte_count',
 '/mnt/phenomexcan_base/results/spredixcan/gtex_gwas/Astle_et_al_2016_Granulocyte_count']

In [8]:
# File-name pattern handed to PhenoResults for the GTEx GWAS results; it defines
# the named groups `code` and `tissue` and requires a `.csv` suffix. It is a raw
# string so that `\.` reaches the regex engine as an escaped dot instead of
# relying on Python passing through an unrecognized string escape (which emits
# an invalid-escape warning).
_file_pattern = r'spredixcan_igwas_gtexmashrv8_(?P<code>[^/]+)__PM__(?P<tissue>.+)\.csv$'
all_extra_phenotypes = [PhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
# Plain phenotype names of the GTEx GWAS traits, kept as a pandas Index.
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# These are S-PrediXcan results, so they are validated against the S-PrediXcan
# expected count (the same configuration entry used to check the directory
# listing), not the S-MultiXcan one.
assert len(all_extra_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

42

# S-PrediXcan: direction of effect

## Effect direction: most significant

### Compute results

In [9]:
def _get_combined_results(phenos):
    return {
        pheno.pheno_info.get_plain_name():
            pheno.get_most_significant_effect_direction()
        for pheno in phenos
    }

In [10]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """Run ``_get_combined_results`` on every chunk using a process pool.

    Parameters
    ----------
    phenotype_chunks : iterable
        Each element is one group of phenotype objects; one task is submitted
        per element.
    n_jobs : int
        Maximum number of worker processes.

    Returns
    -------
    dict
        The per-chunk dictionaries merged into one. Chunks are merged in
        completion order, so on a key collision the result that finishes last
        overwrites the earlier one.
    """
    merged = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = []
        for chunk in phenotype_chunks:
            pending.append(pool.submit(_get_combined_results, chunk))

        # result() re-raises any exception raised inside the worker.
        for finished in as_completed(pending):
            merged.update(finished.result())

    return merged

In [11]:
# Quick-test alternative (a few phenotypes from each source, chunks of 2):
# phenotype_chunks = chunker(all_spredixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)

# Full run: both phenotype lists concatenated and passed to chunker with size 25.
_all_phenotypes = all_spredixcan_phenotypes + all_extra_phenotypes
phenotype_chunks = chunker(_all_phenotypes, 25)

In [12]:
# Process all chunks in a process pool, overriding the default worker count
# with 20 processes.
all_results = _run_all(phenotype_chunks, n_jobs=20)

In [13]:
# Number of entries collected (one dictionary key per plain phenotype name).
len(all_results)

4091

### Create DataFrame

In [14]:
# Total number of phenotypes expected: the sum of the S-PrediXcan expected
# counts of the two sources loaded in this notebook. The S-PrediXcan
# configuration is used (instead of the S-MultiXcan one) because these are
# S-PrediXcan results, consistent with the per-source checks done when loading.
_n_expected_phenos = sum(
    conf.SPREDIXCAN_EXPECTED_PHENOTYPES[_source]
    for _source in ('RapidGWASProject', 'GTEX_GWAS')
)
display(_n_expected_phenos)
# A lower count here would also reveal plain-name collisions between phenotypes,
# since results are keyed by plain name.
assert len(all_results) == _n_expected_phenos, len(all_results)

4091

In [15]:
# Build the table with one column per phenotype (the keys of all_results).
# The category dtype is used for efficiency in storage/loading.
spredixcan_genes_effect_directions = pd.DataFrame(all_results, dtype='category')
spredixcan_genes_effect_directions.index.name = 'gene_name'

# Every row label must appear only once.
assert spredixcan_genes_effect_directions.index.is_unique

display(
    spredixcan_genes_effect_directions.shape,
    spredixcan_genes_effect_directions.head(),
)

(22518, 4091)

Unnamed: 0_level_0,22617_9219-Job_SOC_coding_Elementary_office_occupations_nec,6157_3-Why_stopped_smoking_Health_precaution,I50-Diagnoses_main_ICD10_I50_Heart_failure,20003_1140874370-Treatmentmedication_code_aciclovir,N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria,20002_1585-Noncancer_illness_code_selfreported_mitral_regurgitation_incompetence,6138_6-Qualifications_Other_professional_qualifications_eg_nursing_teaching,20003_1140869930-Treatmentmedication_code_azathioprine,102800-Cheese_consumers,40001_C159-Underlying_primary_cause_of_death_ICD10_C159_Oesophagus_unspecified,...,Astle_et_al_2016_Sum_eosinophil_basophil_counts,ENIGMA_Intracraneal_Volume,RA_OKADA_TRANS_ETHNIC,SSGAC_Education_Years_Pooled,GIANT_HEIGHT,EAGLE_Eczema,IGAP_Alzheimer,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,Astle_et_al_2016_Monocyte_count,IBD.EUR.Inflammatory_Bowel_Disease
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,1.0,-1.0,1.0,-1.0,1.0,1.0,-1.0,1.0,1.0,-1.0
ENSG00000000457,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0
ENSG00000000460,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0
ENSG00000000938,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,0.0,1.0,...,-1.0,-1.0,-1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
ENSG00000000971,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [16]:
# Remove genes with no results
#spredixcan_genes_effect_directions = spredixcan_genes_effect_directions.dropna(axis=0, how='all')

In [17]:
# How many entries are NaN across the whole table
# (inner sum: NaN count per column; outer sum: grand total).
spredixcan_genes_effect_directions.isna().sum().sum()

1065328

In [18]:
# Flatten the table row by row, drop missing entries and list the distinct
# values (as floats) in order of first appearance.
_flat_directions = pd.Series(spredixcan_genes_effect_directions.values.ravel())
_flat_directions.dropna().astype(float).unique()

array([-1.,  1.,  0.])

In [19]:
# Show the final shape and verify it: 22518 rows (hard-coded row count for this
# dataset) and one column per expected phenotype.
display(f'Results shape: {spredixcan_genes_effect_directions.shape}')

assert spredixcan_genes_effect_directions.shape == (22518, _n_expected_phenos), spredixcan_genes_effect_directions.shape

'Results shape: (22518, 4091)'

## Testing

In [20]:
# Genes and phenotypes used for the manual spot checks in this section.
_spot_check_genes = [
    'ENSG00000000419',
    'ENSG00000000457',
    'ENSG00000000460',
    'ENSG00000186090', # zero
    'ENSG00000007202', # zero
]
_spot_check_phenotypes = [
    'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria',
    'Astle_et_al_2016_Reticulocyte_count',
    'PGC_ADHD_EUR_2017',
    'IMMUNOBASE_Systemic_lupus_erythematosus_hg19',
]

# Display the selected genes x phenotypes sub-table.
spredixcan_genes_effect_directions.loc[_spot_check_genes, _spot_check_phenotypes]

Unnamed: 0_level_0,N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria,Astle_et_al_2016_Reticulocyte_count,PGC_ADHD_EUR_2017,IMMUNOBASE_Systemic_lupus_erythematosus_hg19
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000419,-1.0,-1.0,-1.0,-1.0
ENSG00000000457,1.0,1.0,1.0,1.0
ENSG00000000460,-1.0,1.0,-1.0,1.0
ENSG00000186090,0.0,0.0,0.0,0.0
ENSG00000007202,0.0,0.0,0.0,0.0


In [21]:
# Spot checks of individual (gene, phenotype) entries, grouped by phenotype.
_directions = spredixcan_genes_effect_directions

_pheno = 'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria'
assert _directions.loc['ENSG00000000419', _pheno] == -1.0
assert _directions.loc['ENSG00000000457', _pheno] == 1.0
assert _directions.loc['ENSG00000000460', _pheno] == -1.0

_pheno = 'Astle_et_al_2016_Reticulocyte_count'
assert _directions.loc['ENSG00000000419', _pheno] == -1.0
assert _directions.loc['ENSG00000000457', _pheno] == 1.0
assert _directions.loc['ENSG00000000460', _pheno] == 1.0

_pheno = 'PGC_ADHD_EUR_2017'
assert _directions.loc['ENSG00000186090', _pheno] == 0.0
assert _directions.loc['ENSG00000007202', _pheno] == 0.0

_pheno = 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19'
assert _directions.loc['ENSG00000007202', _pheno] == 0.0

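The code below was used to write the assertions above. For each gene, it prints the first and last rows (i.e., the minimum and maximum values of the third column) across all result files of a phenotype, so that their signs can be compared with the effect directions asserted above.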

In [22]:
# Results directories of both sources, stored in plain variables so they can be
# passed as arguments to the %%bash cells below.
rapid_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject']
gtex_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS']

In [23]:
%%bash -s "$rapid_gwas_dir"
# For each gene: read every CSV file in the N02 directory, keep the first three
# comma-separated fields, select the rows mentioning the gene ID, sort them by
# the third field (general numeric order) and print only the first and last rows.
# The directory argument is quoted and a failed `cd` aborts the cell, so the
# pipelines never run in the wrong working directory.
cd "$1/N02" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000419"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000460"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000000419.12  DPM1                 -0.7218472957611084
ENSG00000000419.12  DPM1                 -0.010349205695092676

ENSG00000000457.13  SCYL3                -0.5192564725875854
ENSG00000000457.13  SCYL3                1.358436666457161

ENSG00000000460.16  C1orf112             -2.2396101272579236
ENSG00000000460.16  C1orf112             2.2287585735321045


In [24]:
%%bash -s "$gtex_gwas_dir"
# For each gene: read every CSV file in the phenotype directory, keep the first
# three comma-separated fields, select the rows mentioning the gene ID, sort
# them by the third field (general numeric order) and print only the first and
# last rows.
# The directory argument is quoted and a failed `cd` aborts the cell, so the
# pipelines never run in the wrong working directory.
cd "$1/Astle_et_al_2016_Reticulocyte_count" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000419"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000460"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000000419.12  DPM1                 -0.8962860703468323
ENSG00000000419.12  DPM1                 -0.10965126752853394

ENSG00000000457.13  SCYL3                -0.8681464019116717
ENSG00000000457.13  SCYL3                1.8784626842063896

ENSG00000000460.16  C1orf112             -1.39273476600647
ENSG00000000460.16  C1orf112             1.8037986617280548


In [25]:
%%bash -s "$gtex_gwas_dir"
# For each gene: read every CSV file in the phenotype directory, keep the first
# three comma-separated fields, select the rows mentioning the gene ID, sort
# them by the third field (general numeric order) and print only the first and
# last rows.
# The directory argument is quoted and a failed `cd` aborts the cell, so the
# pipelines never run in the wrong working directory.
cd "$1/PGC_ADHD_EUR_2017" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000186090"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000007202"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000186090.10  HTR3D                0.0

ENSG00000007202.14  KIAA0100             -0.6554159522056578
ENSG00000007202.14  KIAA0100             0.655415952205658


In [26]:
%%bash -s "$gtex_gwas_dir"
# Read every CSV file in the phenotype directory, keep the first three
# comma-separated fields, select the rows mentioning the gene ID, sort them by
# the third field (general numeric order) and print only the first and last rows.
# The directory argument is quoted and a failed `cd` aborts the cell, so the
# pipeline never runs in the wrong working directory.
cd "$1/IMMUNOBASE_Systemic_lupus_erythematosus_hg19" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000007202"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

ENSG00000007202.14  KIAA0100             0.0
ENSG00000007202.14  KIAA0100             0.0


### Save

In [27]:
# Final look at the table dimensions before saving.
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [28]:
# Preview the first rows of the table that is about to be saved.
spredixcan_genes_effect_directions.head()

Unnamed: 0_level_0,22617_9219-Job_SOC_coding_Elementary_office_occupations_nec,6157_3-Why_stopped_smoking_Health_precaution,I50-Diagnoses_main_ICD10_I50_Heart_failure,20003_1140874370-Treatmentmedication_code_aciclovir,N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria,20002_1585-Noncancer_illness_code_selfreported_mitral_regurgitation_incompetence,6138_6-Qualifications_Other_professional_qualifications_eg_nursing_teaching,20003_1140869930-Treatmentmedication_code_azathioprine,102800-Cheese_consumers,40001_C159-Underlying_primary_cause_of_death_ICD10_C159_Oesophagus_unspecified,...,Astle_et_al_2016_Sum_eosinophil_basophil_counts,ENIGMA_Intracraneal_Volume,RA_OKADA_TRANS_ETHNIC,SSGAC_Education_Years_Pooled,GIANT_HEIGHT,EAGLE_Eczema,IGAP_Alzheimer,Astle_et_al_2016_Sum_neutrophil_eosinophil_counts,Astle_et_al_2016_Monocyte_count,IBD.EUR.Inflammatory_Bowel_Disease
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,...,1.0,-1.0,1.0,-1.0,1.0,1.0,-1.0,1.0,1.0,-1.0
ENSG00000000457,1.0,-1.0,-1.0,-1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,-1.0
ENSG00000000460,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,1.0,...,-1.0,-1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,1.0,1.0
ENSG00000000938,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,0.0,1.0,...,-1.0,-1.0,-1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
ENSG00000000971,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,...,1.0,1.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [29]:
# Output path of the pickled table, inside the gene associations directory.
# The file name is a plain string literal: it has no placeholders, so no f-string
# is needed.
spredixcan_genes_effect_directions_filename = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-most_signif.pkl.xz')
display(spredixcan_genes_effect_directions_filename)

'/mnt/phenomexcan_base/gene_assoc/spredixcan-mashr-effect_direction-most_signif.pkl.xz'

In [30]:
# Write the table to the pickle path defined above.
# NOTE(review): the `.xz` suffix presumably relies on pandas inferring the
# compression from the file extension (`compression='infer'`) — confirm for the
# pandas version in use.
spredixcan_genes_effect_directions.to_pickle(spredixcan_genes_effect_directions_filename)

### Save in HDF5 format for webapp

In [31]:
# Reload the table from the pickle file written above (presumably so this
# section can be run without recomputing the results).
# NOTE: only unpickle files from trusted sources; this file is produced by this
# notebook.
spredixcan_genes_effect_directions = pd.read_pickle(spredixcan_genes_effect_directions_filename)

In [32]:
# Check the dimensions of the reloaded table.
spredixcan_genes_effect_directions.shape

(22518, 4091)

In [33]:
from utils import simplify_string_for_hdf5

In [34]:
# Path of the HDF5 export, placed in the same directory as the pickle file.
OUTPUT_HDF5_FILE = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-most_signif.h5')
display(OUTPUT_HDF5_FILE)

'/mnt/phenomexcan_base/gene_assoc/spredixcan-mashr-effect_direction-most_signif.h5'

In [35]:
# Store every phenotype column as its own float Series, keyed by the phenotype
# name after passing it through simplify_string_for_hdf5. The file is opened in
# 'w' mode, so an existing file at that path is replaced.
with pd.HDFStore(OUTPUT_HDF5_FILE, mode='w', complevel=1) as store:
    for col, _column_values in spredixcan_genes_effect_directions.items():
        clean_col = simplify_string_for_hdf5(col)
        store[clean_col] = _column_values.astype(float)

In [36]:
# testing: read the HDF5 file back and spot-check the same entries verified on
# the in-memory table.
_hdf5_spot_checks = [
    (
        'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria',
        [('ENSG00000000419', -1.0), ('ENSG00000000457', 1.0), ('ENSG00000000460', -1.0)],
    ),
    (
        'Astle_et_al_2016_Reticulocyte_count',
        [('ENSG00000000419', -1.0), ('ENSG00000000457', 1.0), ('ENSG00000000460', 1.0)],
    ),
    (
        'PGC_ADHD_EUR_2017',
        [('ENSG00000186090', 0.0), ('ENSG00000007202', 0.0)],
    ),
    (
        'IMMUNOBASE_Systemic_lupus_erythematosus_hg19',
        [('ENSG00000007202', 0.0)],
    ),
]

with pd.HDFStore(OUTPUT_HDF5_FILE, mode='r') as store:
    # One stored key is expected per phenotype column.
    store_keys = list(store.keys())
    assert len(store_keys) == spredixcan_genes_effect_directions.shape[1]
    display(store_keys[:5])

    for _pheno_name, _expected_directions in _hdf5_spot_checks:
        clean_col = simplify_string_for_hdf5(_pheno_name)
        data = store[clean_col]
        # Each stored object is a one-dimensional Series with one entry per gene.
        assert data.shape == (22518,), data.shape
        for _gene_id, _expected in _expected_directions:
            assert data.loc[_gene_id] == _expected

['/c100001_raw_Food_weight',
 '/c100002_raw_Energy',
 '/c100003_raw_Protein',
 '/c100004_raw_Fat',
 '/c100005_raw_Carbohydrate']