In [None]:
# Enable the autoreload extension in mode 2 so edits to imported modules are picked up
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker

# Load S-PrediXcan results

## From Rapid GWAS project

In [None]:
from results.spredixcan import PhenoResults

In [None]:
# Glob pattern matching every entry directly under the Rapid GWAS results directory.
# The wildcard is passed to os.path.join as its own component instead of being
# concatenated by hand.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject'], '*')
display(_path)
# glob() returns entries in a filesystem-dependent order; sort for reproducible runs
all_spredixcan_results_dirs = sorted(glob(_path))
display(len(all_spredixcan_results_dirs))
assert len(all_spredixcan_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

In [None]:
# Wrap every Rapid GWAS results directory in a PhenoResults object
all_spredixcan_phenotypes = list(map(PhenoResults, all_spredixcan_results_dirs))

_n_rapid_gwas_phenotypes = len(all_spredixcan_phenotypes)
display(_n_rapid_gwas_phenotypes)
assert _n_rapid_gwas_phenotypes == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

## From GTEx GWAS manuscript

In [None]:
# Glob pattern matching every entry directly under the GTEx GWAS results directory.
# The wildcard is passed to os.path.join as its own component instead of being
# concatenated by hand.
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
# glob() returns entries in a filesystem-dependent order; sort for reproducible runs
all_extra_results_dirs = sorted(glob(_path))
display(len(all_extra_results_dirs))
assert len(all_extra_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [None]:
# Peek at the first few GTEx GWAS result directories
all_extra_results_dirs[:5]

In [None]:
# File name pattern for the GTEx GWAS results, with named groups `code` and `tissue`.
# Written as a raw string so `\.` is not an invalid escape sequence in a normal string
# literal (which triggers a warning in recent Python versions); the pattern is unchanged.
_file_pattern = r'spredixcan_igwas_gtexmashrv8_(?P<code>[^/]+)__PM__(?P<tissue>.+)\.csv$'
all_extra_phenotypes = [PhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# These are S-PrediXcan results, so check them against the S-PrediXcan expected count,
# like the other loading checks in this section do.
assert len(all_extra_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

# S-PrediXcan: direction of effect

## Effect direction: most significant

### Compute results

In [None]:
def _get_combined_results(phenos):
    return {
        pheno.pheno_info.get_plain_name():
            pheno.get_most_significant_effect_direction()
        for pheno in phenos
    }

In [None]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """
    Compute the results of all phenotype chunks in parallel and merge them.

    Each chunk is submitted to a process pool as one `_get_combined_results` task;
    the dictionaries returned by the tasks are merged into a single one.

    Args:
        phenotype_chunks: iterable of chunks; each chunk is passed as is to
            `_get_combined_results`.
        n_jobs: maximum number of worker processes (defaults to `conf.N_JOBS_HIGH`).

    Returns:
        dict with the merged results of every chunk. Entries are inserted in
        submission order, so the key order is the same on every run for the same
        input. If a key appears in several chunks, the chunk submitted last wins.

    Raises:
        Any exception raised inside a worker, re-raised by `Future.result()`.
    """
    all_results = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk) for chunk in phenotype_chunks]
        # Collect in submission order instead of completion order: completion order
        # varies between runs and would make the merged dict (and anything built from
        # it, e.g. DataFrame columns) come out in a different order each time.
        # The tasks still run concurrently; only the merge order is fixed.
        for future in tasks:
            all_results.update(future.result())

    return all_results

In [None]:
# For a quick test run, use a small subset instead:
# phenotype_chunks = chunker(all_spredixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)

# Materialize the chunks (25 phenotypes each). If `chunker` returns a one-shot iterator
# (its implementation is not visible here), re-running the cell that consumes
# `phenotype_chunks` would otherwise silently process nothing.
phenotype_chunks = list(chunker(all_spredixcan_phenotypes + all_extra_phenotypes, 25))

In [None]:
# Process all chunks with 20 worker processes (overrides the conf.N_JOBS_HIGH default of _run_all)
all_results = _run_all(phenotype_chunks, n_jobs=20)

In [None]:
# Number of distinct phenotype names with results
len(all_results)

### Create DataFrame

In [None]:
# Total number of phenotypes expected across all sources.
# NOTE(review): this sums the S-MultiXcan expected counts, while the directory-count
# checks when loading use SPREDIXCAN_EXPECTED_PHENOTYPES -- confirm both settings agree.
_expected_per_source = list(conf.SMULTIXCAN_EXPECTED_PHENOTYPES.values())
_n_expected_phenos = np.sum(_expected_per_source)
display(_n_expected_phenos)

_n_results = len(all_results)
assert _n_results == _n_expected_phenos, _n_results

In [None]:
# Categorical columns keep storage and loading efficient
spredixcan_genes_effect_directions = pd.DataFrame(all_results, dtype='category')
spredixcan_genes_effect_directions.index.name = 'gene_name'

# no gene may appear more than once in the index
assert spredixcan_genes_effect_directions.index.is_unique

display(
    spredixcan_genes_effect_directions.shape,
    spredixcan_genes_effect_directions.head(),
)

In [None]:
# Drop the rows (genes) that have no result in any column
spredixcan_genes_effect_directions = spredixcan_genes_effect_directions.dropna(
    axis='index',
    how='all',
)

In [None]:
# total number of missing entries in the whole table
spredixcan_genes_effect_directions.isna().to_numpy().sum()

In [None]:
# Distinct values present anywhere in the table (missing entries excluded), as floats
pd.Series(spredixcan_genes_effect_directions.values.flatten()).dropna().astype(float).unique()

In [None]:
# Shape of the table after the all-NaN rows were dropped
display(f'Results shape: {spredixcan_genes_effect_directions.shape}')

# FIXME: uncomment and check numbers
# assert spredixcan_genes_effect_directions.shape == (22515, _n_expected_phenos), spredixcan_genes_effect_directions.shape

## Testing

In [None]:
# Show the genes (rows) and phenotypes (columns) used in the spot checks
_genes_to_check = [
    'ENSG00000000419',
    'ENSG00000000457',
    'ENSG00000000460',
    'ENSG00000186090', # zero
    'ENSG00000007202', # zero
]
_phenotypes_to_check = [
    'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria',
    'Astle_et_al_2016_Reticulocyte_count',
    'PGC_ADHD_EUR_2017',
    'IMMUNOBASE_Systemic_lupus_erythematosus_hg19',
]

spredixcan_genes_effect_directions.loc[_genes_to_check, _phenotypes_to_check]

In [None]:
# Expected effect direction for a few (gene, phenotype) pairs.
# Kept as data so every check shares one assertion that reports which pair failed.
_expected_directions = {
    ('ENSG00000000419', 'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria'): -1.0,
    ('ENSG00000000457', 'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria'): 1.0,
    ('ENSG00000000460', 'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria'): -1.0,

    ('ENSG00000000419', 'Astle_et_al_2016_Reticulocyte_count'): -1.0,
    ('ENSG00000000457', 'Astle_et_al_2016_Reticulocyte_count'): 1.0,
    ('ENSG00000000460', 'Astle_et_al_2016_Reticulocyte_count'): 1.0,

    ('ENSG00000186090', 'PGC_ADHD_EUR_2017'): 0.0,
    ('ENSG00000007202', 'PGC_ADHD_EUR_2017'): 0.0,

    ('ENSG00000007202', 'IMMUNOBASE_Systemic_lupus_erythematosus_hg19'): 0.0,
}

for (_gene, _pheno), _direction in _expected_directions.items():
    _observed = spredixcan_genes_effect_directions.loc[_gene, _pheno]
    assert _observed == _direction, (_gene, _pheno, _observed)

The cells below were used to write the assertions above: for each gene, they print the first and last rows (the minimum and maximum of the sorted column) so you can check that they agree with the signs asserted above.

In [None]:
# Base result directories, handed to the %%bash cells below as their first argument
rapid_gwas_dir, gtex_gwas_dir = (
    conf.SPREDIXCAN_RESULTS_DIR[_source]
    for _source in ('RapidGWASProject', 'GTEX_GWAS')
)

In [None]:
%%bash -s "$rapid_gwas_dir"
# $1 is the Rapid GWAS results directory passed in from Python through `-s`.
# For each gene: take the first three comma-separated columns of every CSV file, keep
# the rows matching the gene, sort them by the third column (general numeric sort) and
# print only the first and last rows (`sed -e 1b -e '$!d'`).
# The path is quoted and the cell stops if the directory cannot be entered, so the
# pipelines never run against whatever the current directory happens to be.
cd "$1/N02" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000419"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000460"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

In [None]:
%%bash -s "$gtex_gwas_dir"
# $1 is the GTEx GWAS results directory passed in from Python through `-s`.
# For each gene: take the first three comma-separated columns of every CSV file, keep
# the rows matching the gene, sort them by the third column (general numeric sort) and
# print only the first and last rows (`sed -e 1b -e '$!d'`).
# The path is quoted and the cell stops if the directory cannot be entered, so the
# pipelines never run against whatever the current directory happens to be.
cd "$1/Astle_et_al_2016_Reticulocyte_count" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000419"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000457"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000000460"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

In [None]:
%%bash -s "$gtex_gwas_dir"
# $1 is the GTEx GWAS results directory passed in from Python through `-s`.
# For each gene: take the first three comma-separated columns of every CSV file, keep
# the rows matching the gene, sort them by the third column (general numeric sort) and
# print only the first and last rows (`sed -e 1b -e '$!d'`).
# The path is quoted and the cell stops if the directory cannot be entered, so the
# pipelines never run against whatever the current directory happens to be.
cd "$1/PGC_ADHD_EUR_2017" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000186090"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'
echo ""
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000007202"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

In [None]:
%%bash -s "$gtex_gwas_dir"
# $1 is the GTEx GWAS results directory passed in from Python through `-s`.
# Take the first three comma-separated columns of every CSV file, keep the rows matching
# the gene, sort them by the third column (general numeric sort) and print only the
# first and last rows (`sed -e 1b -e '$!d'`).
# The path is quoted and the cell stops if the directory cannot be entered, so the
# pipeline never runs against whatever the current directory happens to be.
cd "$1/IMMUNOBASE_Systemic_lupus_erythematosus_hg19" || exit 1
parallel 'cat {} | cut -f1-3 -d, | column -t -s, | grep "ENSG00000007202"' ::: *.csv | sort -k3 -g | sed -e 1b -e '$!d'

### Save

In [None]:
# Shape of the table about to be saved
spredixcan_genes_effect_directions.shape

In [None]:
# First rows of the table about to be saved
spredixcan_genes_effect_directions.head()

In [None]:
# Save
# Output path for the pickled table. This is a plain string literal: the original
# f-string had no placeholders, so the `f` prefix did nothing.
spredixcan_genes_effect_directions_filename = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-most_signif.pkl.xz')
display(spredixcan_genes_effect_directions_filename)

In [None]:
# Write the table to disk; by default pandas infers the compression from the file extension
spredixcan_genes_effect_directions.to_pickle(spredixcan_genes_effect_directions_filename)

### Save in HDF5 format for webapp

In [None]:
# Reload the table from the pickle file written above
# NOTE(review): presumably so this section can be run without recomputing -- confirm
spredixcan_genes_effect_directions = pd.read_pickle(spredixcan_genes_effect_directions_filename)

In [None]:
# Shape of the reloaded table
spredixcan_genes_effect_directions.shape

In [None]:
from utils import simplify_string_for_hdf5

In [None]:
# HDF5 output path, in the same directory (conf.GENE_ASSOC_DIR) as the pickle file
OUTPUT_HDF5_FILE = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan-mashr-effect_direction-most_signif.h5')
display(OUTPUT_HDF5_FILE)

In [None]:
# One HDF5 key per column; values are written as floats instead of categories
with pd.HDFStore(OUTPUT_HDF5_FILE, mode='w', complevel=1) as store:
    for col, col_directions in spredixcan_genes_effect_directions.items():
        # progress marker: one dot per column
        print('.', flush=True, end='')

        clean_col = simplify_string_for_hdf5(col)
        store[clean_col] = col_directions.astype(float)

In [None]:
# testing: reopen the HDF5 file read-only and spot-check what was written

# Expected number of rows (genes) stored under each key
_N_EXPECTED_GENES = 22515

# Expected value per gene for a few columns, keyed by the original (unsimplified)
# column name. Kept as data so all checks share one code path and failures report
# which column and gene were involved.
_expected_hdf5_directions = {
    'N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria': {
        'ENSG00000000419': -1.0,
        'ENSG00000000457': 1.0,
        'ENSG00000000460': -1.0,
    },
    'Astle_et_al_2016_Reticulocyte_count': {
        'ENSG00000000419': -1.0,
        'ENSG00000000457': 1.0,
        'ENSG00000000460': 1.0,
    },
    'PGC_ADHD_EUR_2017': {
        'ENSG00000186090': 0.0,
        'ENSG00000007202': 0.0,
    },
    'IMMUNOBASE_Systemic_lupus_erythematosus_hg19': {
        'ENSG00000007202': 0.0,
    },
}

with pd.HDFStore(OUTPUT_HDF5_FILE, mode='r') as store:
    store_keys = list(store.keys())
    # one key per column; a mismatch means some simplified names collided
    assert len(store_keys) == spredixcan_genes_effect_directions.shape[1], len(store_keys)
    display(store_keys[:5])

    for _pheno, _gene_directions in _expected_hdf5_directions.items():
        clean_col = simplify_string_for_hdf5(_pheno)
        data = store[clean_col]
        assert data.shape == (_N_EXPECTED_GENES,), data.shape

        for _gene, _direction in _gene_directions.items():
            assert data.loc[_gene] == _direction, (_pheno, _gene, data.loc[_gene])