In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker

# Load S-PrediXcan results

## From Rapid GWAS project

In [None]:
from results.spredixcan import PhenoResults

In [None]:
# Each entry directly under the Rapid GWAS results folder is treated as the
# S-PrediXcan output of one phenotype (see the count check below).
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject'], '*')
display(_path)
# glob() returns matches in arbitrary order; sort them so runs are reproducible.
all_spredixcan_results_dirs = sorted(glob(_path))
display(len(all_spredixcan_results_dirs))
assert len(all_spredixcan_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

In [None]:
# Wrap every Rapid GWAS results directory in a PhenoResults object.
all_spredixcan_phenotypes = list(map(PhenoResults, all_spredixcan_results_dirs))

# Sanity check: one PhenoResults object per expected phenotype.
display(len(all_spredixcan_phenotypes))
assert len(all_spredixcan_phenotypes) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

## From GTEx GWAS manuscript

In [None]:
# Each entry directly under the GTEx GWAS results folder is treated as the
# S-PrediXcan output of one phenotype (see the count check below).
_path = os.path.join(conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
# glob() returns matches in arbitrary order; sort them so runs are reproducible.
all_extra_results_dirs = sorted(glob(_path))
display(len(all_extra_results_dirs))
assert len(all_extra_results_dirs) == conf.SPREDIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [None]:
# Peek at a few of the matched paths as a sanity check.
all_extra_results_dirs[:5]

In [None]:
# Result file names from the GTEx GWAS runs follow a different scheme; the named
# groups `code` and `tissue` capture the two variable parts of the file name.
# Raw string: `\.` is a regex escape, not a Python string escape.
_file_pattern = r'spredixcan_igwas_gtexmashrv8_(?P<code>[^/]+)__PM__(?P<tissue>.+)\.csv$'
all_extra_phenotypes = [PhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# NOTE(review): this compares against SMULTIXCAN_EXPECTED_PHENOTYPES, whereas the
# directory count for the same source is checked against
# SPREDIXCAN_EXPECTED_PHENOTYPES -- confirm this is intentional.
assert len(all_extra_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

# S-PrediXcan: pvalues, zscores and effect sizes

In [None]:
from results.spredixcan import PhenoResults
from results.gtex_model import GTEXModel

In [None]:
# Folder that receives the HDF5 files written by dataframe_creator.
OUTPUT_FOLDER = os.path.join(conf.GENE_ASSOC_DIR, 'spredixcan')
display(OUTPUT_FOLDER)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
# Output file name template: `tissue` and `column` identify the content and
# `format` is the file extension (dataframe_creator passes 'h5').
OUTPUT_FILE_FORMAT = 'spredixcan-{tissue}-{column}.{format}'
display(OUTPUT_FILE_FORMAT)

In [None]:
# Tissue names as reported by GTEXModel.get_tissues for the configured models directory.
# NOTE(review): 49 is presumably the number of tissues in the GTEx v8 models -- confirm.
ALL_TISSUES = GTEXModel.get_tissues(conf.GTEX_MODELS_DIR)
assert len(ALL_TISSUES) == 49

### Compute results

In [None]:
# import threading
# from queue import Queue

from utils import simplify_string_for_hdf5

In [None]:
# to_write = Queue(maxsize=5)

In [None]:
def _get_combined_results(phenos, tissue, column):
    return {
        pheno.pheno_info.get_plain_name():
            pheno.get_tissue_data(tissue, cols=[column], index_col='gene_simple')
        for pheno in phenos
    }

In [None]:
def dataframe_creator(results, tissue_name, column_name):
    """Combine per-phenotype results of one tissue/column and save them to HDF5.

    Parameters
    ----------
    results : dict
        Phenotype plain name -> values of that phenotype, as collected by
        `_get_combined_results` (presumably gene-indexed Series; verify
        against `PhenoResults.get_tissue_data`).
    tissue_name : str
    column_name : str
        Both are used in the log prefix and to build the output file name.

    Raises
    ------
    AssertionError
        If the number of phenotypes differs from the total of
        `conf.SMULTIXCAN_EXPECTED_PHENOTYPES`, if the combined index is not
        unique, or if two phenotype names map to the same HDF5 key.
    """
    log_prefix = f'[{tissue_name} - {column_name}]'

    _n_expected_phenos = sum(conf.SMULTIXCAN_EXPECTED_PHENOTYPES.values())
    assert len(results) == _n_expected_phenos, len(results)

    # One column per phenotype; rows are aligned on the union of the indexes.
    spredixcan = pd.DataFrame(results).rename_axis('gene_name')
    assert spredixcan.index.is_unique

    # Each phenotype is stored under a simplified key. Two names that simplify
    # to the same key would silently overwrite each other in the store, so
    # fail before anything is written.
    hdf5_keys = {col: simplify_string_for_hdf5(col) for col in spredixcan.columns}
    assert len(set(hdf5_keys.values())) == len(hdf5_keys), \
        'phenotype names collide after simplify_string_for_hdf5'

    # hdf5
    spredixcan_filename = OUTPUT_FILE_FORMAT.format(
        tissue=tissue_name,
        column=column_name,
        format='h5'
    )
    spredixcan_filename = os.path.join(OUTPUT_FOLDER, spredixcan_filename)

    print(f'  {log_prefix}, saving to: {spredixcan_filename}', flush=True)
    # mode='w' replaces any existing file for this tissue/column.
    with pd.HDFStore(spredixcan_filename, mode='w', complevel=1) as store:
        for col, clean_col in hdf5_keys.items():
            store[clean_col] = spredixcan[col]

In [None]:
def _run(tissue, column, phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    """Gather `column` of `tissue` for all phenotype chunks and write the result.

    Each chunk is handed to `_get_combined_results` in a worker process (at most
    `n_jobs` workers); the partial dicts are merged in completion order and the
    merged dict is passed to `dataframe_creator`.
    """
    merged = {}
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = []
        for chunk in phenotype_chunks:
            pending.append(pool.submit(_get_combined_results, chunk, tissue, column))

        for finished in as_completed(pending):
            merged.update(finished.result())

    dataframe_creator(merged, tissue, column)

In [None]:
def run_all(tissues, phenotype_chunks, n_jobs=conf.N_JOBS_HIGH,
            columns=('pvalue', 'zscore', 'effect_size')):
    """Run `_run` for every combination of tissue and result column.

    Parameters
    ----------
    tissues : iterable of str
        Tissue names to process; each one is printed as it starts.
    phenotype_chunks : iterable
        Groups of phenotypes, one worker task per group. Materialized into a
        list because it is iterated once per tissue and column.
    n_jobs : int
        Maximum number of worker processes, forwarded to `_run`.
    columns : iterable of str
        Result columns to extract; defaults to the three columns this
        function has always processed.
    """
    # A one-shot iterator would be exhausted after the first (tissue, column).
    phenotype_chunks = list(phenotype_chunks)
    columns = tuple(columns)

    for tissue in tissues:
        print(tissue, flush=True)

        for column in columns:
            _run(tissue, column, phenotype_chunks, n_jobs)

In [None]:
# Small subset for quick test runs:
# phenotype_chunks = chunker(all_spredixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)
# Full run: phenotypes from both sources, grouped by chunker (presumably into
# groups of 25 -- verify against utils.chunker); _run submits one task per group.
phenotype_chunks = chunker(all_spredixcan_phenotypes + all_extra_phenotypes, 25)

In [None]:
# Materialize the chunks: run_all iterates over them once per tissue and column.
phenotype_chunks = list(phenotype_chunks)

In [None]:
# Tissues to process. Uncomment the slice below to limit a test run to two tissues.
tissues = GTEXModel.get_tissues(conf.GTEX_MODELS_DIR)
#tissues = tissues[:2]

In [None]:
# writing_thread = threading.Thread(target=dataframe_creator, args=((to_write),))
# writing_thread.start()

In [None]:
# Process every tissue; dataframe_creator writes one HDF5 file per tissue and column.
run_all(tissues, phenotype_chunks)

In [None]:
# writing_thread.join()

## Testing

In [None]:
# Total number of phenotypes across all sources; the same total that
# dataframe_creator checks before writing each file.
n_expected_phenos = np.sum(list(conf.SMULTIXCAN_EXPECTED_PHENOTYPES.values()))

In [None]:
# Show the expected total used by the checks below.
n_expected_phenos

In [None]:
# Regression check of the Thyroid p-value file: one store key per expected
# phenotype, plus spot checks of three genes for one phenotype of each source.
output_hdf5_file = os.path.join(OUTPUT_FOLDER, 'spredixcan-Thyroid-pvalue.h5')

with pd.HDFStore(output_hdf5_file, mode='r') as store:
    store_keys = list(store.keys())
    assert len(store_keys) == n_expected_phenos
    display(store_keys[:5])
    
    # Phenotype from the Rapid GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert data.loc['ENSG00000213965'] == 0.00023756504804916094, data.loc['ENSG00000213965']
    assert pd.isnull(data.loc['ENSG00000198670'])
    assert data.loc['ENSG00000177025'] == 1.586957013502016e-05, data.loc['ENSG00000177025']
    
    # Phenotype from the GTEx GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('MAGNETIC_LDL.C')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert data.loc['ENSG00000113163'] == 1.3600016190892495e-13, data.loc['ENSG00000113163']
    assert pd.isnull(data.loc['ENSG00000223510'])
    assert data.loc['ENSG00000204241'] == 0.9990521310244208, data.loc['ENSG00000204241']

In [None]:
# Regression check of the Thyroid z-score file: same phenotypes and genes as
# the p-value check, with the expected z-scores.
output_hdf5_file = os.path.join(OUTPUT_FOLDER, 'spredixcan-Thyroid-zscore.h5')

with pd.HDFStore(output_hdf5_file, mode='r') as store:
    store_keys = list(store.keys())
    assert len(store_keys) == n_expected_phenos
    display(store_keys[:5])
    
    # Phenotype from the Rapid GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert data.loc['ENSG00000213965'] == -3.6753054157625686, data.loc['ENSG00000213965']
    assert pd.isnull(data.loc['ENSG00000198670'])
    assert data.loc['ENSG00000177025'] == 4.316259089446458, data.loc['ENSG00000177025']
    
    # Phenotype from the GTEx GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('MAGNETIC_LDL.C')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert data.loc['ENSG00000113163'] == -7.400179862976074, data.loc['ENSG00000113163']
    assert pd.isnull(data.loc['ENSG00000223510'])
    assert data.loc['ENSG00000204241'] == 0.0011879778668467532, data.loc['ENSG00000204241']

In [None]:
# Regression check of the Thyroid effect-size file: same phenotypes and genes
# as the p-value check.
output_hdf5_file = os.path.join(OUTPUT_FOLDER, 'spredixcan-Thyroid-effect_size.h5')

with pd.HDFStore(output_hdf5_file, mode='r') as store:
    store_keys = list(store.keys())
    assert len(store_keys) == n_expected_phenos
    display(store_keys[:5])
    
    # Phenotype from the Rapid GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert data.loc['ENSG00000213965'] == -0.0011122695712738851, data.loc['ENSG00000213965']
    assert pd.isnull(data.loc['ENSG00000198670'])
    assert data.loc['ENSG00000177025'] == 0.0013606910719667048, data.loc['ENSG00000177025']
    
    # Phenotype from the GTEx GWAS results: all three genes are expected to
    # have no effect size (NaN), including those with a p-value and z-score.
    clean_col = simplify_string_for_hdf5('MAGNETIC_LDL.C')
    data = store[clean_col]
    assert data.shape == (15289,), data.shape
    assert pd.isnull(data.loc['ENSG00000113163'])
    assert pd.isnull(data.loc['ENSG00000223510'])
    assert pd.isnull(data.loc['ENSG00000204241'])

In [None]:
# Regression check of a second tissue (Skin_Not_Sun_Exposed_Suprapubic), p-values
# only; the expected number of genes differs from the Thyroid files.
output_hdf5_file = os.path.join(OUTPUT_FOLDER, 'spredixcan-Skin_Not_Sun_Exposed_Suprapubic-pvalue.h5')

with pd.HDFStore(output_hdf5_file, mode='r') as store:
    store_keys = list(store.keys())
    assert len(store_keys) == n_expected_phenos
    display(store_keys[:5])
    
    # Phenotype from the Rapid GWAS results; the middle gene is expected to be missing (NaN).
    clean_col = simplify_string_for_hdf5('N02-Diagnoses_main_ICD10_N02_Recurrent_and_persistent_haematuria')
    data = store[clean_col]
    assert data.shape == (14920,), data.shape
    assert data.loc['ENSG00000214575'] == 0.999887282076106, data.loc['ENSG00000214575']
    assert pd.isnull(data.loc['ENSG00000231131'])
    assert data.loc['ENSG00000177025'] == 2.0403800371097046e-05, data.loc['ENSG00000177025']

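The cells below were used to obtain the expected values for the assertions above: they print the first five columns of the raw S-PrediXcan result rows for each gene checked. For each gene, see whether the first and last (min and max) values correspond to the sign asserted above.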

In [None]:
# Results directories handed to the %%bash cells below as their first argument.
rapid_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['RapidGWASProject']
gtex_gwas_dir = conf.SPREDIXCAN_RESULTS_DIR['GTEX_GWAS']

In [None]:
%%bash -s "$rapid_gwas_dir"
# Print the header and, for each gene checked above, the matching rows of the
# raw N02 / Thyroid results file (first five comma-separated columns).
# Quote the path and stop if the directory is missing.
cd "$1/N02" || exit 1
results_file=N02-gtex_v8-Thyroid-2018_10.csv
head -1 "$results_file" | cut -f1-5 -d, | column -s, -t
for gene in ENSG00000213965 ENSG00000198670 ENSG00000177025; do
    echo ""
    grep "$gene" "$results_file" | cut -f1-5 -d, | column -s, -t
done

In [None]:
%%bash -s "$rapid_gwas_dir"
# Print the header and, for each gene checked above, the matching rows of the
# raw N02 / Skin_Not_Sun_Exposed_Suprapubic results file (first five
# comma-separated columns).
# Quote the path and stop if the directory is missing.
cd "$1/N02" || exit 1
results_file=N02-gtex_v8-Skin_Not_Sun_Exposed_Suprapubic-2018_10.csv
head -1 "$results_file" | cut -f1-5 -d, | column -s, -t
for gene in ENSG00000214575 ENSG00000231131 ENSG00000177025; do
    echo ""
    grep "$gene" "$results_file" | cut -f1-5 -d, | column -s, -t
done

In [None]:
%%bash -s "$gtex_gwas_dir"
# Print the header and, for each gene checked above, the matching rows of the
# raw MAGNETIC_LDL.C / Thyroid results file (first five comma-separated columns).
# Quote the path and stop if the directory is missing.
cd "$1/MAGNETIC_LDL.C" || exit 1
results_file=spredixcan_igwas_gtexmashrv8_MAGNETIC_LDL.C__PM__Thyroid.csv
head -1 "$results_file" | cut -f1-5 -d, | column -s, -t
for gene in ENSG00000113163 ENSG00000223510 ENSG00000204241; do
    echo ""
    grep "$gene" "$results_file" | cut -f1-5 -d, | column -s, -t
done