In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

import settings as conf
import metadata
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

# fastENLOC reading functions

In [3]:
with open(conf.FASTENLOC_GTEX_TISSUES_FILE, 'r') as f:
    FASTENLOC_TISSUES_NAMES = set([x.strip() for x in f.readlines()])

In [4]:
# create summary of files
_path = os.path.join(conf.FASTENLOC_RESULTS_DIR['RapidGWASProject'], '**/*.enloc.enrich.out')
display(_path)
all_fastenloc_results_files = glob(_path)

'/mnt/phenomexcan_base/results/fastenloc/rapid_gwas_project/**/*.enloc.enrich.out'

In [5]:
len(all_fastenloc_results_files)

198401

In [6]:
PHENO_CODES = set(metadata.RAPID_GWAS_PHENO_INFO.index)

In [7]:
len(PHENO_CODES)

4359

In [8]:
phenos = []
tissues = []

for f in all_fastenloc_results_files:
    f = os.path.basename(f)
    split = f.split('-')
    
    tiss = split[-1].split('.')[0]
    if tiss not in FASTENLOC_TISSUES_NAMES:
        tiss = '-'.join(split[-2:]).split('.')[0]
    
    phen = split[1]
    if phen not in PHENO_CODES:
        phen = '-'.join(split[1:3])
    
    phenos.append(phen)
    tissues.append(tiss)

results_summary = pd.DataFrame({'pheno': phenos, 'tissue': tissues, 'file': all_fastenloc_results_files})

In [9]:
display(results_summary.shape)
assert results_summary.shape[0] == int(conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject'] * conf.GTEX_MODELS_N_EXPECTED_TISSUES)

(198401, 3)

In [10]:
results_summary.head()

Unnamed: 0,pheno,tissue,file
0,22617_9219,Thyroid,/mnt/phenomexcan_base/results/fastenloc/rapid_...
1,22617_9219,Nerve_Tibial,/mnt/phenomexcan_base/results/fastenloc/rapid_...
2,22617_9219,Brain_Hypothalamus,/mnt/phenomexcan_base/results/fastenloc/rapid_...
3,22617_9219,Small_Intestine_Terminal_Ileum,/mnt/phenomexcan_base/results/fastenloc/rapid_...
4,22617_9219,Breast_Mammary_Tissue,/mnt/phenomexcan_base/results/fastenloc/rapid_...


In [11]:
results_summary['pheno'].unique().shape

(4049,)

In [12]:
assert np.all([t in PHENO_CODES for t in results_summary['pheno'].unique()])

In [13]:
assert results_summary['tissue'].unique().shape[0] == conf.GTEX_MODELS_N_EXPECTED_TISSUES

In [14]:
assert np.all([t in FASTENLOC_TISSUES_NAMES for t in results_summary['tissue'].unique()])

In [15]:
_all_tissues_in_results = results_summary['tissue'].unique()
assert np.all([t in _all_tissues_in_results for t in FASTENLOC_TISSUES_NAMES])

In [16]:
assert len(all_fastenloc_results_files) == int(conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject'] * conf.GTEX_MODELS_N_EXPECTED_TISSUES)

In [72]:
def read_fe_enrich(filename):
    enrich_data = pd.read_fwf(filename, header=None)
    enrich_data = enrich_data.rename(columns={0: 'type', 1: 'alpha', 2: 'se'})
    assert enrich_data.shape[0] == 3
    
    alpha0_data = enrich_data[enrich_data['type'] == 'Intercept']
    assert alpha0_data.shape[0] == 1
    alpha0_data = alpha0_data.iloc[0].loc[['alpha', 'se']]\
        .rename({'alpha': 'alpha0', 'se': 'alpha0_se'}).astype(float)
    
    alpha1_data = enrich_data[enrich_data['type'] == 'Enrichment (likelihood)']
    assert alpha1_data.shape[0] == 1
    alpha1_data = alpha1_data.iloc[0].loc[['alpha', 'se']]\
        .rename({'alpha': 'alpha1', 'se': 'alpha1_se'}).astype(float)
    
    alpha1_s_data = enrich_data[enrich_data['type'] == 'Enrichment (shrinkage)']
    assert alpha1_s_data.shape[0] == 1
    alpha1_s_data = alpha1_s_data.iloc[0].loc[['alpha', 'se']]\
        .rename({'alpha': 'alpha1_shrinkage', 'se': 'alpha1_shrinkage_se'}).astype(float)
    
    df = pd.concat([alpha0_data, alpha1_data, alpha1_s_data])
    return df
    

def read_fe_enrich_pheno(pheno): 
    _tmp = results_summary[results_summary['pheno'] == pheno] 
    if _tmp.shape[0] == 0: 
        return None 
     
    _res = {}
#     _all_genes = set()
    for idx, _data in _tmp.iterrows():
        tissue_data = read_fe_enrich(_data.file).rename(_data.pheno)
        _res[_data.tissue] = tissue_data
#         _all_genes.update(tissue_data.index) 
    
    df = pd.DataFrame.from_dict(_res, orient='index')#.T#, index=_all_genes)#.fillna(-1).max(axis=1)
    df.index.rename('tissue', inplace=True)
    df = df.reset_index().assign(trait=pheno)
    return df[['tissue', 'trait', 'alpha0', 'alpha0_se', 'alpha1', 'alpha1_se', 'alpha1_shrinkage', 'alpha1_shrinkage_se']]

In [73]:
results_summary.iloc[1].loc['file']

'/mnt/phenomexcan_base/results/fastenloc/rapid_gwas_project/22617_9219/fastenloc-22617_9219-Nerve_Tibial.enloc.enrich.out'

In [74]:
read_fe_enrich(results_summary.iloc[1].loc['file'])

alpha0                  -13.786
alpha0_se                 0.339
alpha1                   -2.092
alpha1_se              2334.334
alpha1_shrinkage         -0.000
alpha1_shrinkage_se       1.000
dtype: float64

In [75]:
read_fe_enrich_pheno('22617_9219').head()

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
0,Adipose_Subcutaneous,22617_9219,-13.771,0.335,-9.018,3740.24,-0.0,1.0
1,Adipose_Visceral_Omentum,22617_9219,-13.806,0.341,5.588,2.327,0.871,0.919
2,Adrenal_Gland,22617_9219,-13.795,0.339,2.43,2245.33,0.0,1.0
3,Artery_Aorta,22617_9219,-13.759,0.333,-13.785,6920.785,-0.0,1.0
4,Artery_Coronary,22617_9219,-13.781,0.337,-7.001,5998.139,-0.0,1.0


### Testing

In [76]:
# testing
t = read_fe_enrich(os.path.join(conf.FASTENLOC_RESULTS_DIR['RapidGWASProject'], 'J15/fastenloc-J15-Whole_Blood.enloc.enrich.out'))

In [78]:
if 'alpha1_shrinkage' not in t.index:
    assert t['alpha0'] == -11.752
    assert t['alpha0_se'] == 0.350

    assert t['alpha1'] == -11.286
    assert t['alpha1_se'] == 764.307
else:
    assert t['alpha0'] == -11.620
    assert t['alpha0_se'] == 0.114

    assert t['alpha1'].round(3) == -13.851
    assert t['alpha1_se'] == 2491.946
    
    assert t['alpha1_shrinkage'] == -0.000
    assert t['alpha1_shrinkage_se'] == 1.000

In [79]:
t = read_fe_enrich_pheno('J15')

In [80]:
assert t.shape[0] == conf.GTEX_MODELS_N_EXPECTED_TISSUES

In [81]:
t.head()

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
0,Adipose_Subcutaneous,J15,-11.612,0.114,-14.541,3148.718,-0.0,1.0
1,Adipose_Visceral_Omentum,J15,-11.618,0.114,-14.123,2916.263,-0.0,1.0
2,Adrenal_Gland,J15,-11.621,0.114,-14.158,3494.905,-0.0,1.0
3,Artery_Aorta,J15,-11.615,0.114,-14.494,3321.603,-0.0,1.0
4,Artery_Coronary,J15,-11.621,0.114,-14.408,4208.734,-0.0,1.0


In [93]:
if 'alpha1_shrinkage' not in t.columns:
    _t_tmp = t[t['tissue'] == 'Adipose_Visceral_Omentum'].iloc[0]
    assert _t_tmp['tissue'] == 'Adipose_Visceral_Omentum'
    assert _t_tmp['trait'] == 'J15'
    assert _t_tmp['alpha0'] == -11.747
    assert _t_tmp['alpha0_se'].round(3) == 0.349
    assert _t_tmp['alpha1'] == -11.061
    assert _t_tmp['alpha1_se'].round(3) == 877.891

    _t_tmp = t[t['tissue'] == 'Artery_Coronary'].iloc[0]
    assert _t_tmp['tissue'] == 'Artery_Coronary'
    assert _t_tmp['trait'] == 'J15'
    assert _t_tmp['alpha0'].round(3) == -11.736
    assert _t_tmp['alpha0_se'].round(3) == 0.348
    assert _t_tmp['alpha1'].round(3) == -11.979
    assert _t_tmp['alpha1_se'].round(3) == 1336.767
else:
    _t_tmp = t[t['tissue'] == 'Adipose_Visceral_Omentum'].iloc[0]
    assert _t_tmp['tissue'] == 'Adipose_Visceral_Omentum'
    assert _t_tmp['trait'] == 'J15'
    assert _t_tmp['alpha0'] == -11.618
    assert _t_tmp['alpha0_se'].round(3) == 0.114
    assert _t_tmp['alpha1'] == -14.123
    assert _t_tmp['alpha1_se'].round(3) == 2916.263
    assert _t_tmp['alpha1_shrinkage'] == -0.000
    assert _t_tmp['alpha1_shrinkage_se'].round(3) == 1.000

    _t_tmp = t[t['tissue'] == 'Artery_Coronary'].iloc[0]
    assert _t_tmp['tissue'] == 'Artery_Coronary'
    assert _t_tmp['trait'] == 'J15'
    assert _t_tmp['alpha0'].round(3) == -11.621
    assert _t_tmp['alpha0_se'].round(3) == 0.114
    assert _t_tmp['alpha1'].round(3) == -14.408
    assert _t_tmp['alpha1_se'].round(3) == 4208.734
    assert _t_tmp['alpha1_shrinkage'] == -0.000
    assert _t_tmp['alpha1_shrinkage_se'].round(3) == 1.000

# Get Rapid GWAS phenotypes

In [94]:
all_smultixcan_files = glob(os.path.join(conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject'], '*.tsv.gz'))
all_smultixcan_phenotypes = [MXPhenoResults(p) for p in all_smultixcan_files]

In [95]:
assert len(all_smultixcan_files) == len(all_smultixcan_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

In [96]:
all_smultixcan_files[:5]

['/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_20096_1_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_2345_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_N49_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_100011_raw_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_5221_ccn30.tsv.gz']

# Read all results

In [97]:
def _get_combined_results(phenos):
    all_res = [read_fe_enrich_pheno(pheno.pheno_info.pheno_code) for pheno in phenos]
    return pd.concat(all_res, ignore_index=True)

In [98]:
# testing
_tmp = _get_combined_results(all_smultixcan_phenotypes[:4])
assert _tmp.shape[0] == int(4 * 49)

In [99]:
_pending = read_fe_enrich_pheno('C_MYELOID-LEUKAEMIA')

In [100]:
_pending.head()

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
0,Adipose_Subcutaneous,C_MYELOID-LEUKAEMIA,-12.629,0.189,-14.168,4349.3,-0.0,1.0
1,Adipose_Visceral_Omentum,C_MYELOID-LEUKAEMIA,-12.635,0.19,-12.395,5047.165,-0.0,1.0
2,Adrenal_Gland,C_MYELOID-LEUKAEMIA,-12.64,0.19,-14.214,5881.797,-0.0,1.0
3,Artery_Aorta,C_MYELOID-LEUKAEMIA,-12.638,0.19,-13.551,3721.215,-0.0,1.0
4,Artery_Coronary,C_MYELOID-LEUKAEMIA,-12.642,0.191,-14.163,6317.918,-0.0,1.0


In [101]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    all_results = []
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk) for chunk in phenotype_chunks]
        for future in as_completed(tasks):
            res = future.result()
            all_results.append(res)

    return all_results

In [102]:
# phenotype_chunks = chunker(all_smultixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)
phenotype_chunks = chunker(all_smultixcan_phenotypes, 200)

In [None]:
all_results = _run_all(phenotype_chunks)

## Save as DataFrame

In [104]:
fastenloc_genes_associations = pd.concat(all_results, ignore_index=True)

display(fastenloc_genes_associations.shape)
display(fastenloc_genes_associations.head())

assert fastenloc_genes_associations.shape[0] == int(conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject'] * conf.GTEX_MODELS_N_EXPECTED_TISSUES)

(198401, 8)

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
0,Adipose_Subcutaneous,O46,-12.544,0.181,-14.486,4880.497,-0.0,1.0
1,Adipose_Visceral_Omentum,O46,-12.549,0.182,-14.398,5137.845,-0.0,1.0
2,Adrenal_Gland,O46,-12.553,0.182,-14.527,6584.453,-0.0,1.0
3,Artery_Aorta,O46,-12.547,0.182,-14.459,5361.933,-0.0,1.0
4,Artery_Coronary,O46,-12.555,0.182,-14.534,7127.824,-0.0,1.0


In [105]:
assert fastenloc_genes_associations['tissue'].unique().shape[0] == conf.GTEX_MODELS_N_EXPECTED_TISSUES

In [106]:
assert fastenloc_genes_associations['trait'].unique().shape[0] == conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject']

In [107]:
assert fastenloc_genes_associations.shape == fastenloc_genes_associations.dropna().shape

In [108]:
# check columns data type
_tmp = fastenloc_genes_associations.dtypes.value_counts()
display(_tmp)
assert _tmp.shape[0] == 2

float64    6
object     2
dtype: int64

In [109]:
fastenloc_genes_associations.isna().sum().sum()

0

In [110]:
# some testing
t = fastenloc_genes_associations[fastenloc_genes_associations['trait'] == '22601_12253140']

In [125]:
t.head()

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
2303,Adipose_Subcutaneous,22601_12253140,-10.727,0.073,-14.861,2370.776,-0.0,1.0
2304,Adipose_Visceral_Omentum,22601_12253140,-10.729,0.073,-14.786,2547.338,-0.0,1.0
2305,Adrenal_Gland,22601_12253140,-10.731,0.073,-14.825,3070.578,-0.0,1.0
2306,Artery_Aorta,22601_12253140,-10.729,0.073,-14.861,2554.334,-0.0,1.0
2307,Artery_Coronary,22601_12253140,-10.732,0.073,-14.895,3423.376,-0.0,1.0


In [115]:
if 'alpha1_shrinkage' not in t.columns:
    tt = t[t['tissue'] == 'Brain_Frontal_Cortex_BA9'].iloc[0]
    assert tt.loc['alpha0'].round(3) == -10.781
    assert tt.loc['alpha0_se'].round(3) == 0.274
    assert tt.loc['alpha1'].round(3) == -12.584
    assert tt.loc['alpha1_se'] == 1050.572
else:
    tt = t[t['tissue'] == 'Brain_Frontal_Cortex_BA9'].iloc[0]
    assert tt.loc['alpha0'].round(3) == -10.731
    assert tt.loc['alpha0_se'].round(3) == 0.073
    assert tt.loc['alpha1'].round(3) == -14.893
    assert tt.loc['alpha1_se'] == 3230.940
    assert tt.loc['alpha1_shrinkage'].round(3) == -0.000
    assert tt.loc['alpha1_shrinkage_se'] == 1.000

In [122]:
if 'alpha1_shrinkage' not in t.columns:
    tt = t[t['tissue'] == 'Artery_Coronary'].iloc[0]
    assert tt.loc['alpha0'].round(3) == -10.783
    assert tt.loc['alpha0_se'].round(3) == 0.274
    assert tt.loc['alpha1'].round(3) == -12.448
    assert tt.loc['alpha1_se'].round(3) == 1042.841
else:
    tt = t[t['tissue'] == 'Artery_Coronary'].iloc[0]
    assert tt.loc['alpha0'].round(3) == -10.732
    assert tt.loc['alpha0_se'].round(3) == 0.073
    assert tt.loc['alpha1'].round(3) == -14.895
    assert tt.loc['alpha1_se'].round(3) == 3423.376
    assert tt.loc['alpha1_shrinkage'].round(3) == -0.000
    assert tt.loc['alpha1_shrinkage_se'] == 1.000

### Save

In [126]:
os.makedirs(conf.ANALYSES_DIR, exist_ok=True)

In [127]:
# Save
# new filename to save the alpha1 of newer fastENLOC runs
fastenloc_genes_associations_filename = os.path.join(conf.ANALYSES_DIR, f'fastenloc-alpha1-stats-v2.pkl.xz')

# filename for original alpha1 runs of fastENLOC
#fastenloc_genes_associations_filename = os.path.join(conf.ANALYSES_DIR, f'fastenloc-alpha1-stats.pkl.xz')

display(fastenloc_genes_associations_filename)

'/mnt/phenomexcan_base/analyses/fastenloc-alpha1-stats-v2.pkl.xz'

In [128]:
fastenloc_genes_associations.to_pickle(fastenloc_genes_associations_filename)

### Save in text format

In [129]:
# new filename to save the alpha1 of newer fastENLOC runs
output_file = os.path.join(conf.ANALYSES_DIR, 'fastenloc-alpha1-stats-v2.tsv.gz')

# filename for original alpha1 runs of fastENLOC
# output_file = os.path.join(conf.ANALYSES_DIR, 'fastenloc-alpha1-stats.tsv.gz')

display(output_file)

'/mnt/phenomexcan_base/analyses/fastenloc-alpha1-stats-v2.tsv.gz'

In [130]:
fastenloc_genes_associations.to_csv(output_file, index=False, sep='\t', float_format='%.3f')

In [131]:
# test "for publication" file
_tmp = pd.read_csv(output_file, sep='\t')

In [132]:
display(_tmp.shape)
assert _tmp.shape == fastenloc_genes_associations.shape

(198401, 8)

In [133]:
_tmp.head()

Unnamed: 0,tissue,trait,alpha0,alpha0_se,alpha1,alpha1_se,alpha1_shrinkage,alpha1_shrinkage_se
0,Adipose_Subcutaneous,O46,-12.544,0.181,-14.486,4880.497,-0.0,1.0
1,Adipose_Visceral_Omentum,O46,-12.549,0.182,-14.398,5137.845,-0.0,1.0
2,Adrenal_Gland,O46,-12.553,0.182,-14.527,6584.453,-0.0,1.0
3,Artery_Aorta,O46,-12.547,0.182,-14.459,5361.933,-0.0,1.0
4,Artery_Coronary,O46,-12.555,0.182,-14.534,7127.824,-0.0,1.0


In [134]:
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [135]:
assert not _tmp.isna().any().any()

In [136]:
assert fastenloc_genes_associations.equals(_tmp)

Now a plot can be made with notebook `150_validation/35_fastenloc_alpha_se_plots.ipynb`