In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import utils.constants as constants
from data.multixcan_data import MXPhenoInfo, MXPhenoResults
from utils.utils import is_number, chunker

In [3]:
genes_associations_dir = os.path.join(constants.PREPROCESSED_BASED_DIR, 'gene_associations')

In [4]:
OUTPUT_DIR = os.path.join(genes_associations_dir, 'fastenloc')
display(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/fastenloc'

In [5]:
FASTENLOC_RESULTS_DIR = os.path.join(constants.RESULTS_DIR, 'fastenloc', 'torus_based')
display(FASTENLOC_RESULTS_DIR)

'/mnt/phenomexcan/results/fastenloc/torus_based'

In [6]:
N_EXPECTED_PHENOTYPES = 4083
N_EXPECTED_TISSUES = 49

# Load all S-PrediXcan gene mappings

In [7]:
with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    genes_mapping_0 = pickle.load(f)

with open(os.path.join(constants.PREPROCESSED_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    genes_mapping_1 = pickle.load(f)

In [8]:
all_gene_ids = set(genes_mapping_0.keys())

# fastENLOC reading functions

In [9]:
# create summary of files
all_fastenloc_results_files = glob(os.path.join(FASTENLOC_RESULTS_DIR, '*.enloc.sig.out.gz'))
all_fastenloc_phenotypes = [f.split('-')[1] for f in all_fastenloc_results_files]
all_fastenloc_tissues = [f.split('.enloc.sig.out.gz')[0].split('-')[-1] for f in all_fastenloc_results_files]

results_summary = pd.DataFrame({'pheno': all_fastenloc_phenotypes, 'tissue': all_fastenloc_tissues, 'file': all_fastenloc_results_files})

In [10]:
assert len(set(all_fastenloc_phenotypes)) == N_EXPECTED_PHENOTYPES
assert len(set(all_fastenloc_tissues)) == N_EXPECTED_TISSUES
assert len(all_fastenloc_results_files) == int(N_EXPECTED_PHENOTYPES * N_EXPECTED_TISSUES)

In [11]:
def read_fe(filename, rename_genes=True): 
    fe_data = pd.read_csv(filename, sep='\s+', usecols=[0, 5], header=None).rename(columns={0: 'gene_cluster', 5: 'rcp'})
    fe_data = fe_data = fe_data.assign(gene=fe_data['gene_cluster'].apply(lambda x: x.split(':')[0])) 
    fe_data = fe_data.assign(gene_cluster=fe_data['gene_cluster'].apply(lambda x: x.split(':')[1]))
    
    # FIXME: is it correct to sum across clusters of a gene? or pick max? Haky thinks summing is fine.
    fe_max = fe_data.groupby('gene')['rcp'].sum()
    fe_max = fe_max.loc[fe_max.index.intersection(all_gene_ids)]
    if rename_genes:
        fe_max = fe_max.rename(index=genes_mapping_0)
    return fe_max

def read_pheno(pheno): 
    _tmp = results_summary[results_summary['pheno'] == pheno] 
    if _tmp.shape[0] == 0: 
        return None 
     
    _res = {} 
    _all_genes = set() 
    for idx, _data in _tmp.iterrows(): 
        #print(_data.tissue) 
         
        tissue_data = read_fe(_data.file) 
        _res[_data.tissue] = tissue_data 
        _all_genes.update(tissue_data.index) 
    
    return pd.DataFrame(_res, index=_all_genes).fillna(-1).max(axis=1)

In [12]:
# testing
t = read_fe(os.path.join(constants.RESULTS_DIR, 'fastenloc', 'torus_based', 'fastenloc-J15-Whole_Blood.enloc.sig.out.gz'))

In [13]:
genes_mapping_1['SLCO4C1']

'ENSG00000173930'

In [14]:
genes_mapping_0['ENSG00000238009']

'RP11-34P13.7'

In [15]:
t.sort_values(ascending=False).head()

SLCO4C1    1.869481e-07
ENPP4      2.257609e-08
EIF3F      1.147284e-08
CREM       1.070100e-08
RNF144A    4.772313e-09
Name: rcp, dtype: float64

In [16]:
assert t['SLCO4C1'] == (1.869e-07 + 1.578e-11 + 5.201e-12 + 1.409e-13 + 2.694e-11)

In [17]:
assert t['RP11-34P13.7'] == 0.00, t['RP11-34P13.7']

```parallel -j4 'zcat {} | grep ENSG00000049246' ::: fastenloc-1180-*.gz | column -t | sort -k6 -g```

In [18]:
display(all_fastenloc_phenotypes[0])
t = read_pheno(all_fastenloc_phenotypes[0])

'J15'

In [19]:
t.sort_values(ascending=False).head()

BMP8B      0.011245
OXCT2      0.009545
SLCO4C1    0.000965
ENPP4      0.000835
ENPP5      0.000821
dtype: float64

In [20]:
genes_mapping_1['BMP8B']

'ENSG00000116985'

In [21]:
t.loc['BMP8B']

0.01124506203

In [22]:
assert t.loc['BMP8B'] == (1.124e-02 + 5.425e-07 + 1.153e-08 + 2.864e-06 + 1.644e-06)

# Get all S-MultiXcan phenos

In [23]:
all_smultixcan_files = glob(os.path.join(constants.SMULTIXCAN_MASHR_RESULTS_DIR, '*.tsv.gz'))
all_smultixcan_phenotypes = [MXPhenoResults(p) for p in all_smultixcan_files]

In [24]:
assert len(all_smultixcan_files) == len(all_smultixcan_phenotypes) == N_EXPECTED_PHENOTYPES

## Compute results

In [25]:
def _get_combined_results(phenos):
    return {pheno.pheno_info.get_plain_name() : read_pheno(pheno.pheno_info.pheno_code) for pheno in phenos}

In [None]:
# testing
_tmp = _get_combined_results(all_smultixcan_phenotypes[:4])
assert len(_tmp) == 4

In [40]:
_pending = read_pheno('C_MYELOID')

In [41]:
_pending.head()

DFNA5       7.883540e-10
RPS27       4.784000e-12
KANSL1      2.464491e-10
BROX        4.186500e-11
ARHGEF18    3.673000e-10
dtype: float64

### pvalues and z-scores

In [27]:
output_file = os.path.join(OUTPUT_DIR, f'fastenloc-rcp.pkl.xz')
display(output_file)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/fastenloc/fastenloc-rcp.pkl.xz'

In [28]:
def _run_all(phenotype_chunks, n_jobs=20):
    all_results = {}
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk) for chunk in phenotype_chunks]
        for future in as_completed(tasks):
            res = future.result()
            all_results.update(res)
    
    # Save all results from S-PrediXcan
    with open(output_file, 'wb') as f:
        pickle.dump(all_results, f)

In [29]:
# phenotype_chunks = list(chunker(all_smultixcan_phenotypes[:5], 2))
phenotype_chunks = chunker(all_smultixcan_phenotypes, 200)

In [None]:
_run_all(phenotype_chunks)

## Load results and save as DataFrame

In [30]:
with open(output_file, 'rb') as f:
    all_results = pickle.load(f)

In [46]:
# all_results['C_MYELOID-LEUKAEMIA-Myeloid_leukaemia'] = _pending

In [47]:
# with open(output_file, 'wb') as f:
#     pickle.dump(all_results, f)

In [48]:
assert len(all_results) == N_EXPECTED_PHENOTYPES

In [49]:
spredixcan_genes_associations = pd.DataFrame(all_results)
spredixcan_genes_associations.index.rename('gene_name', inplace=True)

assert spredixcan_genes_associations.index.is_unique

display(spredixcan_genes_associations.shape)
display(spredixcan_genes_associations.head())

(22233, 4083)

Unnamed: 0_level_0,L12_NAILDIS-Nail_disorders,J69-Diagnoses_main_ICD10_J69_Pneumonitis_due_to_solids_and_liquids,R07-Diagnoses_main_ICD10_R07_Pain_in_throat_and_chest,M46-Diagnoses_main_ICD10_M46_Other_inflammatory_spondylopathies,20003_2038460150-Treatmentmedication_code_paracetamol,E10-Diagnoses_main_ICD10_E10_Insulindependent_diabetes_mellitus,5134_raw-6mm_strong_meridian_left,E86-Diagnoses_main_ICD10_E86_Volume_depletion,L12_ERYTHEMATOUSOTH-Other_erythematous_conditions,K83-Diagnoses_main_ICD10_K83_Other_diseases_of_biliary_tract,...,20090_394-Type_of_fatoil_used_in_cooking_Unknown_soft_margarine,22617_3512-Job_SOC_coding_Aircraft_pilots_and_flight_engineers,6034-Target_heart_rate_achieved,20003_1140883066-Treatmentmedication_code_insulin_product,22601_41223241-Job_coding_accounts_and_wages_clerkassistantsupervisor_bookkeeper_cost_or_ledger_clerk_audit_assistant_budget_officer_student_loans_officer_paymaster,I82-Diagnoses_main_ICD10_I82_Other_venous_embolism_and_thrombosis,20107_12-Illnesses_of_father_Severe_depression,B07-Diagnoses_main_ICD10_B07_Viral_warts,22601_12253140-Job_coding_sports_centre_manager_riding_school_owner_sports_ground_manager_baths_manager,2664_2-Reason_for_reducing_amount_of_alcohol_drunk_Doctors_advice
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,1.5153e-08,2.216225e-08,0.005937,5.48296e-10,0.006081,0.000458,0.006905,2.241375e-10,1.7531e-09,2.7742e-05,...,0.000292,1.49871e-10,6.837e-06,0.000714,1.50342e-10,5.317725e-08,0.0005242997,1.807621e-08,7.5456e-08,0.000154
A1CF,2.796517e-11,1.696862e-10,0.00197,1.021794e-10,0.010776,6e-06,0.00585,7.8333e-11,2.507511e-10,3.895151e-10,...,0.000225,1.869519e-10,4.08106e-11,0.00027,2.78311e-11,1.083806e-10,3.367239e-07,5.118762e-11,1.050704e-10,0.000449
A2M,1.92698e-08,7.8582e-09,0.000947,5.466e-11,0.003225,2.5e-05,0.003633,4.74549e-10,5.6103e-10,9.63365e-06,...,8.1e-05,1.60843e-10,3.14128e-06,0.000401,1.096541e-10,3.252662e-08,0.00056958,1.128942e-08,1.132e-08,0.00013
A2ML1,1.14227e-09,3.4926e-09,0.000428,1.4761e-10,0.004078,0.000247,0.018624,3.0557e-10,3.5895e-10,3.79213e-06,...,0.000197,2.7592e-10,7.272498e-06,0.000148,1.64236e-11,8.91271e-08,0.000874904,1.3268e-10,2.2358e-09,5e-06
A3GALT2,1.99748e-08,5.515e-10,0.002141,2.41272e-09,0.004778,0.0002,0.14135,1.314787e-10,1.59708e-09,2.149724e-05,...,0.000311,2.357997e-10,1.82613e-09,0.000989,6.6555e-11,1.471335e-10,0.0006330287,4.799799e-09,1.11504e-07,5e-05


In [50]:
spredixcan_genes_associations.isna().sum().sum()

0

In [51]:
# _tmp = spredixcan_genes_associations.dropna(axis=1, how='all').columns

In [52]:
# spredixcan_genes_associations.columns.difference(_tmp)

In [53]:
# Remove genes with no results
spredixcan_genes_associations = spredixcan_genes_associations.dropna(axis=0, how='all')
assert spredixcan_genes_associations.isna().sum().sum() == 0

display(spredixcan_genes_associations.shape)

(22233, 4083)

In [55]:
assert spredixcan_genes_associations.shape == (22233, N_EXPECTED_PHENOTYPES)

In [56]:
# some testing

# For FinnGen
# assert spredixcan_genes_associations.loc['ENSG00000110628', 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.005086576789507484
# assert spredixcan_genes_associations.loc['ENSG00000169783', 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 0.3757187601354043
# assert spredixcan_genes_associations.loc['ENSG00000137959', 'C3_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue'] == 5.132614371931036e-07
# # For ICD10
# assert spredixcan_genes_associations.loc['ENSG00000135775', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 2.795075036067939e-05
# assert spredixcan_genes_associations.loc['ENSG00000169783', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 0.06668736815697908
# assert spredixcan_genes_associations.loc['ENSG00000174226', 'N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure'] == 0.02496852053808064

In [57]:
# Save
spredixcan_genes_associations_filename = os.path.join(OUTPUT_DIR, f'fastenloc-torus-genes_rcp.pkl.xz')
display(spredixcan_genes_associations_filename)

'/mnt/phenomexcan/results/preprocessed_data/gene_associations/fastenloc/fastenloc-torus-genes_rcp.pkl.xz'

In [None]:
spredixcan_genes_associations.to_pickle(spredixcan_genes_associations_filename)

# Export tables for web publication

In [None]:
import yaml
from clustering.biclustering.analysis import Trait

In [None]:
spredixcan_genes_associations.head()

In [None]:
ukb_traits = spredixcan_genes_associations.columns.tolist()

In [None]:
len(ukb_traits)

In [None]:
ukb_traits[:5]

In [None]:
file_template = '{trait_code}'

In [None]:
filename_to_description = {}

for t in ukb_traits:
    trait = Trait(t)
    trait_code = trait.trait_code
    filename = file_template.format(trait_code=trait_code)
    
    trait_plain_name = trait.trait_plain_name
    if pd.isnull(trait_plain_name):
        trait_plain_name = filename
    
    filename_to_description[filename] = trait_plain_name

In [None]:
# should be empty
[(k, v) for k, v in filename_to_description.items() if pd.isnull(v)]

In [None]:
len(filename_to_description)

In [None]:
with open('/mnt/tmp/ukb_filename_maps.yml', 'w') as outfile:
    yaml.dump({'ukb_name': filename_to_description}, outfile, default_flow_style=False, default_style='yaml_dump')

Now I need to copy this file to CRI to include only these genes