In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
from scipy import stats

import settings as conf
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

In [3]:
os.path.join(conf.DELIVERABLES_DIR, 'genes_info.tsv.gz')

'/mnt/phenomexcan_base/deliverables/genes_info.tsv.gz'

In [4]:
gene_mappings = pd.read_csv(os.path.join(conf.DELIVERABLES_DIR, 'genes_info.tsv.gz'), sep='\t')

In [5]:
gene_mappings.head()

Unnamed: 0,gene_id,gene,gene_name,gene_type,band
0,ENSG00000000419,ENSG00000000419.12,DPM1,protein_coding,20q13.13
1,ENSG00000000457,ENSG00000000457.13,SCYL3,protein_coding,1q24.2
2,ENSG00000000460,ENSG00000000460.16,C1orf112,protein_coding,1q24.2
3,ENSG00000000938,ENSG00000000938.12,FGR,protein_coding,1p35.3
4,ENSG00000000971,ENSG00000000971.15,CFH,protein_coding,1q31.3


# Load fastENLOC results

In [28]:
# Location of the pickled fastENLOC RCP matrix inside the gene-association directory.
# The file name has no placeholders, so a plain string literal is used (not an f-string).
fastenloc_rcp_file = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.pkl.xz')
display(fastenloc_rcp_file)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [29]:
fastenloc_gene_associations = pd.read_pickle(fastenloc_rcp_file)

In [30]:
fastenloc_gene_associations.shape

(37967, 4091)

In [31]:
fastenloc_gene_associations.head()

Unnamed: 0_level_0,20485-Ever_contemplated_selfharm,6160_2-Leisuresocial_activities_Pub_or_social_club,6139_3-Gas_or_solidfuel_cookingheating_An_open_solid_fuel_fire_that_you_use_regularly_in_winter_time,L12_FOLLICULARNAS-Other_and_unspecified_follicular_disorders,2654_8-Nonbutter_spread_type_details_Other_low_or_reduced_fat_spread,22617_8139-Job_SOC_coding_Assemblers_and_routine_operatives_nec,I9_CARDMPRI-Cardiomyopathies_Primaryintrinsic,20003_1141188442-Treatmentmedication_code_glucosamine_product,RHEUMA_SEROPOS_OTH-Otherunspecified_seropositiverheumatoid_arthritis,22601_11523251-Job_coding_office_manager_accountpayrolls_manager_administration_manager_litigationsolicitors_office_manager,...,PGC_ADHD_EUR_2017,BCAC_ER_negative_BreastCancer_EUR,SSGAC_Education_Years_Pooled,pgc.scz2,MAGNETIC_HDL.C,MAGIC_ln_FastingInsulin,Astle_et_al_2016_Sum_eosinophil_basophil_counts,SSGAC_Depressive_Symptoms,Jones_et_al_2016_SleepDuration,Astle_et_al_2016_Red_blood_cell_count
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,8e-06,0.000505,0.000305,3.30014e-11,3.9e-05,3.7367e-07,1e-05,1.7e-05,4.4e-05,2.3994e-07,...,0.0,,0.001,0.0,,,0.001,,,0.001
ENSG00000000457,0.000131,0.002876,0.003605,8.84029e-11,0.000203,7.7147e-07,0.000195,0.000601,0.000245,5.0476e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000460,1.4e-05,0.000975,0.005453,9.497296e-11,0.000147,1.0709e-06,2.9e-05,0.000224,0.004665,3.25856e-06,...,,,0.0,,,,0.0,,,0.0
ENSG00000000938,1.9e-05,0.000449,0.014776,8.85556e-11,9.1e-05,6.395e-07,9e-05,0.000767,0.000453,5.3252e-07,...,,,,0.002,,,0.001,,,
ENSG00000000971,6e-06,0.000674,0.001918,4.44882e-10,5.7e-05,1.796e-07,4.4e-05,4.9e-05,0.000212,9.644e-07,...,,,,,,,0.0,,,0.0


In [40]:
fastenloc_gene_associations.loc['ENSG00000267462', '100001_raw-Food_weight']

3.6406000000000003e-06

In [34]:
fastenloc_gene_associations.columns[fastenloc_gene_associations.columns.str.contains('food')]

Index(['6144_1-Never_eat_eggs_dairy_wheat_sugar_Eggs_or_foods_containing_eggs',
       '22601_12233383-Job_coding_restaurant_or_catering_manager_restaurateur_canteen_manager_takeaway_food_shop_manager',
       '102700-Starchy_food_consumers',
       '20002_1562-Noncancer_illness_code_selfreported_food_intolerance',
       '6144_4-Never_eat_eggs_dairy_wheat_sugar_Sugar_or_foodsdrinks_containing_sugar',
       '20003_1199-Treatmentmedication_code_food_supplementplantherbal_extract',
       '1478-Salt_added_to_food',
       'R63-Diagnoses_main_ICD10_R63_Symptoms_and_signs_concerning_food_and_fluid_intake',
       '20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food',
       '22601_71113328-Job_coding_sales_assistant_retailshop_assistant_counter_assistant_sub_post_office_assistant_takeaway_food_worker'],
      dtype='object')

In [47]:
import re

In [52]:
HDF5_KEY_NO_PATTERN = re.compile('[^0-9a-zA-Z_]')

In [56]:
re.sub(HDF5_KEY_NO_PATTERN, '_', '100001_raw-Food_weight')

'100001_raw_Food_weight'

In [41]:
from utils import simplify_string_for_hdf5

In [43]:
# Key under which this phenotype is looked up in the HDF5 file
# (presumably the sanitized form of the full phenotype code -- see utils.simplify_string_for_hdf5).
_pheno = simplify_string_for_hdf5('100001_raw-Food_weight')
display(_pheno)
# Build the path from the configured gene-association directory instead of
# hardcoding the absolute location, so the cell follows conf like the pickle cell above.
_tmp = pd.read_hdf(os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.h5'), key=_pheno)

'c_'

In [45]:
_tmp.head()

gene_id
ENSG00000000419    0.001
ENSG00000000457    0.000
ENSG00000000460    0.000
ENSG00000000938      NaN
ENSG00000000971    0.000
Name: Astle_et_al_2016_Red_blood_cell_count, dtype: float64

# Combine S-MultiXcan results into one file

In [52]:
smultixcan_mash_dir = conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject']
display(smultixcan_mash_dir)

# Destination of the combined results. It has to be defined before the %%bash
# cells that follow, because they receive it as "$output_file"; on a fresh
# kernel it would otherwise be undefined at that point.
output_file = '/mnt/tmp/output.tsv'

'/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project'

In [54]:
%%bash -s "$smultixcan_mash_dir" "$output_file"
bash combine_smultixcan_results.sh ${1} ${2} zcat --add-header

Using directory: /mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project
Using output file: /mnt/tmp/output.tsv
Using command: zcat
Adding header
Concatenating S-MultiXcan results


In [55]:
smultixcan_gtex_gwas_dir = conf.SMULTIXCAN_RESULTS_DIR['GTEX_GWAS']
display(smultixcan_gtex_gwas_dir)

'/mnt/phenomexcan_base/results/smultixcan/gtex_gwas'

In [56]:
%%bash -s "$smultixcan_gtex_gwas_dir" "$output_file"
bash combine_smultixcan_results.sh ${1} ${2} cat

Using directory: /mnt/phenomexcan_base/results/smultixcan/gtex_gwas
Using output file: /mnt/tmp/output.tsv
Using command: cat
Concatenating S-MultiXcan results


# Load phenotype information

In [35]:
os.path.join(conf.DELIVERABLES_DIR, 'phenotypes_info.tsv.gz')

'/mnt/phenomexcan_base/deliverables/phenotypes_info.tsv.gz'

In [36]:
pheno_info = pd.read_csv(os.path.join(conf.DELIVERABLES_DIR, 'phenotypes_info.tsv.gz'), sep='\t')

In [37]:
pheno_info.shape

(4091, 9)

In [38]:
pheno_info.head()

Unnamed: 0,full_code,short_code,description,unique_description,type,n,n_cases,n_controls,source
0,100001_raw-Food_weight,100001_raw,Food weight,Food weight,continuous_raw,51453,,,UK Biobank
1,100002_raw-Energy,100002_raw,Energy,Energy,continuous_raw,51453,,,UK Biobank
2,100003_raw-Protein,100003_raw,Protein,Protein,continuous_raw,51453,,,UK Biobank
3,100004_raw-Fat,100004_raw,Fat,Fat,continuous_raw,51453,,,UK Biobank
4,100005_raw-Carbohydrate,100005_raw,Carbohydrate,Carbohydrate,continuous_raw,51453,,,UK Biobank


In [60]:
pheno_id_to_desc = pheno_info[['short_code', 'description']].set_index('short_code').to_dict()['description']

In [61]:
pheno_id_to_desc['J45']

'Diagnoses - main ICD10: J45 Asthma'

In [62]:
pheno_id_to_full_code = pheno_info[['short_code', 'full_code']].set_index('short_code').to_dict()['full_code']

In [63]:
pheno_id_to_full_code['100001_raw']

'100001_raw-Food_weight'

# Sample of combined file

In [57]:
output_file = '/mnt/tmp/output.tsv'

In [58]:
_tmp = pd.read_csv(output_file, sep='\t', iterator=True).get_chunk(1000000)

In [59]:
_tmp.head()

Unnamed: 0,gene,gene_name,band,pheno_desc,pheno_source,pvalue,n,n_indep,p_i_best,t_i_best,rcp
0,ENSG00000116406.18,EDEM3,1q25.3,Energy,UK Biobank,2.9e-05,42,4,8e-06,Cells_EBV-transformed_lymphocytes,0.0002831
1,ENSG00000101182.14,PSMA7,20q13.33,Energy,UK Biobank,5.1e-05,48,3,0.001494,Pancreas,4.536e-07
2,ENSG00000233068.2,RP1-140K8.1,6p25.2,Energy,UK Biobank,6.1e-05,2,1,6.1e-05,Artery_Aorta,9.07e-07
3,ENSG00000071073.12,MGAT4A,2q11.2,Energy,UK Biobank,7.9e-05,48,7,0.000858,Muscle_Skeletal,1.363e-06
4,ENSG00000182224.11,CYB5D1,17p13.1,Energy,UK Biobank,8.8e-05,45,4,0.000686,Pituitary,1.4958e-05


In [60]:
_tmp.dtypes

gene             object
gene_name        object
band             object
pheno_desc       object
pheno_source     object
pvalue          float64
n                 int64
n_indep           int64
p_i_best        float64
t_i_best         object
rcp             float64
dtype: object

In [64]:
_tmp['gene'].apply(len).max()

18

In [67]:
_tmp['gene_name'].apply(len).max()

19

# Process S-MultiXcan results

In [40]:
def _get_pheno_ids(filename):
    """Return the (description, full code) pair of the phenotype named in a result file.

    `filename` is indexed with the 'file' key; the phenotype short code is the
    text found between 'smultixcan_' and '_ccn30' in that value. Both values are
    looked up in the notebook-level dictionaries keyed by short code, so an
    unknown code raises KeyError.
    """
    after_prefix = filename['file'].split('smultixcan_')[1]
    short_code = after_prefix.split('_ccn30')[0]
    description = pheno_id_to_desc[short_code]
    full_code = pheno_id_to_full_code[short_code]
    return description, full_code

def _process_chunk(data):
    """Select the result columns of a chunk and annotate each row with its phenotype.

    Parameters
    ----------
    data : pandas.DataFrame
        Chunk of the combined S-MultiXcan file. It must contain the columns
        'gene', 'gene_name', 'file', 'pvalue', 'n', 'n_indep', 't_i_best' and
        'p_i_best'.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'pvalue' is not null, without the 'file' column and with
        two extra columns: 'pheno_desc' and 'pheno_full_code'. An empty chunk
        yields an empty frame with the same columns.

    Notes
    -----
    Still TODO (not implemented here): add the gene band, add the fastENLOC
    RCP, and write the chunk to the output with no headers.
    """
    # take only selected columns
    data = data[['gene', 'gene_name', 'file', 'pvalue', 'n', 'n_indep', 't_i_best', 'p_i_best']]

    # drop nan results
    data = data.dropna(subset=['pvalue'])

    # Resolve the phenotype ids once per distinct file name rather than once
    # per row: a chunk holds many rows per file, and a row-wise
    # DataFrame.apply(..., result_type='expand') also breaks on empty chunks
    # (it returns a copy of the frame, which has no columns 0 and 1).
    ids_by_file = {
        file_name: _get_pheno_ids({'file': file_name})
        for file_name in data['file'].unique()
    }
    desc_by_file = {file_name: ids[0] for file_name, ids in ids_by_file.items()}
    full_code_by_file = {file_name: ids[1] for file_name, ids in ids_by_file.items()}

    # get phenotype and add columns
    data = data.assign(pheno_desc=data['file'].map(desc_by_file))
    data = data.assign(pheno_full_code=data['file'].map(full_code_by_file))
    data = data.drop(columns=['file'])

    return data

In [17]:
CHUNKSIZE=1000000

In [18]:
combined_file = pd.read_csv(output_file, sep='\t', iterator=True, chunksize=CHUNKSIZE)

In [44]:
# Read the first chunk, reusing CHUNKSIZE instead of repeating its literal value.
# NOTE(review): the saved run of this cell ended in a ParserError (rows with a
# different number of fields in the combined file) -- the input needs checking.
_tmp = combined_file.get_chunk(CHUNKSIZE)

ParserError: Error tokenizing data. C error: Expected 9 fields in line 2313, saw 15


In [20]:
_tmp.head()

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,p_i_best,t_i_best,status,file
0,ENSG00000267462.1,RP11-866E20.3,2e-05,2.0,2.0,0.0004429014,Testis,0,smultixcan_100001_raw_ccn30.tsv.gz
1,ENSG00000182901.16,RGS7,4e-05,20.0,9.0,1.246461e-05,Nerve_Tibial,0,smultixcan_100001_raw_ccn30.tsv.gz
2,ENSG00000134755.14,DSC2,4.8e-05,28.0,4.0,0.0001649988,Pancreas,0,smultixcan_100001_raw_ccn30.tsv.gz
3,ENSG00000176075.7,LINC00302,5.9e-05,1.0,1.0,5.872267e-05,Skin_Sun_Exposed_Lower_leg,0,smultixcan_100001_raw_ccn30.tsv.gz
4,ENSG00000257817.1,RP4-601P9.2,5.9e-05,8.0,5.0,9.454374e-07,Testis,0,smultixcan_100001_raw_ccn30.tsv.gz


In [41]:
_tmp2 = _process_chunk(_tmp)

In [43]:
_tmp2.head()

Unnamed: 0,gene,gene_name,pvalue,n,n_indep,t_i_best,p_i_best,pheno_desc,pheno_full_code
0,ENSG00000267462.1,RP11-866E20.3,2e-05,2.0,2.0,Testis,0.0004429014,Food weight,100001_raw-Food_weight
1,ENSG00000182901.16,RGS7,4e-05,20.0,9.0,Nerve_Tibial,1.246461e-05,Food weight,100001_raw-Food_weight
2,ENSG00000134755.14,DSC2,4.8e-05,28.0,4.0,Pancreas,0.0001649988,Food weight,100001_raw-Food_weight
3,ENSG00000176075.7,LINC00302,5.9e-05,1.0,1.0,Skin_Sun_Exposed_Lower_leg,5.872267e-05,Food weight,100001_raw-Food_weight
4,ENSG00000257817.1,RP4-601P9.2,5.9e-05,8.0,5.0,Testis,9.454374e-07,Food weight,100001_raw-Food_weight


In [None]:
# run in parallel and write several files, which will be combined at the end

## From Rapid GWAS project

In [None]:
# Glob pattern for the gzipped TSV entries in the Rapid GWAS project results directory.
_path = os.path.join(conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject'], '*.tsv.gz')
display(_path)
# NOTE: despite the "_dirs" suffix, this list holds the paths matched by '*.tsv.gz'.
all_smultixcan_results_dirs = glob(_path)
display(len(all_smultixcan_results_dirs))
# Stop here if the number of matches differs from the configured expected count.
assert len(all_smultixcan_results_dirs) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

In [None]:
# Wrap every matched path in an MXPhenoResults object and collect each phenotype's plain name.
all_smultixcan_phenotypes = [MXPhenoResults(p) for p in all_smultixcan_results_dirs]
all_smultixcan_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_smultixcan_phenotypes])

display(len(all_smultixcan_phenotypes))
# Sanity checks: expected number of phenotypes, and one plain name per phenotype.
assert len(all_smultixcan_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']
assert len(all_smultixcan_phenotypes) == len(all_smultixcan_phenotypes_plain_names)

## From GTEx GWAS manuscript

In [None]:
# Glob pattern for the '*_ccn30.txt' entries in the GTEx GWAS results directory.
_path = os.path.join(conf.SMULTIXCAN_RESULTS_DIR['GTEX_GWAS'], '*_ccn30.txt')
display(_path)
# NOTE: despite the "_dirs" suffix, this list holds the paths matched by '*_ccn30.txt'.
all_extra_results_dirs = glob(_path)
display(len(all_extra_results_dirs))
# Stop here if the number of matches differs from the configured expected count.
assert len(all_extra_results_dirs) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [None]:
# Regex passed to MXPhenoResults for these file names; it captures a 'code'
# group from the part before the fixed suffix. A raw string is used so that
# '\.' reaches the regex engine as an escaped dot without relying on Python's
# deprecated handling of unknown escape sequences in normal string literals.
_file_pattern = r'(?P<code>[^/]+)_smultixcan_imputed_gwas_gtexv8mashr_ccn30\.txt'
all_extra_phenotypes = [MXPhenoResults(p, _file_pattern) for p in all_extra_results_dirs]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
# Sanity check: expected number of GTEx GWAS phenotypes.
assert len(all_extra_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [None]:
# Combine the plain names of both result sets into one index.
all_phenotypes_plain_names = all_smultixcan_phenotypes_plain_names.union(all_extra_phenotypes_plain_names)
display(all_phenotypes_plain_names.shape)
# The union must be exactly as long as both inputs together
# (presumably a guard against a name appearing in both sets -- confirm).
assert all_phenotypes_plain_names.shape[0] == all_smultixcan_phenotypes_plain_names.shape[0] + all_extra_phenotypes_plain_names.shape[0]

## Run loading

In [None]:
os.makedirs(conf.WEBAPP_DIR, exist_ok=True)

In [None]:
def _get_combined_results(phenos, column):
    res = {}
    for pheno in phenos:
        key = pheno.pheno_info.get_plain_name()
        value = pheno.get_data(cols=['gene_name', column], index_col='gene_simple')[column]
        res[key] = value
    
    return res

In [None]:
def _run_all(column_name, phenotype_chunks, n_jobs=20):
    """Read `column_name` for every chunk of phenotypes using a process pool.

    Each chunk is submitted to `_get_combined_results` in a pool with `n_jobs`
    worker processes. The per-chunk dictionaries are merged into one as the
    tasks finish, and the merged dictionary is returned.
    """
    merged = {}

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        pending = []
        for chunk in phenotype_chunks:
            pending.append(pool.submit(_get_combined_results, chunk, column_name))

        # Merge in completion order; result() re-raises any worker exception.
        for finished in as_completed(pending):
            merged.update(finished.result())

    return merged

In [None]:
# Result columns to export, each paired with a dtype.
# NOTE(review): in the cells visible in this notebook only the keys are used
# (to iterate over columns and to check the number of stored tables); the
# dtypes do not appear to be applied anywhere -- confirm whether they should be.
COLUMNS_TO_READ = {
    'pvalue': np.float64,
    'n': np.uint16,
    'n_indep': np.uint16,
    'p_i_best': np.float64,
    't_i_best': 'category',
    'p_i_worst': np.float64,
    't_i_worst': 'category',
    'status': np.uint8,
    'gene_name': 'category',
}

In [None]:
# phenotype_chunks = chunker(all_smultixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)
phenotype_chunks = chunker(all_smultixcan_phenotypes + all_extra_phenotypes, 200)

phenotype_chunks = list(phenotype_chunks)

In [None]:
MULTIXCAN_H5_FILE = os.path.join(conf.WEBAPP_DIR, 'smultixcan_results-table_format.h5')
display(MULTIXCAN_H5_FILE)

In [None]:
# Write one HDF5 table per key of COLUMNS_TO_READ. The store is opened with
# mode='w', so any existing file at MULTIXCAN_H5_FILE is replaced.
with pd.HDFStore(MULTIXCAN_H5_FILE, mode='w', complevel=1) as store:
    for col in COLUMNS_TO_READ.keys():
        print(col, flush=True)

        # Dictionary {phenotype plain name: data for this column}, read in parallel.
        all_results = _run_all(col, phenotype_chunks)
        if len(all_results) == 0:
            raise Exception('No results')

        # One column per phenotype; both axes must be free of duplicates.
        df = pd.DataFrame(all_results)
        assert df.index.is_unique
        assert df.columns.is_unique
        # Remove rows that are null for every phenotype.
        df = df.dropna(axis=0, how='all')

        # Reshape the wide frame into a long, sorted Series whose two index
        # levels are named 'phenotype' and 'gene'.
        df = df.unstack().sort_index()
        df.index.rename(['phenotype', 'gene'], inplace=True)
        
        if col in ('t_i_best', 't_i_worst'):
            df = df.astype('category')
        elif col in ('n', 'n_indep', 'status'):
            # Format non-null values as integer strings (nulls become None)
            # and store the result as a categorical.
            df = df.apply(lambda x: f'{int(x):d}' if not pd.isnull(x) else None).astype('category')
        
        #store[col] = df
        store.put(col, df, format='table', chunksize=10000000)
        store.flush()
        # Drop the references to the large intermediates before the next column.
        del df, all_results

In [None]:
# Collect the distinct row counts of the tables stored in the HDF5 file.
lens = set()
with pd.HDFStore(MULTIXCAN_H5_FILE, mode='r') as store:
    # Keep only keys that split into exactly two parts on '/' (e.g. '/pvalue').
    keys = [k for k in store.keys() if len(k.split('/')) == 2]
    print(keys)
    # There must be one stored table per entry of COLUMNS_TO_READ.
    assert len(keys) == len(COLUMNS_TO_READ)
    for k in keys:
        # Each table is read in full just to obtain its number of rows.
        kl = pd.read_hdf(store, key=k).shape[0]
        lens.add(kl)

In [None]:
# there is more than one unique value because of the gene_name column, which is never null
lens

# Write full CSV file

In [None]:
print('Writing full CSV file')

In [None]:
output_file = os.path.join(conf.WEBAPP_DIR, 'smultixcan_table.tsv')
display(output_file)

In [None]:
# Stream the HDF5 tables into a single tab-separated file, reading every table
# in chunks of 100000 rows.
with pd.HDFStore(MULTIXCAN_H5_FILE, mode='r') as store:
    # Keep only keys that split into exactly two parts on '/' (e.g. '/pvalue').
    keys = [k for k in store.keys() if len(k.split('/')) == 2]
    
    # One chunk iterator per stored table.
    iterators = {k:pd.read_hdf(store, key=k, iterator=True, chunksize=100000).__iter__() for k in keys}
    
    idx = 0
    while True:
        try:
            # Next chunk of every table; the leading '/' of the key is dropped
            # to form the column name.
            chunks = {k[1:]:next(iterators[k]) for k in keys}
        except StopIteration:
            # NOTE(review): the loop stops as soon as any table is exhausted,
            # and chunks are taken by position. If the tables have different
            # lengths (see the `lens` check), the chunks may not cover the
            # same rows -- confirm this is safe.
            break
        
        # Combine the chunks into one frame (one column per table) and drop
        # rows that have no pvalue.
        df_final = pd.DataFrame(chunks).dropna(subset=['pvalue'])
        # The first chunk creates the file and writes the header; later chunks are appended.
        df_final.to_csv(output_file, sep='\t', mode='w' if idx == 0 else 'a', header=True if idx == 0 else False)
        idx = idx + 1

### Testing

In [None]:
with pd.HDFStore(MULTIXCAN_H5_FILE, mode='r') as store:
    keys = [k for k in store.keys() if len(k.split('/')) == 2]
    print(keys)
    df = pd.read_hdf(store, key='pvalue')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
print('running tests')

# The stored 'pvalue' series is indexed by (phenotype, gene), in that order:
# it is built by unstacking a frame with one column per phenotype, and its
# index levels are named ['phenotype', 'gene']. Every lookup below therefore
# gives the phenotype first; a (gene, phenotype) tuple would raise KeyError.

# For FinnGen
assert df.loc[('C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 'ENSG00000110628')] == 0.005086576789507484
assert df.loc[('C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 'ENSG00000169783')] == 0.3757187601354043
assert df.loc[('C_TONGUENAS-Malignant_neoplasm_of_other_and_unspecified_parts_of_tongue', 'ENSG00000137959')] == 5.132614371931036e-07

# For ICD10
assert df.loc[('N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 'ENSG00000135775')] == 2.795075036067939e-05
assert df.loc[('N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 'ENSG00000169783')] == 0.06668736815697908
assert df.loc[('N18-Diagnoses_main_ICD10_N18_Chronic_renal_failure', 'ENSG00000174226')] == 0.02496852053808064

# For extra phenotypes
assert df.loc[('MAGIC_ln_FastingInsulin', 'ENSG00000135775')] == 0.08712399858507687
assert df.loc[('IMMUNOBASE_Systemic_lupus_erythematosus_hg19', 'ENSG00000169783')] == 0.10974365378971256
assert df.loc[('pgc.scz2', 'ENSG00000158691')] == 2.698821020217747e-28