In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

import settings as conf
import metadata
from utils import is_number, chunker
from results.multixcan import MXPhenoInfo, MXPhenoResults

# fastENLOC reading functions

In [3]:
with open(conf.FASTENLOC_GTEX_TISSUES_FILE, 'r') as f:
    FASTENLOC_TISSUES_NAMES = set([x.strip() for x in f.readlines()])

In [4]:
# create summary of files
_path = os.path.join(conf.FASTENLOC_RESULTS_DIR['RapidGWASProject'], '**/*.enloc.sig.out')
display(_path)
all_fastenloc_results_files = glob(_path)

'/mnt/phenomexcan_base/results/fastenloc/rapid_gwas_project/**/*.enloc.sig.out'

In [5]:
len(all_fastenloc_results_files)

198401

In [6]:
PHENO_CODES = set(metadata.RAPID_GWAS_PHENO_INFO.index)

In [7]:
len(PHENO_CODES)

4359

In [8]:
phenos = []
tissues = []

for f in all_fastenloc_results_files:
    f = os.path.basename(f)
    split = f.split('-')
    
    tiss = split[-1].split('.')[0]
    if tiss not in FASTENLOC_TISSUES_NAMES:
        tiss = '-'.join(split[-2:]).split('.')[0]
    
    phen = split[1]
    if phen not in PHENO_CODES:
        phen = '-'.join(split[1:3])
    
    phenos.append(phen)
    tissues.append(tiss)

results_summary = pd.DataFrame({'pheno': phenos, 'tissue': tissues, 'file': all_fastenloc_results_files})

In [9]:
display(results_summary.shape)
assert results_summary.shape[0] == int(conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject'] * conf.GTEX_MODELS_N_EXPECTED_TISSUES)

(198401, 3)

In [10]:
results_summary.head()

Unnamed: 0,pheno,tissue,file
0,22617_9219,Brain_Spinal_cord_cervical_c-1,/mnt/phenomexcan_base/results/fastenloc/rapid_...
1,22617_9219,Artery_Coronary,/mnt/phenomexcan_base/results/fastenloc/rapid_...
2,22617_9219,Brain_Cortex,/mnt/phenomexcan_base/results/fastenloc/rapid_...
3,22617_9219,Brain_Substantia_nigra,/mnt/phenomexcan_base/results/fastenloc/rapid_...
4,22617_9219,Breast_Mammary_Tissue,/mnt/phenomexcan_base/results/fastenloc/rapid_...


In [11]:
results_summary['pheno'].unique().shape

(4049,)

In [12]:
assert np.all([t in PHENO_CODES for t in results_summary['pheno'].unique()])

In [13]:
assert results_summary['tissue'].unique().shape[0] == 49

In [14]:
assert np.all([t in FASTENLOC_TISSUES_NAMES for t in results_summary['tissue'].unique()])

In [15]:
_all_tissues_in_results = results_summary['tissue'].unique()
assert np.all([t in _all_tissues_in_results for t in FASTENLOC_TISSUES_NAMES])

In [16]:
assert len(all_fastenloc_results_files) == int(conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject'] * conf.GTEX_MODELS_N_EXPECTED_TISSUES)

In [17]:
def read_fe(filename): 
    fe_data = pd.read_csv(filename, sep='\s+', usecols=[0, 5], header=None).rename(columns={0: 'gene_cluster', 5: 'rcp'})
    fe_data = fe_data = fe_data.assign(gene=fe_data['gene_cluster'].apply(lambda x: x.split(':')[0])) 
    fe_data = fe_data.assign(gene_cluster=fe_data['gene_cluster'].apply(lambda x: x.split(':')[1]))
    
    fe_max = fe_data.groupby('gene')['rcp'].sum()
    return fe_max

def read_pheno(pheno): 
    _tmp = results_summary[results_summary['pheno'] == pheno] 
    if _tmp.shape[0] == 0: 
        return None 
     
    _res = {} 
    _all_genes = set() 
    for idx, _data in _tmp.iterrows(): 
        tissue_data = read_fe(_data.file) 
        _res[_data.tissue] = tissue_data 
        _all_genes.update(tissue_data.index) 
    
    return pd.DataFrame(_res, index=_all_genes).fillna(-1).max(axis=1)

### Testing

In [18]:
# testing
t = read_fe(os.path.join(conf.FASTENLOC_RESULTS_DIR['RapidGWASProject'], 'J15/fastenloc-J15-Whole_Blood.enloc.sig.out'))

In [19]:
t.sort_values(ascending=False).head()

gene
ENSG00000173930    1.869481e-07
ENSG00000260329    5.090216e-08
ENSG00000001561    2.257609e-08
ENSG00000231769    2.001572e-08
ENSG00000175390    1.147284e-08
Name: rcp, dtype: float64

In [20]:
_gid = metadata.GENE_NAME_TO_ID_MAP['SLCO4C1']
assert t[_gid] == (1.869e-07 + 1.578e-11 + 5.201e-12 + 1.409e-13 + 2.694e-11)

In [21]:
_gid = metadata.GENE_NAME_TO_ID_MAP['RP11-34P13.7']
assert t[_gid] == 0.00, t[_gid]

```parallel -j4 'zcat {} | grep ENSG00000049246' ::: fastenloc-1180-*.gz | column -t | sort -k6 -g```

In [22]:
t = read_pheno('J15')

In [23]:
t.sort_values(ascending=False).head()

ENSG00000116985    0.011245
ENSG00000198754    0.009545
ENSG00000279039    0.002292
ENSG00000173930    0.000965
ENSG00000231769    0.000865
dtype: float64

In [24]:
t.loc['ENSG00000116985']

0.01124506203

In [25]:
assert t.loc['ENSG00000116985'] == (1.124e-02 + 5.425e-07 + 1.153e-08 + 2.864e-06 + 1.644e-06)

# ENLOC reading functions

In [26]:
display(conf.FASTENLOC_RESULTS_DIR['GTEX_GWAS'])

'/mnt/phenomexcan_base/results/fastenloc/gtex_gwas'

In [27]:
def read_enloc(enloc_result_path):
    enloc_filename = enloc_result_path.split('/')[-1]
    phenotype = enloc_filename.split('__PM__')[0]
    tissue = enloc_filename.split('__PM__')[1].split('.enloc.rst.gz')[0]
    
    enloc_result = pd.read_csv(enloc_result_path, sep='\s+')
    enloc_result = enloc_result.assign(gene_id=enloc_result['molecular_qtl_trait'].apply(lambda x: x.split('.')[0]))
    
    return enloc_result.groupby('gene_id')['locus_rcp'].max().rename(tissue)

def read_enloc_pheno(phenotype_code):
    all_tissues_data = {}
    all_genes = set()
    
    all_phenotype_files = glob(os.path.join(conf.FASTENLOC_RESULTS_DIR['GTEX_GWAS'], phenotype_code, f'{phenotype_code}__PM__*.enloc.rst'))
    
    for f in all_phenotype_files:
        f_data = read_enloc(f)
        all_tissues_data[f] = f_data
        all_genes.update(f_data.index)
    
    return pd.DataFrame(all_tissues_data, index=all_genes).fillna(-1).max(axis=1)

In [28]:
t = read_enloc_pheno('SSGAC_Education_Years_Pooled')

In [29]:
t.head()

ENSG00000165476    0.052
ENSG00000166090    0.001
ENSG00000253892    0.000
ENSG00000105976    0.000
ENSG00000242618    0.000
dtype: float64

In [30]:
# testing
assert t.loc['ENSG00000106113'] == 0.00
assert t.loc['ENSG00000164050'] == 0.006
assert t.loc['ENSG00000081377'] == 0.317

# Get Rapid GWAS phenotypes

In [31]:
all_smultixcan_files = glob(os.path.join(conf.SMULTIXCAN_RESULTS_DIR['RapidGWASProject'], '*.tsv.gz'))
all_smultixcan_phenotypes = [MXPhenoResults(p) for p in all_smultixcan_files]

In [32]:
assert len(all_smultixcan_files) == len(all_smultixcan_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['RapidGWASProject']

In [33]:
all_smultixcan_files[:5]

['/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_20096_1_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_2345_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_N49_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_100011_raw_ccn30.tsv.gz',
 '/mnt/phenomexcan_base/results/smultixcan/rapid_gwas_project/smultixcan_5221_ccn30.tsv.gz']

# Get GTEx GWAS phenotypes

In [34]:
_path = os.path.join(conf.SMULTIXCAN_RESULTS_DIR['GTEX_GWAS'], '*')
display(_path)
all_extra_results_files = glob(_path)
assert len(all_extra_results_files) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

'/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/*'

In [35]:
all_extra_results_files[:5]

['/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/MAGNETIC_LDL.C_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt.gz',
 '/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/BCAC_Overall_BreastCancer_EUR_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt.gz',
 '/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/Astle_et_al_2016_Sum_neutrophil_eosinophil_counts_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt.gz',
 '/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/BCAC_ER_negative_BreastCancer_EUR_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt.gz',
 '/mnt/phenomexcan_base/results/smultixcan/gtex_gwas/MAGNETIC_IDL.TG_smultixcan_imputed_gwas_gtexv8mashr_ccn30.txt.gz']

In [36]:
_file_pattern = '(?P<code>[^/]+)_smultixcan_imputed_gwas_gtexv8mashr_ccn30\.txt'
all_extra_phenotypes = [MXPhenoResults(p, _file_pattern) for p in all_extra_results_files]
all_extra_phenotypes_plain_names = pd.Index([p.pheno_info.get_plain_name() for p in all_extra_phenotypes])

display(len(all_extra_phenotypes))
assert len(all_extra_phenotypes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

42

In [37]:
all_extra_phenotypes[0].pheno_info.get_plain_name()

'MAGNETIC_LDL.C'

In [38]:
all_extra_phenotypes_codes = [os.path.basename(f).split('_smultixcan_imputed_gwas_gtexv8mashr_ccn30')[0] for f in all_extra_results_files]
assert len(all_extra_phenotypes_codes) == conf.SMULTIXCAN_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [39]:
all_extra_phenotypes_codes[:5]

['MAGNETIC_LDL.C',
 'BCAC_Overall_BreastCancer_EUR',
 'Astle_et_al_2016_Sum_neutrophil_eosinophil_counts',
 'BCAC_ER_negative_BreastCancer_EUR',
 'MAGNETIC_IDL.TG']

In [40]:
all_extra_phenotypes_codes = set(all_extra_phenotypes_codes)

# Read all results

In [41]:
def read_generic(pheno_code):
    if pheno_code in all_extra_phenotypes_codes:
        return read_enloc_pheno(pheno_code)
    else:
        return read_pheno(pheno_code)

def _get_combined_results(phenos):
    return {pheno.pheno_info.get_plain_name() : read_generic(pheno.pheno_info.pheno_code) for pheno in phenos}

In [42]:
# testing
_tmp = _get_combined_results(all_smultixcan_phenotypes[:4])
assert len(_tmp) == 4

In [43]:
# testing
_tmp = _get_combined_results(all_extra_phenotypes[:4])
assert len(_tmp) == 4

In [44]:
_pending = read_generic('C_MYELOID-LEUKAEMIA')

In [45]:
_pending.head()

ENSG00000165476    1.829890e-10
ENSG00000183431    5.735600e-11
ENSG00000261522    0.000000e+00
ENSG00000145794    3.075410e-11
ENSG00000177370    1.187800e-10
dtype: float64

In [46]:
_pending = read_generic('SSGAC_Education_Years_Pooled')

In [47]:
_pending.head()

ENSG00000165476    0.052
ENSG00000166090    0.001
ENSG00000253892    0.000
ENSG00000105976    0.000
ENSG00000242618    0.000
dtype: float64

In [48]:
def _run_all(phenotype_chunks, n_jobs=conf.N_JOBS_HIGH):
    all_results = {}
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        tasks = [executor.submit(_get_combined_results, chunk) for chunk in phenotype_chunks]
        for future in as_completed(tasks):
            res = future.result()
            all_results.update(res)

    return all_results

In [49]:
# phenotype_chunks = chunker(all_smultixcan_phenotypes[:5] + all_extra_phenotypes[:5], 2)
phenotype_chunks = chunker(all_smultixcan_phenotypes + all_extra_phenotypes, 200)

In [50]:
all_results = _run_all(phenotype_chunks)

## Save as DataFrame

In [51]:
assert len(all_results) == conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject'] + conf.FASTENLOC_EXPECTED_PHENOTYPES['GTEX_GWAS']

In [52]:
fastenloc_genes_associations = pd.DataFrame(all_results)
fastenloc_genes_associations.index.rename('gene_id', inplace=True)

assert fastenloc_genes_associations.index.is_unique

display(fastenloc_genes_associations.shape)
display(fastenloc_genes_associations.head())

(37967, 4091)

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,2.1732e-07,3.6e-05,1.020227e-10,2e-05,0.001497,3.42677e-11,1.3123e-10,1.5086e-09,1.1145e-05,4.1646e-09,...,0.001,,0.0,0.0,,0.001,,,,0.0
ENSG00000000457,1.3439e-06,0.000692,8.21606e-11,8.9e-05,0.004363,1.342669e-10,4.223476e-10,2.036232e-08,0.0004598924,3.57085e-08,...,0.0,,,,,0.0,,,,
ENSG00000000460,8.75775e-06,0.001713,1.11901e-10,1.6e-05,0.003566,1.533228e-10,3.7123e-10,6.5057e-09,1.8302e-05,6.0029e-08,...,0.0,,,,,0.0,,,,
ENSG00000000938,7.197e-08,0.000446,3.573442e-10,6e-06,0.004429,2.588022e-08,3.4615e-10,9.1102e-09,1.3514e-05,9.878e-09,...,0.263,,0.002,,,,,,,0.0
ENSG00000000971,2.72e-07,0.00031,1.3973e-10,0.000182,0.00316,3.940395e-11,1.7856e-10,2.329954e-10,5.325e-07,4.95874e-08,...,0.0,0.0,,,,0.0,,,,0.0


In [53]:
# check genes id format
_tmp = pd.Series(fastenloc_genes_associations.index).apply(len).value_counts()
display(_tmp)
assert _tmp.shape[0] == 1
assert _tmp.index[0] == 15

15    37967
Name: gene_id, dtype: int64

In [54]:
# check columns data type
_tmp = fastenloc_genes_associations.dtypes.value_counts()
display(_tmp)
assert _tmp.shape[0] == 1

float64    4091
dtype: int64

In [55]:
fastenloc_genes_associations.isna().sum().sum()

3341082

In [56]:
fastenloc_genes_associations.dropna(axis=0, how='all').shape

(37967, 4091)

In [57]:
display(fastenloc_genes_associations.shape)

(37967, 4091)

In [58]:
assert fastenloc_genes_associations.shape == (37967, conf.FASTENLOC_EXPECTED_PHENOTYPES['RapidGWASProject'] + conf.FASTENLOC_EXPECTED_PHENOTYPES['GTEX_GWAS'])

In [59]:
# some testing
t = fastenloc_genes_associations['SSGAC_Education_Years_Pooled']

In [60]:
t.head()

gene_id
ENSG00000000419    0.001
ENSG00000000457    0.000
ENSG00000000460    0.000
ENSG00000000938      NaN
ENSG00000000971      NaN
Name: SSGAC_Education_Years_Pooled, dtype: float64

In [61]:
assert t.loc['ENSG00000106113'] == 0.00
assert t.loc['ENSG00000164050'] == 0.006
assert t.loc['ENSG00000081377'] == 0.317
assert pd.isnull(t.loc['ENSG00000000938'])

### Save

In [None]:
os.makedirs(conf.GENE_ASSOC_DIR, exist_ok=True)

In [62]:
# Save
fastenloc_genes_associations_filename = os.path.join(conf.GENE_ASSOC_DIR, f'fastenloc-torus-rcp.pkl.xz')
display(fastenloc_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [63]:
fastenloc_genes_associations.to_pickle(fastenloc_genes_associations_filename)

### Save for publication

In [64]:
output_file = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.tsv.gz')
display(output_file)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.tsv.gz'

In [65]:
fastenloc_genes_associations.to_csv(output_file, sep='\t', float_format='%.4e')

In [66]:
# test "for publication" file
_tmp = pd.read_csv(output_file, sep='\t', index_col='gene_id')

In [67]:
display(_tmp.shape)
assert _tmp.shape == fastenloc_genes_associations.shape

(37967, 4091)

In [68]:
_tmp.head()

Unnamed: 0_level_0,20096_1-Size_of_red_wine_glass_drunk_small_125ml,2345-Ever_had_bowel_cancer_screening,N49-Diagnoses_main_ICD10_N49_Inflammatory_disorders_of_male_genital_organs_not_elsewhere_classified,100011_raw-Iron,5221-Index_of_best_refractometry_result_right,20003_1141150624-Treatmentmedication_code_zomig_25mg_tablet,S69-Diagnoses_main_ICD10_S69_Other_and_unspecified_injuries_of_wrist_and_hand,20024_1136-Job_code_deduced_Information_and_communication_technology_managers,20002_1385-Noncancer_illness_code_selfreported_allergy_or_anaphylactic_reaction_to_food,G6_SLEEPAPNO-Sleep_apnoea,...,Astle_et_al_2016_Sum_basophil_neutrophil_counts,RA_OKADA_TRANS_ETHNIC,pgc.scz2,PGC_ADHD_EUR_2017,MAGIC_FastingGlucose,Astle_et_al_2016_Red_blood_cell_count,SSGAC_Depressive_Symptoms,BCAC_ER_positive_BreastCancer_EUR,IBD.EUR.Inflammatory_Bowel_Disease,Astle_et_al_2016_High_light_scatter_reticulocyte_count
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000419,2.1732e-07,3.6e-05,1.0202e-10,2e-05,0.001497,3.4268e-11,1.3123e-10,1.5086e-09,1.1145e-05,4.1646e-09,...,0.001,,0.0,0.0,,0.001,,,,0.0
ENSG00000000457,1.3439e-06,0.000692,8.2161e-11,8.9e-05,0.004363,1.3427e-10,4.2235e-10,2.0362e-08,0.00045989,3.5708e-08,...,0.0,,,,,0.0,,,,
ENSG00000000460,8.7578e-06,0.001713,1.119e-10,1.6e-05,0.003566,1.5332e-10,3.7123e-10,6.5057e-09,1.8302e-05,6.0029e-08,...,0.0,,,,,0.0,,,,
ENSG00000000938,7.197e-08,0.000446,3.5734e-10,6e-06,0.00443,2.588e-08,3.4615e-10,9.1102e-09,1.3514e-05,9.878e-09,...,0.263,,0.002,,,,,,,0.0
ENSG00000000971,2.72e-07,0.00031,1.3973e-10,0.000182,0.00316,3.9404e-11,1.7856e-10,2.33e-10,5.325e-07,4.9587e-08,...,0.0,0.0,,,,0.0,,,,0.0


In [69]:
assert not _tmp.isin([np.inf, -np.inf]).any().any()

In [70]:
assert _tmp.isna().any().any()

In [71]:
_tmp_flat = pd.Series(_tmp.values.flatten()).dropna()
assert ((_tmp_flat >= 0) & (_tmp_flat <= 3)).all().all()

In [72]:
assert np.allclose(_tmp.values, fastenloc_genes_associations.values, atol=1e-320, rtol=1e-4, equal_nan=True)

### Save in HDF5 format for webapp

In [73]:
display(fastenloc_genes_associations_filename)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.pkl.xz'

In [74]:
# FIXME: it's not necessary to load again
fastenloc_gene_associations = pd.read_pickle(fastenloc_genes_associations_filename)

In [75]:
fastenloc_gene_associations.shape

(37967, 4091)

In [76]:
from utils import simplify_string_for_hdf5

In [77]:
FASTENLOC_HDF5_FILE = os.path.join(conf.GENE_ASSOC_DIR, 'fastenloc-torus-rcp.h5')
display(FASTENLOC_HDF5_FILE)

'/mnt/phenomexcan_base/gene_assoc/fastenloc-torus-rcp.h5'

In [78]:
with pd.HDFStore(FASTENLOC_HDF5_FILE, mode='w', complevel=1) as store:
    for col in fastenloc_gene_associations.columns:
        print('.', flush=True, end='')
        
        clean_col = simplify_string_for_hdf5(col)
        store[clean_col] = fastenloc_gene_associations[col]

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [79]:
# testing
with pd.HDFStore(FASTENLOC_HDF5_FILE, mode='r') as store:
    store_keys = list(store.keys())
    assert len(store_keys) == fastenloc_gene_associations.shape[1]
    display(store_keys[:5])
    
    clean_col = simplify_string_for_hdf5('100001_raw-Food_weight')
    data = store[clean_col]
    assert data.shape == (37967,), data.shape
    assert data.loc['ENSG00000267462'] == 3.6406000000000003e-06

['/c100001_raw_Food_weight',
 '/c100002_raw_Energy',
 '/c100003_raw_Protein',
 '/c100004_raw_Fat',
 '/c100005_raw_Carbohydrate']