In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from glob import glob
import re
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd
#from tqdm import tqdm
from scipy import stats
from sklearn.metrics import pairwise_distances

import settings as conf
from results.multixcan import MXPhenoInfo, MXPhenoResults
from utils import is_number, chunker

# Load the OMIM silver standard used to map UK Biobank (UKB) traits to OMIM (MIM) identifiers

In [3]:
# Read the tab-separated silver standard from the configured data directory
# and, in the same chain, rename the two MIM columns: `pheno_mim` becomes
# `trait_mim` and `mim` becomes `gene_mim`.
omim_silver_standard = (
    pd.read_csv(
        os.path.join(conf.DATA_DIR, 'omim_silver_standard.tsv'),
        sep='\t',
    )
    .rename(columns={'pheno_mim': 'trait_mim', 'mim': 'gene_mim'})
)

In [4]:
# Dimensions (rows, columns) of the table as loaded, before any rows are dropped.
omim_silver_standard.shape

(7822, 7)

In [5]:
# Preview only (nothing is reassigned): shape that would remain after dropping
# rows with a missing value in `trait`, `trait_mim` or `ensembl_gene_id`.
omim_silver_standard.dropna(subset=['trait', 'trait_mim', 'ensembl_gene_id']).shape

(7809, 7)

In [6]:
# Keep only rows where `trait`, `trait_mim` and `ensembl_gene_id` are all
# present; a row is discarded as soon as any one of them is missing.
omim_silver_standard = omim_silver_standard.dropna(
    how='any',
    subset=['trait', 'trait_mim', 'ensembl_gene_id'],
)

In [7]:
# Dimensions of the table after rows with missing key fields were dropped.
omim_silver_standard.shape

(7809, 7)

In [8]:
# Show the first rows of the filtered table for a visual check of its columns.
omim_silver_standard.head()

Unnamed: 0,trait,trait_mim,gene_mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864


In [10]:
# Publication deliverable (xlsx): work out where the supplementary table goes.
output_dir = os.path.join(conf.DELIVERABLES_DIR, 'supp_tables')
output_file = os.path.join(output_dir, 'suppl_table_S2-UKBiobank_to_OMIM-standard.xlsx')

# Create the destination folder if it is missing, then show the full target path.
os.makedirs(output_dir, exist_ok=True)
display(output_file)

'/mnt/phenomexcan_base/deliverables/supp_tables/suppl_table_S2-UKBiobank_to_OMIM-standard.xlsx'

In [11]:
# Write the table to the xlsx file; index=False leaves the DataFrame's row
# index out of the spreadsheet.
omim_silver_standard.to_excel(output_file, index=False)

In [12]:
# Sanity check: read the spreadsheet that was just written back into a
# DataFrame so it can be compared with the in-memory table.
_tmp = pd.read_excel(output_file)

In [13]:
# Round-trip check: the spreadsheet read back from disk must agree with the
# table that was exported. Shape alone cannot reveal lost or altered headers,
# so the column names are compared as well. Cell values are intentionally not
# compared here, since dtypes may differ after an Excel round-trip.
assert omim_silver_standard.shape == _tmp.shape, (
    'shape changed after Excel round-trip',
    omim_silver_standard.shape,
    _tmp.shape,
)
assert list(omim_silver_standard.columns) == list(_tmp.columns), (
    'column names changed after Excel round-trip',
    list(omim_silver_standard.columns),
    list(_tmp.columns),
)

In [14]:
# Dimensions of the table read back from the spreadsheet.
_tmp.shape

(7809, 7)

In [15]:
# First rows of the table read back from the spreadsheet, for a visual
# comparison with the in-memory table.
_tmp.head()

Unnamed: 0,trait,trait_mim,gene_mim,entry_type,entrez_gene_id,approved_gene_symbol,ensembl_gene_id
0,M41-Diagnoses_main_ICD10_M41_Scoliosis,101800,188830,gene,5573,PRKAR1A,ENSG00000108946
1,M41-Diagnoses_main_ICD10_M41_Scoliosis,102500,600275,gene,4853,NOTCH2,ENSG00000134250
2,M41-Diagnoses_main_ICD10_M41_Scoliosis,105830,601623,gene,7337,UBE3A,ENSG00000114062
3,M41-Diagnoses_main_ICD10_M41_Scoliosis,108120,190990,gene,7169,TPM2,ENSG00000198467
4,M41-Diagnoses_main_ICD10_M41_Scoliosis,108145,613629,gene,63895,PIEZO2,ENSG00000154864
