In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob
import sqlite3

import numpy as np
import pandas as pd

import settings as conf

# Load all genes from models

In [3]:
# Collect every SQLite model database (*.db) found in the GTEx models directory.
# NOTE(review): glob() does not guarantee any ordering, and the position of each
# file in this list is later used as the tissue's numeric id; consider sorting
# if stable ids across machines are required.
gtex_models_files = glob(os.path.join(conf.GTEX_MODELS_DIR, '*.db'))
gtex_models_files[:3]

['/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/data/gtex_v8/mashr/mashr_Skin_Not_Sun_Exposed_Suprapubic.db',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/data/gtex_v8/mashr/mashr_Cells_EBV-transformed_lymphocytes.db',
 '/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/data/gtex_v8/mashr/mashr_Brain_Frontal_Cortex_BA9.db']

In [4]:
# should be 49 for GTEx v8 models
display(len(gtex_models_files))
assert len(gtex_models_files) == 49

49

In [5]:
# Read gene id, gene name and gene type from the `extra` table of every model
# database, keeping one data frame (and its row count) per file.

all_models = []
all_models_size = []

for model_file in gtex_models_files:
    cnx = sqlite3.connect(model_file)
    try:
        df = pd.read_sql_query("SELECT gene, genename as gene_name, gene_type FROM extra", cnx)
    finally:
        # sqlite3 connections are not closed automatically (not even when used
        # as a context manager), so release the file handle explicitly.
        cnx.close()

    # `gene` carries a version suffix (e.g. ENSG00000000457.13); `gene_id` is
    # the part before the first dot.
    df = df.assign(gene_id=df['gene'].apply(lambda x: x.split('.')[0]))

    all_models_size.append(df.shape[0])
    all_models.append(df)

In [6]:
genes_mapping = pd.concat(all_models, ignore_index=True)

In [7]:
genes_mapping.shape

(686241, 4)

In [8]:
genes_mapping.head(3)

Unnamed: 0,gene,gene_name,gene_type,gene_id
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457
1,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938
2,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971


In [9]:
genes_mapping = genes_mapping.drop_duplicates()

In [10]:
genes_mapping.shape

(22535, 4)

In [11]:
# after dropping duplicated rows, every versioned gene identifier appears once
assert genes_mapping['gene'].is_unique

In [12]:
# the unversioned gene id must be unique as well
assert genes_mapping['gene_id'].is_unique

In [13]:
genes_mapping.head()

Unnamed: 0,gene,gene_name,gene_type,gene_id
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457
1,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938
2,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971
3,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036
4,ENSG00000001084.10,GCLC,protein_coding,ENSG00000001084


## Save tissues

In [14]:
# Tissue name = model file name without the 'mashr_' prefix and '.db' extension,
# in the same order as `gtex_models_files`.
tissues_list = [
    model_name.split('mashr_')[1].split('.db')[0]
    for model_name in map(os.path.basename, gtex_models_files)
]

In [15]:
# One row per tissue; the default integer index is labelled `tissue_id`.
tissues_df = pd.DataFrame(tissues_list, columns=['tissue_name']).rename_axis('tissue_id')

In [16]:
tissues_df.shape

(49, 1)

In [17]:
tissues_df.head()

Unnamed: 0_level_0,tissue_name
tissue_id,Unnamed: 1_level_1
0,Skin_Not_Sun_Exposed_Suprapubic
1,Cells_EBV-transformed_lymphocytes
2,Brain_Frontal_Cortex_BA9
3,Kidney_Cortex
4,Brain_Substantia_nigra


In [18]:
os.makedirs(conf.DELIVERABLES_DIR, exist_ok=True)

In [19]:
# Write the tissue table as a tab-separated deliverable. The index is written
# too (pandas default), so the file contains the row index next to tissue_name.
tissues_file = os.path.join(conf.DELIVERABLES_DIR, 'tissues.tsv')
display(tissues_file)

tissues_df.to_csv(tissues_file, sep='\t')

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/deliverables/tissues.tsv'

In [20]:
def _get_col_stats(column_data):
    if column_data.dtype.name == 'object':
        return column_data.apply(lambda x: len(str(x))).max()
    else:
        return (column_data.min(), column_data.max())

    return None

In [21]:
# Per-column summary of the tissues table, keyed by column name.
{
    col_name: _get_col_stats(col_values)
    for col_name, col_values in tissues_df.items()
}

{'tissue_name': 37}

# Add gene bands

In [22]:
biomart_genes = pd.read_csv(conf.BIOMART_GENES_INFO_FILE, index_col='ensembl_gene_id')

In [23]:
def _get_gene_band(gene_id):
    if gene_id not in biomart_genes.index:
        return ''

    gene_data = biomart_genes.loc[gene_id]
    chrom = gene_data['chromosome_name']
    band = gene_data['band']

    return f'{chrom}{band}'

In [24]:
# Add a `band` column by looking up every gene id with `_get_gene_band`.
genes_mapping = genes_mapping.assign(
    band=lambda mapping: mapping['gene_id'].apply(_get_gene_band)
)

In [25]:
genes_mapping.head()

Unnamed: 0,gene,gene_name,gene_type,gene_id,band
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457,1q24.2
1,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938,1p35.3
2,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971,1q31.3
3,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036,6q24.2
4,ENSG00000001084.10,GCLC,protein_coding,ENSG00000001084,6p12.1


In [26]:
os.makedirs(conf.GENES_METADATA_DIR, exist_ok=True)

In [27]:
# for internal use
final_filename = os.path.join(conf.GENES_METADATA_DIR, 'genes_mappings.pkl')
display(final_filename)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/data/genes_metadata/genes_mappings.pkl'

In [28]:
genes_mapping.to_pickle(final_filename)

In [29]:
# for general use
# NOTE(review): `final_filename` is reassigned here after having held the
# pickle path; re-running the earlier `to_pickle` cell after this one would
# write the pickle to this .tsv.gz path instead.
os.makedirs(conf.DELIVERABLES_DIR, exist_ok=True)
final_filename = os.path.join(conf.DELIVERABLES_DIR, 'genes_mappings.tsv.gz')
display(final_filename)

'/home/miltondp/projects/labs/hakyimlab/phenomexcan/base/deliverables/genes_mappings.tsv.gz'

In [30]:
genes_mapping.to_csv(final_filename, sep='\t', index=False)

# Save mappings

In [31]:
import pickle

### Gene id to name

In [32]:
# Dictionary from unversioned gene id to gene name, spot-checked on two genes.
gene_id_to_name = genes_mapping.set_index('gene_id')['gene_name'].to_dict()
assert gene_id_to_name['ENSG00000000457'] == 'SCYL3'
assert gene_id_to_name['ENSG00000001036'] == 'FUCA2'

In [33]:
# Persist the gene id -> gene name dictionary as a pickle in GENES_METADATA_DIR.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'wb') as f:
    pickle.dump(gene_id_to_name, f)

In [34]:
# Read the pickle back and repeat the spot checks to confirm the round trip.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl'), 'rb') as f:
    gene_id_to_name = pickle.load(f)

assert gene_id_to_name['ENSG00000000457'] == 'SCYL3'
assert gene_id_to_name['ENSG00000001036'] == 'FUCA2'

### Gene id to band

In [35]:
# Dictionary from unversioned gene id to chromosome band, spot-checked on three genes.
gene_id_to_band = genes_mapping.set_index('gene_id')['band'].to_dict()
assert gene_id_to_band['ENSG00000000457'] == '1q24.2'
assert gene_id_to_band['ENSG00000000460'] == '1q24.2'
assert gene_id_to_band['ENSG00000001036'] == '6q24.2'

In [36]:
# Persist the gene id -> band dictionary as a pickle in GENES_METADATA_DIR.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_id_to_band.pkl'), 'wb') as f:
    pickle.dump(gene_id_to_band, f)

In [37]:
# Read the pickle back and repeat the spot checks to confirm the round trip.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_id_to_band.pkl'), 'rb') as f:
    gene_id_to_band = pickle.load(f)

assert gene_id_to_band['ENSG00000000457'] == '1q24.2'
assert gene_id_to_band['ENSG00000000460'] == '1q24.2'
assert gene_id_to_band['ENSG00000001036'] == '6q24.2'

### Gene name to id

In [38]:
# Dictionary from gene name to unversioned gene id.
# NOTE(review): gene names are not unique in `genes_mapping` (a handful of names
# are shared by two gene ids), so only one id per such name survives here.
gene_name_to_id = genes_mapping.set_index('gene_name')['gene_id'].to_dict()
assert gene_name_to_id['SCYL3'] == 'ENSG00000000457'
assert gene_name_to_id['C1orf112'] == 'ENSG00000000460'

In [39]:
# Persist the gene name -> gene id dictionary as a pickle in GENES_METADATA_DIR.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'wb') as f:
    pickle.dump(gene_name_to_id, f)

In [40]:
# Read the pickle back and repeat the spot checks to confirm the round trip.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl'), 'rb') as f:
    gene_name_to_id = pickle.load(f)

assert gene_name_to_id['SCYL3'] == 'ENSG00000000457'
assert gene_name_to_id['C1orf112'] == 'ENSG00000000460'

### Gene name to band

In [41]:
# Dictionary from gene name to chromosome band.
# NOTE(review): gene names are not unique in `genes_mapping`, so for a name
# shared by two gene ids only one band is kept.
gene_name_to_band = genes_mapping.set_index('gene_name')['band'].to_dict()
assert gene_name_to_band['SCYL3'] == '1q24.2'
assert gene_name_to_band['C1orf112'] == '1q24.2'
assert gene_name_to_band['FUCA2'] == '6q24.2'

In [42]:
# Persist the gene name -> band dictionary as a pickle in GENES_METADATA_DIR.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_name_to_band.pkl'), 'wb') as f:
    pickle.dump(gene_name_to_band, f)

In [43]:
# Read the pickle back and repeat the spot checks to confirm the round trip.
with open(os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_name_to_band.pkl'), 'rb') as f:
    gene_name_to_band = pickle.load(f)

assert gene_name_to_band['SCYL3'] == '1q24.2'
assert gene_name_to_band['C1orf112'] == '1q24.2'
assert gene_name_to_band['FUCA2'] == '6q24.2'

# Deprecated

# Remove some genes with duplicated gene names

For each duplicated gene name, keep only the Ensembl ID reported in GeneCards.

In [44]:
# gene names are not unique: a few names are shared by several gene ids
genes_mapping['gene_name'].is_unique

False

In [45]:
# gene names occurring more than once, with their number of occurrences
tmp = genes_mapping['gene_name'].value_counts().loc[lambda counts: counts > 1]
display(tmp)

LINC01115    2
LINC00484    2
LYNX1        2
MAL2         2
SPATA13      2
GOLGA8M      2
LINC01422    2
LINC01297    2
Name: gene_name, dtype: int64

In [46]:
# taken from GeneCards
# Maps each duplicated gene name to the single Ensembl gene id to keep for it.
ens_ids_to_keep_map = {
    'LINC01115': 'ENSG00000237667',
    'MAL2': 'ENSG00000147676',
    'LYNX1': 'ENSG00000180155',
    'LINC01422': 'ENSG00000223704',
    'SPATA13': 'ENSG00000182957',
    'GOLGA8M': 'ENSG00000188626',
    'LINC01297': 'ENSG00000274827',
    'LINC00484': 'ENSG00000235641',
}

In [47]:
ens_ids_to_keep = list(ens_ids_to_keep_map.values())

In [48]:
genes_mapping[genes_mapping['gene_name'].isin(tmp.index)].sort_values('gene_name')

Unnamed: 0,gene,gene_name,gene_type,gene_id,band
11492,ENSG00000188626.6,GOLGA8M,protein_coding,ENSG00000188626,15q13.1
329628,ENSG00000261480.1,GOLGA8M,lincRNA,ENSG00000261480,15q13.1
26074,ENSG00000235641.4,LINC00484,lincRNA,ENSG00000235641,9q22.2
39454,ENSG00000229694.6,LINC00484,lincRNA,ENSG00000229694,9q22.2
13264,ENSG00000237667.5,LINC01115,lincRNA,ENSG00000237667,2p25.3
148765,ENSG00000272342.1,LINC01115,lincRNA,ENSG00000272342,2p25.3
229974,ENSG00000225255.6,LINC01297,lincRNA,ENSG00000225255,22q11.1
231948,ENSG00000274827.4,LINC01297,lincRNA,ENSG00000274827,14q11.2
12785,ENSG00000223704.1,LINC01422,lincRNA,ENSG00000223704,22q12.1
13187,ENSG00000235271.5,LINC01422,lincRNA,ENSG00000235271,22q12.1


In [49]:
# Rows to discard: duplicated gene names whose gene id is not the one chosen
# from GeneCards. Exactly one row per kept id is expected to be removed.
to_remove = genes_mapping.loc[
    genes_mapping['gene_name'].isin(tmp.index)
    & ~genes_mapping['gene_id'].isin(ens_ids_to_keep)
]
assert len(to_remove) == len(ens_ids_to_keep)

In [50]:
to_remove

Unnamed: 0,gene,gene_name,gene_type,gene_id,band
13187,ENSG00000235271.5,LINC01422,lincRNA,ENSG00000235271,22q12.1
13687,ENSG00000253972.5,MAL2,lincRNA,ENSG00000253972,8q24.12
14927,ENSG00000283992.1,LYNX1,protein_coding,ENSG00000283992,8q24.3
39454,ENSG00000229694.6,LINC00484,lincRNA,ENSG00000229694,9q22.2
51058,ENSG00000228741.2,SPATA13,lincRNA,ENSG00000228741,13q12.12
148765,ENSG00000272342.1,LINC01115,lincRNA,ENSG00000272342,2p25.3
229974,ENSG00000225255.6,LINC01297,lincRNA,ENSG00000225255,22q11.1
329628,ENSG00000261480.1,GOLGA8M,lincRNA,ENSG00000261480,15q13.1


In [51]:
# Drop the discarded rows and verify that the row count shrank by exactly that many.
n_before = len(genes_mapping)
display(n_before)
genes_mapping = genes_mapping.drop(index=to_remove.index)
display(genes_mapping.shape)
assert len(genes_mapping) == n_before - len(to_remove)

22535

(22527, 5)