In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from glob import glob
import sqlite3

import numpy as np
import pandas as pd

import settings as conf

# Load all genes from models

In [3]:
# collect the path of every `*.db` model file found in the GTEx models directory
models_pattern = os.path.join(conf.GTEX_MODELS_DIR, '*.db')
gtex_models_files = glob(models_pattern)
gtex_models_files[:3]

['/mnt/phenomexcan_base/data/gtex_v8/mashr/mashr_Thyroid.db',
 '/mnt/phenomexcan_base/data/gtex_v8/mashr/mashr_Skin_Not_Sun_Exposed_Suprapubic.db',
 '/mnt/phenomexcan_base/data/gtex_v8/mashr/mashr_Colon_Transverse.db']

In [4]:
# GTEx v8 models: exactly 49 model files are expected
n_model_files = len(gtex_models_files)
display(n_model_files)
assert n_model_files == 49

49

In [5]:
# get gene id, gene name and gene type from sqlite files

all_models = []
all_models_size = []

for model_file in gtex_models_files:
    # sqlite3 connections are not closed automatically (used as a context
    # manager they only commit/rollback), so close each one explicitly to
    # avoid leaving one open handle per model file in the kernel
    cnx = sqlite3.connect(model_file)
    try:
        df = pd.read_sql_query("SELECT gene, genename as gene_name, gene_type FROM extra", cnx)
    finally:
        cnx.close()

    # gene_id is the `gene` value without its version suffix,
    # e.g. ENSG00000000457.13 -> ENSG00000000457
    df = df.assign(gene_id=df['gene'].str.split('.', n=1).str[0])

    all_models_size.append(df.shape[0])
    all_models.append(df)

In [6]:
# stack the tables read from every model file into one frame, renumbering rows from 0
genes_mapping = pd.concat(objs=all_models, ignore_index=True)

In [7]:
genes_mapping.shape

(686241, 4)

In [8]:
genes_mapping.head(3)

Unnamed: 0,gene,gene_name,gene_type,gene_id
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938


In [9]:
# drop rows that are exact duplicates across all columns, keeping the first occurrence
genes_mapping = genes_mapping.drop_duplicates(keep='first')

In [10]:
genes_mapping.shape

(22535, 4)

In [11]:
# every versioned gene identifier must appear in exactly one row
assert genes_mapping['gene'].is_unique

In [12]:
# every unversioned gene identifier must appear in exactly one row
assert genes_mapping['gene_id'].is_unique

In [13]:
genes_mapping.head()

Unnamed: 0,gene,gene_name,gene_type,gene_id
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938
3,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971
4,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036


# Add gene bands

In [14]:
# index the table by Ensembl gene id so each gene can be looked up with .loc
biomart_genes = pd.read_csv(
    conf.BIOMART_GENES_INFO_FILE,
    index_col='ensembl_gene_id',
)

In [15]:
def _get_gene_band(gene_id):
    """Return the chromosome band label of a gene (e.g. ``'1q24.2'``).

    The label is the ``chromosome_name`` value immediately followed by the
    ``band`` value stored for ``gene_id`` in the notebook-level
    ``biomart_genes`` table.

    Args:
        gene_id: label looked up in the index of ``biomart_genes``.

    Returns:
        The concatenated ``chromosome_name`` and ``band`` as a string, or an
        empty string when ``gene_id`` is not in the index.
    """
    if gene_id in biomart_genes.index:
        gene_row = biomart_genes.loc[gene_id]
        return f"{gene_row['chromosome_name']}{gene_row['band']}"

    # genes without a record in the table get an empty label
    return ''

In [16]:
# look up the band label of every gene id and store it in a new `band` column
gene_bands = genes_mapping['gene_id'].apply(_get_gene_band)
genes_mapping = genes_mapping.assign(band=gene_bands)

In [17]:
genes_mapping.head()

Unnamed: 0,gene,gene_name,gene_type,gene_id,band
0,ENSG00000000457.13,SCYL3,protein_coding,ENSG00000000457,1q24.2
1,ENSG00000000460.16,C1orf112,protein_coding,ENSG00000000460,1q24.2
2,ENSG00000000938.12,FGR,protein_coding,ENSG00000000938,1p35.3
3,ENSG00000000971.15,CFH,protein_coding,ENSG00000000971,1q31.3
4,ENSG00000001036.13,FUCA2,protein_coding,ENSG00000001036,6q24.2


In [18]:
os.makedirs(conf.GENES_METADATA_DIR, exist_ok=True)

In [19]:
# for internal use
final_filename = os.path.join(conf.GENES_METADATA_DIR, 'genes_mappings.pkl')
display(final_filename)

'/mnt/phenomexcan_base/data/genes_metadata/genes_mappings.pkl'

In [20]:
genes_mapping.to_pickle(final_filename)

In [21]:
# for general use
os.makedirs(conf.DELIVERABLES_DIR, exist_ok=True)
final_filename = os.path.join(conf.DELIVERABLES_DIR, 'genes_mappings.tsv.gz')
display(final_filename)

'/mnt/phenomexcan_base/deliverables/genes_mappings.tsv.gz'

In [22]:
# tab-separated export without the index column; with pandas' default
# compression='infer' the `.gz` suffix of the file name makes the output gzipped
genes_mapping.to_csv(
    path_or_buf=final_filename,
    sep='\t',
    index=False,
)

# Save mappings

In [23]:
import pickle

### Gene id to name

In [24]:
# dictionary: unversioned gene id -> gene name
gene_id_to_name = genes_mapping.set_index('gene_id')['gene_name'].to_dict()

# spot checks against known genes
assert gene_id_to_name['ENSG00000000457'] == 'SCYL3'
assert gene_id_to_name['ENSG00000001036'] == 'FUCA2'

In [25]:
# persist the gene id -> gene name dictionary
output_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl')
with open(output_path, 'wb') as out_handle:
    pickle.dump(gene_id_to_name, out_handle)

In [26]:
# reload the pickle written by the previous cell and repeat the spot checks
input_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-0.pkl')
with open(input_path, 'rb') as in_handle:
    gene_id_to_name = pickle.load(in_handle)

assert gene_id_to_name['ENSG00000000457'] == 'SCYL3'
assert gene_id_to_name['ENSG00000001036'] == 'FUCA2'

### Gene id to band

In [27]:
# dictionary: unversioned gene id -> band label
gene_id_to_band = genes_mapping.set_index('gene_id')['band'].to_dict()

# spot checks against known genes
assert gene_id_to_band['ENSG00000000457'] == '1q24.2'
assert gene_id_to_band['ENSG00000000460'] == '1q24.2'
assert gene_id_to_band['ENSG00000001036'] == '6q24.2'

In [28]:
# persist the gene id -> band dictionary
output_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_id_to_band.pkl')
with open(output_path, 'wb') as out_handle:
    pickle.dump(gene_id_to_band, out_handle)

In [29]:
# reload the pickle written by the previous cell and repeat the spot checks
input_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_id_to_band.pkl')
with open(input_path, 'rb') as in_handle:
    gene_id_to_band = pickle.load(in_handle)

assert gene_id_to_band['ENSG00000000457'] == '1q24.2'
assert gene_id_to_band['ENSG00000000460'] == '1q24.2'
assert gene_id_to_band['ENSG00000001036'] == '6q24.2'

### Gene name to id

In [30]:
# dictionary: gene name -> unversioned gene id
# NOTE(review): gene names are not unique in this table (the deprecated
# section at the end of the notebook lists names with two gene ids each), so
# the dictionary keeps a single id for each of those names - confirm this is
# the intended behaviour
gene_name_to_id = genes_mapping.set_index('gene_name')['gene_id'].to_dict()

# spot checks against known genes
assert gene_name_to_id['SCYL3'] == 'ENSG00000000457'
assert gene_name_to_id['C1orf112'] == 'ENSG00000000460'

In [31]:
# persist the gene name -> gene id dictionary
output_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl')
with open(output_path, 'wb') as out_handle:
    pickle.dump(gene_name_to_id, out_handle)

In [32]:
# reload the pickle written by the previous cell and repeat the spot checks
input_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_simplified-1.pkl')
with open(input_path, 'rb') as in_handle:
    gene_name_to_id = pickle.load(in_handle)

assert gene_name_to_id['SCYL3'] == 'ENSG00000000457'
assert gene_name_to_id['C1orf112'] == 'ENSG00000000460'

### Gene name to band

In [33]:
# dictionary: gene name -> band label
# NOTE(review): gene names are not unique in this table (the deprecated
# section at the end of the notebook lists names with two gene ids each), so
# the dictionary keeps a single band for each of those names - confirm this is
# the intended behaviour
gene_name_to_band = genes_mapping.set_index('gene_name')['band'].to_dict()

# spot checks against known genes
assert gene_name_to_band['SCYL3'] == '1q24.2'
assert gene_name_to_band['C1orf112'] == '1q24.2'
assert gene_name_to_band['FUCA2'] == '6q24.2'

In [34]:
# persist the gene name -> band dictionary
output_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_name_to_band.pkl')
with open(output_path, 'wb') as out_handle:
    pickle.dump(gene_name_to_band, out_handle)

In [35]:
# reload the pickle written by the previous cell and repeat the spot checks
input_path = os.path.join(conf.GENES_METADATA_DIR, 'genes_mapping_name_to_band.pkl')
with open(input_path, 'rb') as in_handle:
    gene_name_to_band = pickle.load(in_handle)

assert gene_name_to_band['SCYL3'] == '1q24.2'
assert gene_name_to_band['C1orf112'] == '1q24.2'
assert gene_name_to_band['FUCA2'] == '6q24.2'

# Deprecated

# Remove some genes with duplicated gene IDs

For gene names that map to several Ensembl gene IDs, keep only the ID reported in GeneCards.

In [14]:
# some gene names are duplicated (several gene ids per gene name),
# so this evaluates to False
genes_mapping['gene_name'].is_unique

False

In [15]:
# count the rows per gene name and keep only the names that occur more than once
name_counts = genes_mapping['gene_name'].value_counts()
tmp = name_counts.loc[name_counts > 1]
display(tmp)

LINC01422    2
LYNX1        2
MAL2         2
GOLGA8M      2
SPATA13      2
LINC00484    2
LINC01115    2
LINC01297    2
Name: gene_name, dtype: int64

In [16]:
# taken from GeneCards: gene name -> the single gene id to keep for that name
ens_ids_to_keep_map = dict(
    LINC01115='ENSG00000237667',
    MAL2='ENSG00000147676',
    LYNX1='ENSG00000180155',
    LINC01422='ENSG00000223704',
    SPATA13='ENSG00000182957',
    GOLGA8M='ENSG00000188626',
    LINC01297='ENSG00000274827',
    LINC00484='ENSG00000235641',
)

In [17]:
ens_ids_to_keep = list(ens_ids_to_keep_map.values())

In [18]:
genes_mapping[genes_mapping['gene_name'].isin(tmp.index)].sort_values('gene_name')

Unnamed: 0,gene,gene_name,gene_type,gene_id
11637,ENSG00000188626.6,GOLGA8M,protein_coding,ENSG00000188626
14287,ENSG00000261480.1,GOLGA8M,lincRNA,ENSG00000261480
13159,ENSG00000229694.6,LINC00484,lincRNA,ENSG00000229694
13377,ENSG00000235641.4,LINC00484,lincRNA,ENSG00000235641
13459,ENSG00000237667.5,LINC01115,lincRNA,ENSG00000237667
70900,ENSG00000272342.1,LINC01115,lincRNA,ENSG00000272342
114612,ENSG00000225255.6,LINC01297,lincRNA,ENSG00000225255
118237,ENSG00000274827.4,LINC01297,lincRNA,ENSG00000274827
12932,ENSG00000223704.1,LINC01422,lincRNA,ENSG00000223704
13366,ENSG00000235271.5,LINC01422,lincRNA,ENSG00000235271


In [19]:
# keep only those from GeneCards: select the rows whose gene name is
# duplicated and whose gene id is not the one chosen for that name
has_duplicated_name = genes_mapping['gene_name'].isin(tmp.index)
has_kept_id = genes_mapping['gene_id'].isin(ens_ids_to_keep)
to_remove = genes_mapping[has_duplicated_name & ~has_kept_id]

# one row is expected to be removed for each id that is kept
assert len(ens_ids_to_keep) == to_remove.shape[0]

In [20]:
to_remove

Unnamed: 0,gene,gene_name,gene_type,gene_id
13133,ENSG00000228741.2,SPATA13,lincRNA,ENSG00000228741
13159,ENSG00000229694.6,LINC00484,lincRNA,ENSG00000229694
13366,ENSG00000235271.5,LINC01422,lincRNA,ENSG00000235271
14287,ENSG00000261480.1,GOLGA8M,lincRNA,ENSG00000261480
15295,ENSG00000283992.1,LYNX1,protein_coding,ENSG00000283992
28990,ENSG00000253972.5,MAL2,lincRNA,ENSG00000253972
70900,ENSG00000272342.1,LINC01115,lincRNA,ENSG00000272342
114612,ENSG00000225255.6,LINC01297,lincRNA,ENSG00000225255


In [21]:
# drop the selected rows by index label and verify the row count decreased
# by exactly the number of rows removed
n_before = genes_mapping.shape[0]
display(n_before)

genes_mapping = genes_mapping.drop(index=to_remove.index)
display(genes_mapping.shape)

n_removed = len(to_remove.index)
assert genes_mapping.shape[0] == (n_before - n_removed)

22535

(22527, 4)