# `Gene`: ensembl, release-109

- https://www.ensembl.org/info/data/mysql.html
- https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as sql  # needed

In [2]:
def get_url(db="homo_sapiens_core_109_38"):
    return f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{db}"

## Human

In [3]:
engine = create_engine(url=get_url())

### Queries

In [4]:
query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('HGNC', 'EntrezGene')
"""

Query for the basic gene annotations:

In [5]:
results_core = pd.read_sql(query_core, con=engine)
results_core.shape

(111838, 5)

In [6]:
results_core.head()

Unnamed: 0,stable_id,display_label,biotype,description,synonym
0,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,MTTF
1,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,trnF
2,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,12S
3,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,MOTS-c
4,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,MTRNR1


In [7]:
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join([i for i in set(x) if i is not None]),
    }
)

In [8]:
results_core_group.head()

Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,TSPAN6,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|T245|TSPAN-6
ENSG00000000005,TNMD,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],TEM|myodulin|tendin|BRICD4|ChM1L
ENSG00000000419,DPM1,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
ENSG00000000457,SCYL3,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE1|PACE-1
ENSG00000000460,C1orf112,protein_coding,chromosome 1 open reading frame 112 [Source:HG...,FLJ10706


In [9]:
results_core_group = results_core_group[results_core_group.index.str.startswith("ENS")]

In [10]:
results_core_group.shape

(69299, 4)

Query for external ids:

In [11]:
results_external = pd.read_sql(query_external, con=engine)
results_external = results_external[results_external.stable_id.str.startswith("ENS")]
results_external.shape

(80857, 4)

In [12]:
results_external.head(2)

Unnamed: 0,stable_id,xref_id,dbprimary_acc,db_name
0,ENSG00000210049,2898423,HGNC:7481,HGNC
1,ENSG00000211459,2898394,HGNC:7470,HGNC


### HGNC

In [13]:
hgnc = (
    results_external[results_external["db_name"] == "HGNC"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
hgnc.rename(columns={"dbprimary_acc": "hgnc_id"}, inplace=True)

In [14]:
dup = hgnc[hgnc.stable_id.duplicated(keep=False)]
dup

Unnamed: 0,stable_id,hgnc_id
6193,ENSG00000277796,HGNC:10628
6195,ENSG00000277796,HGNC:30554
6366,ENSG00000277768,HGNC:10628
6368,ENSG00000277768,HGNC:30554
9921,ENSG00000277336,HGNC:10628
9923,ENSG00000277336,HGNC:30554
12519,ENSG00000288487,HGNC:16346
12520,ENSG00000288487,HGNC:6335
17707,ENSG00000230417,HGNC:31430
17708,ENSG00000230417,HGNC:45111


In [15]:
# resolved by searching on https://www.genenames.org

cond1 = (hgnc.stable_id == "ENSG00000277796") & (hgnc.hgnc_id == "HGNC:10628")
cond2 = (hgnc.stable_id == "ENSG00000276085") & (hgnc.hgnc_id == "HGNC:30554")
# ENSG00000230417 is mapped to both HGNC:31430 and HGNC:45111
cond3 = hgnc.stable_id == "ENSG00000230417"

hgnc = hgnc[~(cond1 | cond2 | cond3)].set_index("stable_id")

In [16]:
hgnc[hgnc.index.duplicated()]

Unnamed: 0_level_0,hgnc_id
stable_id,Unnamed: 1_level_1
ENSG00000277768,HGNC:30554
ENSG00000277336,HGNC:30554
ENSG00000288487,HGNC:6335


### Entrez

In [17]:
entrez = (
    results_external[results_external["db_name"] == "EntrezGene"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
entrez.rename(columns={"dbprimary_acc": "ncbi_gene_id"}, inplace=True)
entrez = entrez.set_index("stable_id")

In [18]:
dup = entrez[entrez.index.duplicated(keep=False)]
dup

Unnamed: 0_level_0,ncbi_gene_id
stable_id,Unnamed: 1_level_1
ENSG00000278294,124907156
ENSG00000278294,124907485
ENSG00000278294,124908250
ENSG00000276779,3805
ENSG00000276779,124900568
...,...
ENSG00000273768,124905574
ENSG00000273768,124905808
ENSG00000273768,124905809
ENSG00000178104,9659


### Merge Ensembl with HGNC and Entrez

In [19]:
results_core_group

Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,TSPAN6,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|T245|TSPAN-6
ENSG00000000005,TNMD,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],TEM|myodulin|tendin|BRICD4|ChM1L
ENSG00000000419,DPM1,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
ENSG00000000457,SCYL3,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE1|PACE-1
ENSG00000000460,C1orf112,protein_coding,chromosome 1 open reading frame 112 [Source:HG...,FLJ10706
...,...,...,...,...
ENSG00000291313,,protein_coding,novel protein,
ENSG00000291314,,protein_coding,novel protein,
ENSG00000291315,,protein_coding,novel protein,
ENSG00000291316,,protein_coding,"novel protein, LOC84773-CYHR1 readthrough",


In [20]:
df = results_core_group.merge(hgnc, left_index=True, right_index=True, how="outer")
df[df.index.duplicated()]

Unnamed: 0_level_0,display_label,biotype,description,synonym,hgnc_id
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000277336,CCL3L3,protein_coding,C-C motif chemokine ligand 3 like 3 [Source:HG...,MGC12815,HGNC:30554
ENSG00000277768,CCL3L3,protein_coding,C-C motif chemokine ligand 3 like 3 [Source:HG...,MGC12815,HGNC:30554
ENSG00000288487,KIR2DS3,protein_coding,"killer cell immunoglobulin like receptor, two ...",nkat7,HGNC:6335


In [None]:
df = df.merge(entrez, left_index=True, right_index=True, how="outer")

In [22]:
df = df.reset_index()
df.rename(
    columns={"stable_id": "ensembl_gene_id", "display_label": "symbol"}, inplace=True
)
df = df[
    [
        "ensembl_gene_id",
        "symbol",
        "ncbi_gene_id",
        "hgnc_id",
        "biotype",
        "description",
        "synonym",
    ]
]

In [50]:
df.head()

Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,hgnc_id,biotype,description,synonym
0,ENSG00000000003,TSPAN6,7105,HGNC:11858,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|T245|TSPAN-6
1,ENSG00000000005,TNMD,64102,HGNC:17757,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],TEM|myodulin|tendin|BRICD4|ChM1L
2,ENSG00000000419,DPM1,8813,HGNC:3005,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
3,ENSG00000000457,SCYL3,57147,HGNC:19285,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE1|PACE-1
4,ENSG00000000460,C1orf112,55732,HGNC:25565,protein_coding,chromosome 1 open reading frame 112 [Source:HG...,FLJ10706


In [51]:
df.shape

(75124, 7)

In [52]:
df.to_parquet("human_ensembl_release-109_Gene_lookup.parquet")

Uploaded to: s3://bionty-assets/human_ensembl_release-109_Gene_lookup.parquet

## Mouse

In [25]:
engine = create_engine(url=get_url("mus_musculus_core_109_39"))

### Queries

In [26]:
query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('MGI')
"""

Query for the basic gene annotations:

In [27]:
results_core = pd.read_sql(query_core, con=engine)
results_core.shape

(84720, 5)

In [28]:
results_core.head()

Unnamed: 0,stable_id,display_label,biotype,description,synonym
0,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,tRNA
1,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,tRNA-Phe
2,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,TrnF tRNA
3,ENSMUSG00000064337,mt-Rnr1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,12S ribosomal RNA
4,ENSMUSG00000064337,mt-Rnr1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,12S rRNA


In [29]:
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join([i for i in set(x) if i is not None]),
    }
)

In [30]:
results_core_group.head()

Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000000001,Gnai3,protein_coding,guanine nucleotide binding protein (G protein)...,Galphai3
ENSMUSG00000000003,Pbsn,protein_coding,probasin [Source:MGI Symbol;Acc:MGI:1860484],PB
ENSMUSG00000000028,Cdc45,protein_coding,cell division cycle 45 [Source:MGI Symbol;Acc:...,Cdc45l
ENSMUSG00000000031,H19,lncRNA,"H19, imprinted maternally expressed transcript...",
ENSMUSG00000000037,Scml2,protein_coding,Scm polycomb group protein like 2 [Source:MGI ...,4932420G07Rik


In [31]:
results_core_group = results_core_group[results_core_group.index.str.startswith("ENS")]

In [32]:
results_core_group.shape

(57010, 4)

Query for external ids:

In [33]:
results_external = pd.read_sql(query_external, con=engine)
results_external = results_external[results_external.stable_id.str.startswith("ENS")]
results_external.shape

(55288, 4)

In [34]:
results_external.head(2)

Unnamed: 0,stable_id,xref_id,dbprimary_acc,db_name
0,ENSMUSG00000064336,1630742,MGI:102487,MGI
1,ENSMUSG00000064337,1630726,MGI:102493,MGI


### MGI

In [37]:
mgi = (
    results_external[results_external["db_name"] == "MGI"]
    .drop_duplicates(["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
)
mgi.rename(columns={"dbprimary_acc": "mgi_id"}, inplace=True)
mgi = mgi.set_index("stable_id")

In [38]:
dup = mgi[mgi.index.duplicated(keep=False)]
dup

Unnamed: 0_level_0,mgi_id
stable_id,Unnamed: 1_level_1
ENSMUSG00000115016,MGI:2145569
ENSMUSG00000115016,MGI:5593065
ENSMUSG00000119828,MGI:5455181
ENSMUSG00000119828,MGI:6721448
ENSMUSG00000082414,MGI:3705775
ENSMUSG00000082414,MGI:5434448


### Merge ensembl with MGI

In [39]:
### Merge Ensembl with HGNC and Entrez
df = results_core_group.merge(mgi, left_index=True, right_index=True, how="outer")
df[df.index.duplicated()]

Unnamed: 0_level_0,display_label,biotype,description,synonym,mgi_id
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000082414,Gm13303,unprocessed_pseudogene,predicted gene 13303 [Source:MGI Symbol;Acc:MG...,,MGI:5434448
ENSMUSG00000115016,Gm33906,lncRNA,"predicted gene, 33906 [Source:MGI Symbol;Acc:M...",,MGI:5593065
ENSMUSG00000119828,Gm25404,snRNA,"predicted gene, 25404 [Source:MGI Symbol;Acc:M...",,MGI:6721448


In [46]:
df = df.reset_index()
df.rename(
    columns={"stable_id": "ensembl_gene_id", "display_label": "symbol"}, inplace=True
)
df = df[["ensembl_gene_id", "symbol", "mgi_id", "biotype", "description", "synonym"]]
df.head()

Unnamed: 0,ensembl_gene_id,symbol,mgi_id,biotype,description,synonym
0,ENSMUSG00000000001,Gnai3,MGI:95773,protein_coding,guanine nucleotide binding protein (G protein)...,Galphai3
1,ENSMUSG00000000003,Pbsn,MGI:1860484,protein_coding,probasin [Source:MGI Symbol;Acc:MGI:1860484],PB
2,ENSMUSG00000000028,Cdc45,MGI:1338073,protein_coding,cell division cycle 45 [Source:MGI Symbol;Acc:...,Cdc45l
3,ENSMUSG00000000031,H19,MGI:95891,lncRNA,"H19, imprinted maternally expressed transcript...",
4,ENSMUSG00000000037,Scml2,MGI:1340042,protein_coding,Scm polycomb group protein like 2 [Source:MGI ...,4932420G07Rik


In [41]:
df.shape

(57013, 6)

In [47]:
df.to_parquet("mouse_ensembl_release-109_Gene_lookup.parquet")