# `Gene`: ensembl, release-110

- https://www.ensembl.org/info/data/mysql.html
- https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as sql  # needed
import bionty as bt

version = "release-110"
species = bt.Species(version=version).lookup()

✅ New records found in the public sources.yaml, updated /Users/sunnysun/.lamin/bionty/versions/sources_local.yaml!


In [2]:
def get_url(species: bt.Species):
    return f"mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{species.core_db}"

In [3]:
query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('EntrezGene')
"""

In [17]:
def generate_genes_df(species: bt.Species, version="release-110"):
    engine = create_engine(url=get_url(species=species))

    # Query for the basic gene annotations:
    results_core = pd.read_sql(query_core, con=engine)
    print(f"result_core.shape: {results_core.shape}")
    print(f"result_core.head():\n")
    display(results_core.head())

    # aggregate metadata based on ensembl stable_id
    results_core_group = results_core.groupby("stable_id").agg(
        {
            "display_label": "first",
            "biotype": "first",
            "description": "first",
            "synonym": lambda x: "|".join([i for i in set(x) if i is not None]),
        }
    )
    print(f"results_core_group.head():\n")
    display(results_core_group.head())

    # Query for external ids:
    results_external = pd.read_sql(query_external, con=engine)
    results_external = results_external[
        results_external.stable_id.str.startswith("ENS")
    ]
    print(f"results_external.shape: {results_external.shape}")
    print(f"results_external.head():\n")
    display(results_external.head())

    # ncbi_gene_id
    entrez = (
        results_external[results_external["db_name"] == "EntrezGene"]
        .drop_duplicates(["stable_id", "dbprimary_acc"])
        .drop(columns=["xref_id", "db_name"])
    )
    entrez.rename(columns={"dbprimary_acc": "ncbi_gene_id"}, inplace=True)
    entrez = entrez.set_index("stable_id")
    dup = entrez[entrez.index.duplicated(keep=False)]
    print(f"duplicated ensembl_gene_ids with ncbi_gene_ids: {dup.shape[0]}\n")
    display(dup.head())

    # merge with ncbi_gene_id
    df = results_core_group.merge(
        entrez, left_index=True, right_index=True, how="outer"
    )
    df = df.reset_index()
    df.rename(
        columns={
            "stable_id": "ensembl_gene_id",
            "display_label": "symbol",
            "synonym": "synonyms",
        },
        inplace=True,
    )
    df = df[
        [
            "ensembl_gene_id",
            "symbol",
            "ncbi_gene_id",
            "biotype",
            "description",
            "synonyms",
        ]
    ]
    df = df[~df["ensembl_gene_id"].isnull()]
    df = df.sort_values("ensembl_gene_id").reset_index(drop=True)
    print(f"Final df.shape: {df.shape}")
    print(f"df.head():\n")
    display(df.head())

    # save to parquet
    filename = f"df_{species.name}__ensembl__{version}__Gene.parquet"
    df.to_parquet(filename)
    print(f"Saved as {filename}")

## Human

In [5]:
generate_genes_df(species=species.human)

result_core.shape: (113336, 5)
result_core.head():



Unnamed: 0,stable_id,display_label,biotype,description,synonym
0,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,MTTF
1,ENSG00000210049,MT-TF,Mt_tRNA,mitochondrially encoded tRNA-Phe (UUU/C) [Sour...,TRNF
2,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,12S
3,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,MOTS-C
4,ENSG00000211459,MT-RNR1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:HGNC ...,MTRNR1


results_core_group.head():



Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,TSPAN6,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|T245|TSPAN-6
ENSG00000000005,TNMD,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],TEM|MYODULIN|CHM1L|TENDIN|BRICD4
ENSG00000000419,DPM1,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
ENSG00000000457,SCYL3,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE-1|PACE1
ENSG00000000460,C1orf112,protein_coding,chromosome 1 open reading frame 112 [Source:HG...,FLJ10706|APOLO1|FLIP


results_external.shape: (36004, 4)
results_external.head():



Unnamed: 0,stable_id,xref_id,dbprimary_acc,db_name
0,ENSG00000198888,554032,4535,EntrezGene
1,ENSG00000198763,554045,4536,EntrezGene
2,ENSG00000198804,553814,4512,EntrezGene
3,ENSG00000210151,1138145,113219467,EntrezGene
4,ENSG00000198712,553829,4513,EntrezGene


duplicated ensembl_gene_ids with ncbi_gene_ids: 6158



Unnamed: 0_level_0,ncbi_gene_id
stable_id,Unnamed: 1_level_1
ENSG00000278294,124907156
ENSG00000278294,124907485
ENSG00000278294,124908250
ENSG00000274917,100008587
ENSG00000274917,124907114


Final df.shape: (77043, 6)
df.head():



Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSG00000000003,TSPAN6,7105,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|T245|TSPAN-6
1,ENSG00000000005,TNMD,64102,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],TEM|MYODULIN|CHM1L|TENDIN|BRICD4
2,ENSG00000000419,DPM1,8813,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
3,ENSG00000000457,SCYL3,57147,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE-1|PACE1
4,ENSG00000000460,C1orf112,55732,protein_coding,chromosome 1 open reading frame 112 [Source:HG...,FLJ10706|APOLO1|FLIP


Saved as df_human__ensembl__release-110__Gene.parquet


## Mouse

In [6]:
generate_genes_df(species=species.mouse)

result_core.shape: (84751, 5)
result_core.head():



Unnamed: 0,stable_id,display_label,biotype,description,synonym
0,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,tRNA
1,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,tRNA-Phe
2,ENSMUSG00000064336,mt-Tf,Mt_tRNA,mitochondrially encoded tRNA phenylalanine [So...,TrnF tRNA
3,ENSMUSG00000064337,mt-Rnr1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,12S ribosomal RNA
4,ENSMUSG00000064337,mt-Rnr1,Mt_rRNA,mitochondrially encoded 12S rRNA [Source:MGI S...,12S rRNA


results_core_group.head():



Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000000001,Gnai3,protein_coding,guanine nucleotide binding protein (G protein)...,Galphai3
ENSMUSG00000000003,Pbsn,protein_coding,probasin [Source:MGI Symbol;Acc:MGI:1860484],PB
ENSMUSG00000000028,Cdc45,protein_coding,cell division cycle 45 [Source:MGI Symbol;Acc:...,Cdc45l
ENSMUSG00000000031,H19,lncRNA,"H19, imprinted maternally expressed transcript...",
ENSMUSG00000000037,Scml2,protein_coding,Scm polycomb group protein like 2 [Source:MGI ...,4932420G07Rik


results_external.shape: (27747, 4)
results_external.head():



Unnamed: 0,stable_id,xref_id,dbprimary_acc,db_name
0,ENSMUSG00000064341,344016,17716,EntrezGene
1,ENSMUSG00000064345,344027,17717,EntrezGene
2,ENSMUSG00000064351,343950,17708,EntrezGene
3,ENSMUSG00000064354,343957,17709,EntrezGene
4,ENSMUSG00000064356,343940,17706,EntrezGene


duplicated ensembl_gene_ids with ncbi_gene_ids: 554



Unnamed: 0_level_0,ncbi_gene_id
stable_id,Unnamed: 1_level_1
ENSMUSG00000094383,108168683
ENSMUSG00000094383,108168684
ENSMUSG00000094383,108169098
ENSMUSG00000094383,108169101
ENSMUSG00000095634,100039810


Final df.shape: (57283, 6)
df.head():



Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSMUSG00000000001,Gnai3,14679,protein_coding,guanine nucleotide binding protein (G protein)...,Galphai3
1,ENSMUSG00000000003,Pbsn,54192,protein_coding,probasin [Source:MGI Symbol;Acc:MGI:1860484],PB
2,ENSMUSG00000000028,Cdc45,12544,protein_coding,cell division cycle 45 [Source:MGI Symbol;Acc:...,Cdc45l
3,ENSMUSG00000000031,H19,14955,lncRNA,"H19, imprinted maternally expressed transcript...",
4,ENSMUSG00000000037,Scml2,107815,protein_coding,Scm polycomb group protein like 2 [Source:MGI ...,4932420G07Rik


Saved as df_mouse__ensembl__release-110__Gene.parquet


## saccharomyces_cerevisiae

In [18]:
generate_genes_df(species=species.saccharomyces_cerevisiae)

result_core.shape: (8552, 5)
result_core.head():



Unnamed: 0,stable_id,display_label,biotype,description,synonym
0,YBR024W,SCO2,protein_coding,Protein anchored to mitochondrial inner membra...,
1,YDL245C,HXT15,protein_coding,Putative transmembrane polyol transporter; sup...,
2,YBR232C,,protein_coding,Dubious open reading frame; unlikely to encode...,
3,YDR320W-B,,protein_coding,Dubious open reading frame; unlikely to encode...,
4,YBR021W,FUR4,protein_coding,Plasma membrane localized uracil permease; exp...,


results_core_group.head():



Unnamed: 0_level_0,display_label,biotype,description,synonym
stable_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ETS1-1,,rRNA,Non-coding region located immediately upstream...,
ETS1-2,,rRNA,Non-coding region located immediately upstream...,
ETS2-1,,rRNA,Non-coding region located adjacent to and down...,
ETS2-2,,rRNA,Non-coding region located adjacent to RDN25; t...,
HRA1,,ncRNA,"Non-protein-coding RNA; substrate of RNase P, ...",


results_external.shape: (0, 4)
results_external.head():



Unnamed: 0,stable_id,xref_id,dbprimary_acc,db_name


duplicated ensembl_gene_ids with ncbi_gene_ids: 0



Unnamed: 0_level_0,ncbi_gene_id
stable_id,Unnamed: 1_level_1


Final df.shape: (7127, 6)
df.head():



Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ETS1-1,,,rRNA,Non-coding region located immediately upstream...,
1,ETS1-2,,,rRNA,Non-coding region located immediately upstream...,
2,ETS2-1,,,rRNA,Non-coding region located adjacent to and down...,
3,ETS2-2,,,rRNA,Non-coding region located adjacent to RDN25; t...,
4,HRA1,,,ncRNA,"Non-protein-coding RNA; substrate of RNase P, ...",


Saved as df_saccharomyces cerevisiae__ensembl__release-110__Gene.parquet
