# `Gene`: ensembl, release-109

- https://www.ensembl.org/info/data/mysql.html
- https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient (it provides the `mysqldb` driver used in the connection URL below): https://pypi.org/project/mysqlclient/

In [None]:
import pandas as pd
from sqlalchemy import create_engine
import mysql.connector as sql  # needed

In [None]:
def get_url(db="homo_sapiens_core_109_38"):
    """Return the SQLAlchemy URL of a database on the public Ensembl MySQL server.

    The URL uses the ``mysql+mysqldb`` driver, the ``anonymous`` user with an
    empty password, and the ``ensembldb.ensembl.org`` host.

    Args:
        db: Name of the database, appended as the path of the URL.

    Returns:
        The connection URL as a string.
    """
    url_template = "mysql+mysqldb://anonymous:@ensembldb.ensembl.org/{}"
    return url_template.format(db)

## Human

In [None]:
# Create the SQLAlchemy engine for the default database of get_url()
# ("homo_sapiens_core_109_38").
engine = create_engine(url=get_url())

### Queries

In [None]:
# Basic gene annotations: stable id, label of the gene's display xref,
# biotype, description, and the synonyms attached to that display xref.
# The LEFT JOINs keep genes without a display xref or without synonyms;
# a gene with several synonyms yields one row per synonym.
query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

# External identifiers: one row per xref linked to a gene, restricted to
# object_xref entries of type 'Gene' whose external database is HGNC or
# EntrezGene.
query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('HGNC', 'EntrezGene')
"""

Query for the basic gene annotations:

In [None]:
# Run the core annotation query through the engine and show the size of the
# resulting DataFrame as (rows, columns).
results_core = pd.read_sql(query_core, con=engine)
results_core.shape

In [None]:
# Preview the first rows of the raw core query result.
results_core.head()

In [None]:
# Collapse the query result to one row per gene (stable_id). The core query
# returns one row per synonym, so label, biotype and description take their
# "first" value, while the synonyms are de-duplicated and joined with "|".
# The synonyms are sorted so the joined string is identical on every run:
# iterating a bare set of strings gives an order that changes between
# interpreter sessions because of string hash randomization.
# dropna() discards missing synonyms whether they arrive as None or NaN.
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join(sorted(set(x.dropna()))),
    }
)

In [None]:
# Preview the first rows of the per-gene table.
results_core_group.head()

In [None]:
# Keep only the genes whose stable id (the index) starts with "ENS".
results_core_group = results_core_group.loc[
    lambda genes: genes.index.str.startswith("ENS")
]

In [None]:
# Size of the per-gene table after the "ENS" filter, as (rows, columns).
results_core_group.shape

Query for external ids:

In [None]:
# Fetch the external-id rows and retain those whose stable id starts with
# "ENS"; the last line shows the size of the filtered frame.
results_external = pd.read_sql(query_external, con=engine).loc[
    lambda xrefs: xrefs["stable_id"].str.startswith("ENS")
]
results_external.shape

In [None]:
# Preview the first two external-id rows.
results_external.head(2)

### HGNC

In [None]:
# HGNC accessions per gene: select the HGNC rows, collapse repeated
# (stable_id, accession) pairs, drop the helper columns and expose the
# accession under the name `hgnc_id`.
hgnc = (
    results_external.loc[results_external["db_name"] == "HGNC"]
    .drop_duplicates(subset=["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
    .rename(columns={"dbprimary_acc": "hgnc_id"})
)

In [None]:
# List every row whose stable id occurs more than once in `hgnc`
# (keep=False marks all occurrences, not only the repeats).
dup = hgnc[hgnc.stable_id.duplicated(keep=False)]
dup

In [None]:
# resolved by searching on https://www.genenames.org

cond1 = (hgnc.stable_id == "ENSG00000277796") & (hgnc.hgnc_id == "HGNC:10628")
cond2 = (hgnc.stable_id == "ENSG00000276085") & (hgnc.hgnc_id == "HGNC:30554")
# ENSG00000230417 is mapped to both HGNC:31430 and HGNC:45111
cond3 = hgnc.stable_id == "ENSG00000230417"

hgnc = hgnc[~(cond1 | cond2 | cond3)].set_index("stable_id")

In [None]:
# Show rows whose stable id still repeats an earlier index entry.
hgnc[hgnc.index.duplicated()]

### Entrez

In [None]:
# EntrezGene accessions per gene: select the EntrezGene rows, collapse
# repeated (stable_id, accession) pairs, drop the helper columns, expose the
# accession as `ncbi_gene_id` and index the frame by stable id.
entrez = (
    results_external.loc[results_external["db_name"] == "EntrezGene"]
    .drop_duplicates(subset=["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
    .rename(columns={"dbprimary_acc": "ncbi_gene_id"})
    .set_index("stable_id")
)

In [None]:
# List every row whose stable id occurs more than once in `entrez`
# (keep=False marks all occurrences, not only the repeats).
dup = entrez[entrez.index.duplicated(keep=False)]
dup

### Merge Ensembl with HGNC and Entrez

In [None]:
# Display the per-gene table before merging in the external ids.
results_core_group

In [None]:
# Outer-join the HGNC ids onto the per-gene table using the index of both
# frames, then show any row whose index value repeats an earlier one.
df = pd.merge(
    results_core_group,
    hgnc,
    how="outer",
    left_index=True,
    right_index=True,
)
df.loc[df.index.duplicated()]

In [None]:
# Outer-join the EntrezGene ids onto the table, again matching on the index.
df = pd.merge(df, entrez, how="outer", left_index=True, right_index=True)

In [None]:
# Shape the final table in one chain: turn the index into a column, apply the
# output column names, fix the column order, discard rows without an Ensembl
# gene id, and sort by that id with a fresh 0..n-1 index.
df = (
    df.reset_index()
    .rename(
        columns={
            "stable_id": "ensembl_gene_id",
            "display_label": "symbol",
            "synonym": "synonyms",
        }
    )
    .loc[
        :,
        [
            "ensembl_gene_id",
            "symbol",
            "ncbi_gene_id",
            "hgnc_id",
            "biotype",
            "description",
            "synonyms",
        ],
    ]
    .loc[lambda table: table["ensembl_gene_id"].notnull()]
    .sort_values("ensembl_gene_id")
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the final table.
df.head()

In [None]:
# Size of the final table as (rows, columns).
df.shape

In [None]:
# Write the final table to a Parquet file; the path is relative, so the file
# lands in the current working directory.
df.to_parquet("df_human__ensembl__release-109__Gene.parquet")

Uploaded to: s3://bionty-assets/human_ensembl_release-109_Gene_lookup.parquet

## Mouse

In [None]:
# Rebind `engine` to the "mus_musculus_core_109_39" database; the cells below
# reuse the same variable names as the human section.
engine = create_engine(url=get_url("mus_musculus_core_109_39"))

### Queries

In [None]:
# Basic gene annotations: stable id, label of the gene's display xref,
# biotype, description, and the synonyms attached to that display xref.
# The LEFT JOINs keep genes without a display xref or without synonyms;
# a gene with several synonyms yields one row per synonym.
query_core = """
SELECT gene.stable_id, xref.display_label, gene.biotype, gene.description, external_synonym.synonym
FROM gene
LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
"""

# External identifiers: one row per xref linked to a gene, restricted to
# object_xref entries of type 'Gene' whose external database is MGI or
# EntrezGene. Both names are single-quoted string literals: a double-quoted
# token is parsed as an identifier when the MySQL server runs with the
# ANSI_QUOTES SQL mode, which would break the IN (...) list.
query_external = """
SELECT gene.stable_id, object_xref.xref_id, xref.dbprimary_acc, external_db.db_name
FROM gene
LEFT JOIN object_xref ON gene.gene_id = object_xref.ensembl_id
LEFT JOIN xref ON object_xref.xref_id = xref.xref_id
LEFT JOIN external_db ON xref.external_db_id = external_db.external_db_id
WHERE object_xref.ensembl_object_type = 'Gene' AND external_db.db_name IN ('MGI', 'EntrezGene')
"""

Query for the basic gene annotations:

In [None]:
# Run the core annotation query through the engine and show the size of the
# resulting DataFrame as (rows, columns).
results_core = pd.read_sql(query_core, con=engine)
results_core.shape

In [None]:
# Preview the first rows of the raw core query result.
results_core.head()

In [None]:
# Collapse the query result to one row per gene (stable_id). The core query
# returns one row per synonym, so label, biotype and description take their
# "first" value, while the synonyms are de-duplicated and joined with "|".
# The synonyms are sorted so the joined string is identical on every run:
# iterating a bare set of strings gives an order that changes between
# interpreter sessions because of string hash randomization.
# dropna() discards missing synonyms whether they arrive as None or NaN.
results_core_group = results_core.groupby("stable_id").agg(
    {
        "display_label": "first",
        "biotype": "first",
        "description": "first",
        "synonym": lambda x: "|".join(sorted(set(x.dropna()))),
    }
)

In [None]:
# Preview the first rows of the per-gene table.
results_core_group.head()

In [None]:
# Keep only the genes whose stable id (the index) starts with "ENS".
results_core_group = results_core_group.loc[
    lambda genes: genes.index.str.startswith("ENS")
]

In [None]:
# Size of the per-gene table after the "ENS" filter, as (rows, columns).
results_core_group.shape

Query for external ids:

In [None]:
# Fetch the external-id rows and retain those whose stable id starts with
# "ENS"; the last line shows the size of the filtered frame.
results_external = pd.read_sql(query_external, con=engine).loc[
    lambda xrefs: xrefs["stable_id"].str.startswith("ENS")
]
results_external.shape

In [None]:
# Preview the first two external-id rows.
results_external.head(2)

### MGI

In [None]:
# MGI accessions per gene: select the MGI rows, collapse repeated
# (stable_id, accession) pairs, drop the helper columns, expose the accession
# as `mgi_id` and index the frame by stable id.
mgi = (
    results_external.loc[results_external["db_name"] == "MGI"]
    .drop_duplicates(subset=["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
    .rename(columns={"dbprimary_acc": "mgi_id"})
    .set_index("stable_id")
)

In [None]:
# List every row whose stable id occurs more than once in `mgi`
# (keep=False marks all occurrences, not only the repeats).
dup = mgi[mgi.index.duplicated(keep=False)]
dup

### Entrez

In [None]:
# EntrezGene accessions per gene: select the EntrezGene rows, collapse
# repeated (stable_id, accession) pairs, drop the helper columns, expose the
# accession as `ncbi_gene_id` and index the frame by stable id.
entrez = (
    results_external.loc[results_external["db_name"] == "EntrezGene"]
    .drop_duplicates(subset=["stable_id", "dbprimary_acc"])
    .drop(columns=["xref_id", "db_name"])
    .rename(columns={"dbprimary_acc": "ncbi_gene_id"})
    .set_index("stable_id")
)

In [None]:
# List every row whose stable id occurs more than once in `entrez`
# (keep=False marks all occurrences, not only the repeats).
dup = entrez[entrez.index.duplicated(keep=False)]
dup

### Merge Ensembl with MGI and Entrez

In [None]:
# Display the per-gene table before merging in the external ids.
results_core_group

In [None]:
# Outer-join the MGI ids onto the per-gene table using the index of both
# frames, then show any row whose index value repeats an earlier one.
df = pd.merge(
    results_core_group,
    mgi,
    how="outer",
    left_index=True,
    right_index=True,
)
df.loc[df.index.duplicated()]

In [None]:
# Outer-join the EntrezGene ids onto the table, again matching on the index.
df = pd.merge(df, entrez, how="outer", left_index=True, right_index=True)

In [None]:
# Shape the final table in one chain: turn the index into a column, apply the
# output column names, fix the column order, discard rows without an Ensembl
# gene id, and sort by that id with a fresh 0..n-1 index.
df = (
    df.reset_index()
    .rename(
        columns={
            "stable_id": "ensembl_gene_id",
            "display_label": "symbol",
            "synonym": "synonyms",
        }
    )
    .loc[
        :,
        [
            "ensembl_gene_id",
            "symbol",
            "ncbi_gene_id",
            "mgi_id",
            "biotype",
            "description",
            "synonyms",
        ],
    ]
    .loc[lambda table: table["ensembl_gene_id"].notnull()]
    .sort_values("ensembl_gene_id")
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the final table.
df.head()

In [None]:
# Size of the final table as (rows, columns).
df.shape

In [None]:
# Write the final table to a Parquet file; the path is relative, so the file
# lands in the current working directory.
df.to_parquet("df_mouse__ensembl__release-109__Gene.parquet")