# `Gene`: ensembl, release-111

- https://www.ensembl.org/info/data/mysql.html
- https://www.ensembl.org/info/docs/api/core/core_schema.html

Install mysqlclient: https://pypi.org/project/mysqlclient/

In [26]:
from bionty_base.entities._gene import EnsemblGene

# Ensembl release tag: passed to every EnsemblGene(...) call and embedded in
# each output parquet filename in this notebook.
version = "release-111"

## Human

In [27]:
# Create the EnsemblGene helper for human at the release set in `version`.
ensembl_gene = EnsemblGene(organism="human", version=version)

In [28]:
# Download the human gene table. Per the log output, this fetches records from
# the core DB and then from the external DBs.
df = ensembl_gene.download_df()

💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 6031
✅ downloaded Gene table containing 76062 entries.


In [29]:
# Preview the raw download; descriptions still end with a bracketed
# "[Source:...]" annotation at this point.
df.head()

Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSG00000000003,TSPAN6,7105,protein_coding,tetraspanin 6 [Source:HGNC Symbol;Acc:HGNC:11858],TM4SF6|TSPAN-6|T245
1,ENSG00000000005,TNMD,64102,protein_coding,tenomodulin [Source:HGNC Symbol;Acc:HGNC:17757],MYODULIN|BRICD4|TEM|CHM1L|TENDIN
2,ENSG00000000419,DPM1,8813,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
3,ENSG00000000457,SCYL3,57147,protein_coding,SCY1 like pseudokinase 3 [Source:HGNC Symbol;A...,PACE-1|PACE1
4,ENSG00000000460,FIRRM,55732,protein_coding,FIGNL1 interacting regulator of recombination ...,FLIP|C1ORF112|FLJ10706|APOLO1|MEICA1


In [30]:
# https://github.com/laminlabs/bionty/issues/533
# Drop the bracketed "[Source:...;Acc:...]" annotation from each description.
# The annotation is preceded by a space, so strip afterwards; otherwise values
# are stored with trailing whitespace (e.g. "tetraspanin 6 ").
df["description"] = (
    df["description"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
)

In [31]:
# Re-check the preview: descriptions should no longer carry the bracketed
# source annotation.
df.head()

Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSG00000000003,TSPAN6,7105,protein_coding,tetraspanin 6,TM4SF6|TSPAN-6|T245
1,ENSG00000000005,TNMD,64102,protein_coding,tenomodulin,MYODULIN|BRICD4|TEM|CHM1L|TENDIN
2,ENSG00000000419,DPM1,8813,protein_coding,dolichyl-phosphate mannosyltransferase subunit...,CDGIE|MPDS
3,ENSG00000000457,SCYL3,57147,protein_coding,SCY1 like pseudokinase 3,PACE-1|PACE1
4,ENSG00000000460,FIRRM,55732,protein_coding,FIGNL1 interacting regulator of recombination ...,FLIP|C1ORF112|FLJ10706|APOLO1|MEICA1


In [32]:
# Persist the cleaned human gene table; the filename embeds organism, source,
# release (`version`) and entity name.
df.to_parquet(f"df_human__ensembl__{version}__Gene.parquet")

In [33]:
# Download the legacy-ID table for the genes in `df`.
df_legacy = ensembl_gene.download_legacy_ids_df(df)

In [34]:
# Sanity check: (rows, columns) of the human legacy-ID table.
df_legacy.shape

(8285, 14)

In [35]:
# Persist the human legacy-ID table; same naming scheme as the gene table but
# with a "df-legacy" prefix.
df_legacy.to_parquet(f"df-legacy_human__ensembl__{version}__Gene.parquet")

## Mouse

In [36]:
# Rebind `ensembl_gene` to the mouse helper for the same release; the human
# helper is no longer reachable under this name from here on.
ensembl_gene = EnsemblGene(organism="mouse", version=version)

In [37]:
# Download the mouse gene table (core DB first, then external DBs, per the
# log output). This overwrites the human `df`.
df = ensembl_gene.download_df()

💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 600
✅ downloaded Gene table containing 57545 entries.


In [38]:
# Preview the raw mouse download; descriptions still include the bracketed
# "[Source:MGI Symbol;...]" annotation.
df.head()

Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSMUSG00000000001,Gnai3,14679,protein_coding,guanine nucleotide binding protein (G protein)...,Galphai3
1,ENSMUSG00000000003,Pbsn,54192,protein_coding,probasin [Source:MGI Symbol;Acc:MGI:1860484],PB
2,ENSMUSG00000000028,Cdc45,12544,protein_coding,cell division cycle 45 [Source:MGI Symbol;Acc:...,Cdc45l
3,ENSMUSG00000000031,H19,14955,lncRNA,"H19, imprinted maternally expressed transcript...",
4,ENSMUSG00000000037,Scml2,107815,protein_coding,Scm polycomb group protein like 2 [Source:MGI ...,4932420G07Rik


In [39]:
# https://github.com/laminlabs/bionty/issues/533
# Drop the bracketed "[Source:...;Acc:...]" annotation from each description.
# The annotation is preceded by a space, so strip afterwards; otherwise values
# are stored with trailing whitespace (e.g. "probasin ").
df["description"] = (
    df["description"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
)

In [40]:
# Persist the cleaned mouse gene table; the filename embeds organism, source,
# release (`version`) and entity name.
df.to_parquet(f"df_mouse__ensembl__{version}__Gene.parquet")

In [41]:
# Download the legacy-ID table for the mouse genes in `df`.
df_legacy = ensembl_gene.download_legacy_ids_df(df)

In [42]:
# Sanity check: (rows, columns) of the mouse legacy-ID table.
df_legacy.shape

(30685, 14)

In [43]:
# Preview the legacy table: each row links an old stable ID to a new stable ID,
# together with the mapping session, score, DB names, releases and assemblies.
df_legacy.head()

Unnamed: 0,mapping_session_id,old_stable_id,old_version,new_stable_id,new_version,type,score,old_db_name,new_db_name,old_release,new_release,old_assembly,new_assembly,created
0,59,ENSMUSG00000079169,4,ENSMUSG00000027157,12,gene,0.99,mus_musculus_core_106_39,mus_musculus_core_107_39,106,107,GRCm39,GRCm39,2022-01-12 20:12:50
1,61,ENSMUSG00000095464,2,ENSMUSG00000046516,12,gene,0.99,mus_musculus_core_108_39,mus_musculus_core_109_39,108,109,GRCm39,GRCm39,2022-08-25 23:23:22
2,59,ENSMUSG00000085431,8,ENSMUSG00000054510,7,gene,0.99,mus_musculus_core_106_39,mus_musculus_core_107_39,106,107,GRCm39,GRCm39,2022-01-12 20:12:50
3,6,ENSMUSG00000067056,1,ENSMUSG00000070605,1,gene,0.993523,mus_musculus_core_36_34d,mus_musculus_core_38_35,36,38,NCBIM34,NCBIM35,2006-03-15 17:41:36
4,6,ENSMUSG00000068846,1,ENSMUSG00000071738,1,gene,0.991369,mus_musculus_core_36_34d,mus_musculus_core_38_35,36,38,NCBIM34,NCBIM35,2006-03-15 17:41:36


In [44]:
# Persist the mouse legacy-ID table; same naming scheme as the gene table but
# with a "df-legacy" prefix.
df_legacy.to_parquet(f"df-legacy_mouse__ensembl__{version}__Gene.parquet")

## Saccharomyces cerevisiae

In [45]:
# Rebind `ensembl_gene` to the yeast helper for the same release. Note the
# organism name is written with a space, not an underscore.
ensembl_gene = EnsemblGene(organism="saccharomyces cerevisiae", version=version)

In [46]:
# Download the yeast gene table, overwriting the mouse `df`. The log for this
# organism reports "no ensembl_gene_id found, writing to table_id column".
df = ensembl_gene.download_df()

💡 fetching records from the core DB...
💡 fetching records from the external DBs...
❗ duplicated #rows ensembl_gene_id with ncbi_gene_id: 218
❗ no ensembl_gene_id found, writing to table_id column.
✅ downloaded Gene table containing 7248 entries.


In [47]:
# https://github.com/laminlabs/bionty/issues/533
# Drop any bracketed "[...]" annotation from each description, then strip so
# that the whitespace left in front of a removed annotation is not stored as
# trailing whitespace.
df["description"] = (
    df["description"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
)

In [48]:
# Persist the cleaned yeast gene table. The organism name is used verbatim, so
# the resulting filename contains a space.
df.to_parquet(f"df_saccharomyces cerevisiae__ensembl__{version}__Gene.parquet")

In [49]:
# Download the legacy-ID table for yeast. The download log reports no
# ensembl_gene_id for this organism; presumably `col="stable_id"` selects the
# ID column to look up instead -- TODO confirm against download_legacy_ids_df.
df_legacy = ensembl_gene.download_legacy_ids_df(df, col="stable_id")

In [50]:
# Sanity check: (rows, columns) of the yeast legacy-ID table; it has zero rows
# in the recorded run.
df_legacy.shape

(0, 14)