# Plant `Gene`: Ensembl, release-57

- https://www.ensembl.org/info/data/mysql.html
- https://www.ensembl.org/info/docs/api/core/core_schema.html

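Prerequisite: install `mysqlclient` (https://pypi.org/project/mysqlclient/); the gene tables are fetched over a `mysql+mysqldb://` connection to the public Ensembl database server.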

In [1]:
from bionty_base.entities._gene import EnsemblGene

In [6]:
# Plant species to export. Each name is passed to EnsemblGene as `organism`
# and is also embedded verbatim (spaces included) in the output file name.
organisms = [
    "arabidopsis thaliana",
    "medicago truncatula",
    "solanum lycopersicum",
    "zea mays",
    "oryza sativa japonica group",
]
version = "release-57"

for organism in organisms:
    print(f"Downloading {organism}...")
    ensembl_gene = EnsemblGene(organism=organism, version=version, kingdom="plants")
    print("URL:", ensembl_gene._url)
    # NOTE(review): per the recorded output, the identifier column is
    # `ensembl_gene_id` for some organisms and `stable_id` for others.
    df = ensembl_gene.download_df()
    # https://github.com/laminlabs/bionty/issues/533
    # Remove bracketed annotations from the descriptions, then strip the
    # whitespace left behind so that "name [..]" becomes "name", not "name ".
    df["description"] = (
        df["description"].str.replace(r"\[.*?\]", "", regex=True).str.strip()
    )
    df.to_parquet(f"df_{organism}__ensembl__{version}__Gene.parquet")
    display(df.head())

Downloading arabidopsis thaliana...
URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/arabidopsis_thaliana_core_57_110_11
[94m•[0m fetching records from the core DB...
[94m•[0m fetching records from the external DBs...
[93m![0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 438
[93m![0m no ensembl_gene_id found, writing to table_id column.
[92m✓[0m downloaded Gene table containing 33127 entries.


Unnamed: 0,stable_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,AT1G01010,NAC001,839580,protein_coding,NAC domain containing protein 1,T25K16.1|ANAC001|T25K16_1|NAC domain containin...
1,AT1G01020,ARV1,839569,protein_coding,ARV1 family protein,T25K16.2|T25K16_2
2,AT1G01030,NGA3,839321,protein_coding,AP2/B3-like transcriptional factor family prot...,T25K16.3|NGATHA3|T25K16_3
3,AT1G01040,DCL1,839574,protein_coding,dicer-like 1,T25K16_4|SUS1|SHORT INTEGUMENTS 1|T25K16.4|CAF...
4,AT1G01046,ath-MIR838,6240410,miRNA,ath-MIR838,


Downloading medicago truncatula...
URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/medicago_truncatula_core_57_110_2
[94m•[0m fetching records from the core DB...
[94m•[0m fetching records from the external DBs...
[93m![0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 1589
[92m✓[0m downloaded Gene table containing 1328 entries.


Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSRNA049434913,tRNA-Asn,,tRNA,tRNA-Asn for anticodon GUU,
1,ENSRNA049434956,tRNA-Glu,,tRNA,tRNA-Glu for anticodon UUC,
2,ENSRNA049434965,tRNA-Met,,tRNA,tRNA-Met for anticodon CAU,
3,ENSRNA049435003,tRNA-Leu,,tRNA,tRNA-Leu for anticodon AAG,
4,ENSRNA049435027,tRNA-Gly,,tRNA,tRNA-Gly for anticodon UCC,


Downloading solanum lycopersicum...
URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/solanum_lycopersicum_core_57_110_3
[94m•[0m fetching records from the core DB...
[94m•[0m fetching records from the external DBs...
[93m![0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 243
[92m✓[0m downloaded Gene table containing 1167 entries.


Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSRNA049444660,tRNA-Val,,tRNA,tRNA-Val for anticodon UAC,
1,ENSRNA049446579,tRNA-Ile,,tRNA,tRNA-Ile for anticodon AAU,
2,ENSRNA050028289,tRNA-Ser,,tRNA,tRNA-Ser for anticodon AGA,
3,ENSRNA050028290,tRNA-Lys,,tRNA,tRNA-Lys for anticodon CUU,
4,ENSRNA050028291,tRNA-Met,,tRNA,tRNA-Met for anticodon CAU,


Downloading zea mays...
URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/zea_mays_core_57_110_8
[94m•[0m fetching records from the core DB...
[94m•[0m fetching records from the external DBs...
[93m![0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 641
[93m![0m no ensembl_gene_id found, writing to table_id column.
[92m✓[0m downloaded Gene table containing 44735 entries.


Unnamed: 0,stable_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,Zm00001eb000010,,,protein_coding,Zm00001e000001,
1,Zm00001eb000020,,,protein_coding,Zm00001e000002,
2,Zm00001eb000030,,,misc_non_coding,Zm00001e000003,
3,Zm00001eb000040,,,misc_non_coding,Zm00001e000004,
4,Zm00001eb000050,,,protein_coding,Zm00001e100003,


Downloading oryza sativa japonica group...
URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/oryza_sativa_core_57_110_7
[94m•[0m fetching records from the core DB...
[94m•[0m fetching records from the external DBs...
[93m![0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 780
[92m✓[0m downloaded Gene table containing 949 entries.


Unnamed: 0,ensembl_gene_id,symbol,ncbi_gene_id,biotype,description,synonyms
0,ENSRNA049440515,tRNA-Asn,,tRNA,tRNA-Asn for anticodon GUU,
1,ENSRNA049440716,tRNA-Leu,,tRNA,tRNA-Leu for anticodon AAG,
2,ENSRNA049441102,tRNA-Gln,,tRNA,tRNA-Gln for anticodon UUG,
3,ENSRNA049441259,tRNA-Ala,,tRNA,tRNA-Ala for anticodon AGC,
4,ENSRNA049441339,tRNA-Leu,,tRNA,tRNA-Leu for anticodon AAG,
