Source: https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset

In [1]:
library(biomaRt)

### Step 1: Identifying the database you need

In [2]:
# List the Ensembl BioMart databases (marts) and their versions.
listEnsembl()

biomart,version
<chr>,<chr>
genes,Ensembl Genes 108
mouse_strains,Mouse strains 108
snps,Ensembl Variation 108
regulation,Ensembl Regulation 108


In [3]:
# Connect to the main Ensembl mart ("genes"), which provides access to gene
# annotation information. No dataset is selected yet at this point.
ensembl <- useEnsembl(biomart = "genes")
# Display the Mart object to confirm which BioMart database is in use.
ensembl

Object of class 'Mart':
  Using the ENSEMBL_MART_ENSEMBL BioMart database
  No dataset selected.

### Step 2: Choosing a dataset

In [4]:
# Retrieve the table of datasets available in the selected mart
# (one row per dataset: name, description, assembly version) and preview it.
datasets <- listDatasets(ensembl)
head(datasets)

Unnamed: 0_level_0,dataset,description,version
Unnamed: 0_level_1,<I<chr>>,<I<chr>>,<I<chr>>
1,abrachyrhynchus_gene_ensembl,Pink-footed goose genes (ASM259213v1),ASM259213v1
2,acalliptera_gene_ensembl,Eastern happy genes (fAstCal1.2),fAstCal1.2
3,acarolinensis_gene_ensembl,Green anole genes (AnoCar2.0v2),AnoCar2.0v2
4,acchrysaetos_gene_ensembl,Golden eagle genes (bAquChr1.2),bAquChr1.2
5,acitrinellus_gene_ensembl,Midas cichlid genes (Midas_v5),Midas_v5
6,amelanoleuca_gene_ensembl,Giant panda genes (ASM200744v2),ASM200744v2


In [5]:
# Find the dataset for the species of interest by pattern matching.
# Alternative search (human), kept for reference:
#searchDatasets(mart = ensembl, pattern = "hsapiens")
# "Macaca_fascicularis" occurs in the description/version of the crab-eating
# macaque entry, so this pattern singles out that dataset.
searchDatasets(mart = ensembl, pattern = "Macaca_fascicularis")

Unnamed: 0_level_0,dataset,description,version
Unnamed: 0_level_1,<I<chr>>,<I<chr>>,<I<chr>>
98,mfascicularis_gene_ensembl,Crab-eating macaque genes (Macaca_fascicularis_6.0),Macaca_fascicularis_6.0


In [6]:
# Attach the crab-eating macaque gene dataset to the existing Mart object.
# useDataset() returns the updated Mart, so it is assigned back to `ensembl`.
ensembl <- useDataset(mart = ensembl, dataset = "mfascicularis_gene_ensembl")

> If the dataset one wants to use is known in advance, we can select both the database and dataset in one step:
> ```R
> ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
> ```

In [7]:
# Table of the attributes (retrievable fields) offered by the selected dataset.
# NOTE: this variable shares its name with base::attributes(); function calls
# still resolve to the base function, and the name is kept unchanged so that
# any code referring to `attributes` keeps working.
attributes <- listAttributes(ensembl)

In [8]:
# Search the dataset's attributes for entries matching "symbol"
# (i.e. the available gene-symbol fields).
searchAttributes(mart = ensembl, pattern = "symbol")

Unnamed: 0_level_0,name,description,page
Unnamed: 0_level_1,<chr>,<chr>,<chr>
44,hgnc_symbol,HGNC symbol,feature_page
62,uniprot_gn_symbol,UniProtKB Gene Name symbol,feature_page


In [9]:
# Load the gzipped, tab-separated feature table. It has no header row, so the
# columns are auto-named V1 (Ensembl gene ID), V2 (gene name), V3 (feature type).
df_genes <- read.delim(
  file = "../data/Sample_210017A_3_120h/features.tsv.gz",
  header = FALSE,
  row.names = NULL
)
head(df_genes)

Unnamed: 0_level_0,V1,V2,V3
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ENSMFAG00000044637,PGBD2,Gene Expression
2,ENSMFAG00000011984,U6,Gene Expression
3,ENSMFAG00000039056,ZNF692,Gene Expression
4,ENSMFAG00000030010,ZNF672,Gene Expression
5,ENSMFAG00000002737,SH3BP5L,Gene Expression
6,ENSMFAG00000005691,ENSMFAG00000005691,Gene Expression


In [10]:
# NOTE: despite its name, `entrez` holds the Ensembl gene IDs (ENSMFAG...)
# from the first column of the feature table, not Entrez IDs. The name is kept
# unchanged so that any code referring to it keeps working.
entrez <- df_genes[["V1"]]

# Query BioMart for the gene name and HGNC symbol of every Ensembl gene ID.
# The returned rows are NOT in the order of `entrez` (compare the two
# previews), so match on `ensembl_gene_id` instead of relying on row position.
gene_anno <- getBM(
  attributes = c("ensembl_gene_id", "external_gene_name", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = entrez,
  mart = ensembl
)
head(gene_anno)

Unnamed: 0_level_0,ensembl_gene_id,external_gene_name,hgnc_symbol
Unnamed: 0_level_1,<chr>,<chr>,<chr>
1,ENSMFAG00000000043,MCCC1,MCCC1
2,ENSMFAG00000000045,,
3,ENSMFAG00000000049,C1orf185,C1orf185
4,ENSMFAG00000000068,IL22RA1,IL22RA1
5,ENSMFAG00000000070,POMGNT1,POMGNT1
6,ENSMFAG00000000076,CD247,CD247


In [11]:
# Save the annotation table as CSV in the working directory. Row names are
# written as the first column (the write.csv default, spelled out here).
write.csv(x = gene_anno, file = "gene_anno.csv", row.names = TRUE)