<a href="https://colab.research.google.com/github/katarinagresova/benchmarks/blob/main/ensembl/Scrape_Ensembl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install biopython
!pip install apybiomart
!pip install ensembl_rest

In [35]:
import apybiomart as apy
import numpy as np
import os
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [2]:
# Mount Google Drive at /content/drive/ (only available inside Google Colab).
# NOTE(review): the cells below write to the relative "biomart/" directory, not
# to a path under /content/drive/ -- confirm the intended working directory.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Examine BioMart API

In [None]:
apy.find_marts()

In [None]:
apy.find_datasets(mart="ENSEMBL_MART_FUNCGEN")

# Download data

In [3]:
def dump_dataset(dataset, output_path="", filters=None):
  """Query every attribute of a BioMart dataset and save the result.

  Args:
    dataset: BioMart dataset name, e.g. "hsapiens_external_feature".
    output_path: Path handed to apybiomart as the save location; defaults to
      "<dataset>.csv" when empty.
    filters: Optional mapping of BioMart filter name to value. ``None`` (the
      default) means an empty filter dict is sent.

  Returns:
    The value returned by ``apy.query`` (used as a DataFrame in this notebook).
  """
  # Build a fresh dict per call: a `{}` default would be one object shared by
  # every call, so a mutation made in one call would leak into the next.
  filters = {} if filters is None else filters
  output_path = dataset + ".csv" if output_path == "" else output_path
  # Request all attributes that BioMart lists for this dataset.
  attr = apy.find_attributes(dataset=dataset)
  return apy.query(
    attributes=np.array(attr['Attribute_ID']),
    filters=filters,
    dataset=dataset,
    save=True,
    output=output_path
  )

In [9]:
# Needed in Google Colab: the notebook kernel already runs an asyncio event loop, so nest_asyncio is applied to let us run our own loop inside it.
import nest_asyncio
import asyncio
nest_asyncio.apply()

def dump_dataset_async(dataset, output_path="", chromosomes=None):
  """Query a BioMart dataset with one concurrent request per chromosome.

  Args:
    dataset: BioMart dataset name.
    output_path: Base save path; defaults to "<dataset>.csv" when empty. Each
      chromosome gets its own file, with "_<chromosome>" inserted before the
      extension, so the concurrent queries do not overwrite one another.
    chromosomes: Iterable of chromosome names to query. Defaults to the human
      chromosomes 1-22, X and Y.

  Returns:
    List with one ``apy.aquery`` result per chromosome, in the order of
    ``chromosomes``.
  """
  output_path = dataset + ".csv" if output_path == "" else output_path
  path_root, path_ext = os.path.splitext(output_path)
  if chromosomes is None:
    # Chromosome names start at "1"; there is no chromosome "0".
    chromosomes = [str(number) for number in range(1, 23)] + ["X", "Y"]
  # Look the attribute list up once and reuse it for every query.
  attributes = np.array(apy.find_attributes(dataset=dataset)['Attribute_ID'])
  tasks = [
      apy.aquery(
          attributes=attributes,
          filters={
              "chromosome_name": str(chromosome)
          },
          dataset=dataset,
          save=True,
          output=path_root + "_" + str(chromosome) + path_ext
      )
      for chromosome in chromosomes
  ]
  loop = asyncio.get_event_loop()
  return loop.run_until_complete(asyncio.gather(*tasks))

### hsapiens_external_feature

In [4]:
dataset="hsapiens_external_feature"

In [11]:
apy.find_attributes(dataset=dataset)

  df.replace(pd.np.nan, "", inplace=True)


Unnamed: 0,Attribute_ID,Attribute_name,Attribute_description,Dataset_ID
0,chromosome_name,Chromosome/scaffold Name,,hsapiens_external_feature
1,chromosome_start,Start (bp),,hsapiens_external_feature
2,chromosome_end,End (bp),,hsapiens_external_feature
3,feature_type,Feature type,,hsapiens_external_feature
4,feature_type_class,Feature type class,,hsapiens_external_feature
5,feature_type_description,Feature type description,,hsapiens_external_feature
6,display_label,Identifier,,hsapiens_external_feature
7,so_accession,SO term accession,,hsapiens_external_feature
8,so_name,SO term name,,hsapiens_external_feature
9,db_display_name,Database name,,hsapiens_external_feature


In [12]:
apy.find_filters(dataset=dataset)

  df.replace(pd.np.nan, "", inplace=True)


Unnamed: 0,Filter_ID,Filter_type,Filter_description,Dataset_ID
0,chromosome_name,,,hsapiens_external_feature
1,start,text,Determine which base pair on the specified chr...,hsapiens_external_feature
2,end,text,Determine which base pair on the specified chr...,hsapiens_external_feature
3,chromosomal_region,text,Limit to Genes within multiple comma separate ...,hsapiens_external_feature
4,band_start,drop_down_basic_filter,,hsapiens_external_feature
5,band_end,drop_down_basic_filter,,hsapiens_external_feature
6,marker_start,drop_down_basic_filter,,hsapiens_external_feature
7,marker_end,,,hsapiens_external_feature
8,hsapiens_encode.encode_region,,,hsapiens_external_feature
9,external_feature_set_name,list,,hsapiens_external_feature


In [6]:
hsapiens_external_feature = dump_dataset(dataset, "biomart/"+dataset+".csv")

  df.replace(pd.np.nan, "", inplace=True)
  """Entry point for launching an IPython kernel.
  result.replace(pd.np.nan, "", inplace=True)


In [6]:
hsapiens_external_feature.head()

Unnamed: 0,Chromosome/scaffold Name,Start (bp),End (bp),Feature type,Feature type class,Feature type description,Identifier,SO term accession,SO term name,Database name,Display label,Stable ID,Linkage annotation
0,1,922877,923268,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:922877-923268,SO:0000165,enhancer,,,,
1,1,983070,983175,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:983070-983175,SO:0000165,enhancer,,,,
2,1,1006412,1006755,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1006412-1006755,SO:0000165,enhancer,,,,
3,1,1010390,1010654,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1010390-1010654,SO:0000165,enhancer,,,,
4,1,1021184,1021432,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1021184-1021432,SO:0000165,enhancer,,,,


In [None]:
hsapiens_external_feature['SO term name'].value_counts()

transcription_start_site    388825
enhancer                     82370
Name: SO term name, dtype: int64

### hsapiens_regulatory_feature

In [13]:
dataset = "hsapiens_regulatory_feature"

In [17]:
hsapiens_regulatory_feature = dump_dataset(
    dataset, 
    output_path="biomart/"+dataset+"_0.csv", 
    filters={"chromosome_name":"1", "regulatory_feature_type_name":"Promoter"}
)

  df.replace(pd.np.nan, "", inplace=True)
  result.replace(pd.np.nan, "", inplace=True)


In [14]:
apy.find_filters(dataset=dataset)

  df.replace(pd.np.nan, "", inplace=True)


Unnamed: 0,Filter_ID,Filter_type,Filter_description,Dataset_ID
0,chromosome_name,text,,hsapiens_regulatory_feature
1,start,text,Determine which base pair on the specified chr...,hsapiens_regulatory_feature
2,end,text,Determine which base pair on the specified chr...,hsapiens_regulatory_feature
3,chromosomal_region,text,Limit to Genes within multiple comma separate ...,hsapiens_regulatory_feature
4,band_start,drop_down_basic_filter,,hsapiens_regulatory_feature
5,band_end,drop_down_basic_filter,,hsapiens_regulatory_feature
6,marker_start,drop_down_basic_filter,,hsapiens_regulatory_feature
7,marker_end,,,hsapiens_regulatory_feature
8,hsapiens_encode.encode_region,,,hsapiens_regulatory_feature
9,regulatory_stable_id,list,,hsapiens_regulatory_feature


# Get sequences from positions

In [13]:
import ensembl_rest

In [12]:
def get_sequences_by_positions(identifiers_list):
  """Request the sequences of several regions with a single call.

  `identifiers_list` is sent as the 'regions' entry of the request parameters
  of `ensembl_rest.sequence_region_post` (species "human"); the response is
  returned unchanged.
  """
  post_params = {'regions': identifiers_list}
  response = ensembl_rest.sequence_region_post(
      species="human",
      format="fasta",
      params=post_params
  )
  return response

In [30]:
def get_sequence_by_position(identifier):
  """Request the sequence of one region and return the response unchanged.

  `identifier` is passed as the `region` argument of
  `ensembl_rest.sequence_region`, with species fixed to "human".
  """
  request = dict(species="human", format="fasta", region=identifier)
  return ensembl_rest.sequence_region(**request)

In [11]:
def make_seqrecord_from_response(response_record):
  """Turn one sequence entry of a response into a `SeqRecord`.

  Reads the 'seq' and 'id' keys of `response_record`. The description is set
  to an empty string (presumably so that only the id ends up in the FASTA
  header -- confirm).
  """
  sequence = Seq(response_record['seq'])
  record_id = response_record['id']
  return SeqRecord(sequence, id=record_id, description="")

In [31]:
def save_to_fasta(out_path, indentifiers):
  """Fetch the sequence of every region and write them to one FASTA file.

  Args:
    out_path: File to write; opened in 'w' mode, so existing content is
      replaced.
    indentifiers: Iterable of region identifiers. Each one is fetched with its
      own `get_sequence_by_position` call and written as soon as it arrives.
  """
  with open(out_path, 'w') as fasta_file:
    for region in indentifiers:
      record = make_seqrecord_from_response(get_sequence_by_position(region))
      SeqIO.write(record, fasta_file, 'fasta')

In [27]:
def get_positions(biomart_list):
  """Return the values of the 'Identifier' column as a list."""
  identifier_column = biomart_list['Identifier']
  return identifier_column.tolist()

### hsapiens_external_feature

In [7]:
# Keep only the rows whose SO term name is 'enhancer'.
enhancers = hsapiens_external_feature.loc[
    lambda features: features['SO term name'] == 'enhancer'
]
enhancers.head()

Unnamed: 0,Chromosome/scaffold Name,Start (bp),End (bp),Feature type,Feature type class,Feature type description,Identifier,SO term accession,SO term name,Database name,Display label,Stable ID,Linkage annotation
0,1,922877,923268,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:922877-923268,SO:0000165,enhancer,,,,
1,1,983070,983175,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:983070-983175,SO:0000165,enhancer,,,,
2,1,1006412,1006755,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1006412-1006755,SO:0000165,enhancer,,,,
3,1,1010390,1010654,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1010390-1010654,SO:0000165,enhancer,,,,
4,1,1021184,1021432,FANTOM predictions,Enhancer,"FANTOM enhancers, permissive",1:1021184-1021432,SO:0000165,enhancer,,,,


In [None]:
enhancers_positions = get_positions(enhancers)
# Show the size and a short preview only: the list holds one entry per enhancer
# row, and displaying all of it would flood the notebook output.
len(enhancers_positions), enhancers_positions[:5]

In [None]:
# Name the file after the dataset the enhancers were taken from. The `dataset`
# variable has been reassigned to "hsapiens_regulatory_feature" by this point,
# so building the path from it would mislabel the output.
save_to_fasta("biomart/hsapiens_external_feature_enhancers.fa", enhancers_positions)