<a href="https://colab.research.google.com/github/katarinagresova/benchmarks/blob/main/ensembl/Scrape_Ensembl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [80]:
!pip install biopython



In [101]:
import urllib.request
import os
import pandas as pd
import gzip
from tqdm.notebook import tqdm
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

# Data download

In [117]:
ensembl_ftp = 'ftp://ftp.ensembl.org/pub/'
release = '100'
feature_file = 'mmusculus_regulatory_feature__regulatory_feature__main.txt.gz'
organism = 'mus_musculus'
fasta_file = 'Mus_musculus.GRCm38.dna_rm.toplevel.fa.gz'

urllib.request.urlretrieve(
    ensembl_ftp + 'release-' + release + '/mysql/regulation_mart_' + release + '/' + feature_file 
    feature_file
)

urllib.request.urlretrieve(
    ensembl_ftp + 'release-' + release + 'fasta/' + organism + '/dna/' + fasta_file 
    fasta_file
)

# Data preprocessing

In [83]:
header_list = [
  "feature_type_description", 
  "bound_seq_region_start", 
  "feature_type_name",
  "bound_seq_region_end",
  "so_name",
  "so_accession",
  "seq_region_name",
  "stable_id",
  "regulatory_feature_id",
  "seq_region_start",
  "seq_region_end",
  "seq_region_strand"
]
df = pd.read_csv(local_path, sep='\t', names=header_list)

In [84]:
df.head()

Unnamed: 0,feature_type_description,bound_seq_region_start,feature_type_name,bound_seq_region_end,so_name,so_accession,seq_region_name,stable_id,regulatory_feature_id,seq_region_start,seq_region_end,seq_region_strand
0,Predicted promoter flanking region,8421801,Promoter Flanking Region,8422039,promoter_flanking_region,SO:0001952,8,ENSMUSR00000727517,1,8421801,8422039,0
1,Predicted promoter flanking region,103364401,Promoter Flanking Region,103368000,promoter_flanking_region,SO:0001952,5,ENSMUSR00000214044,2,103364401,103368000,0
2,CTCF Binding Site,144935201,CTCF Binding Site,144936200,CTCF_binding_site,SO:0001974,7,ENSMUSR00000451846,3,144935201,144936200,0
3,CTCF Binding Site,24612601,CTCF Binding Site,24613000,CTCF_binding_site,SO:0001974,19,ENSMUSR00000381379,4,24612601,24613000,0
4,Predicted enhancer,125880801,Enhancer,125881000,enhancer,SO:0000165,8,ENSMUSR00000742157,5,125880801,125881000,0


In [111]:
seqs = df[['so_name', 'seq_region_name', 'seq_region_start', 'seq_region_end']]
seqs['seq'] = ''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [112]:
seqs

Unnamed: 0,so_name,seq_region_name,seq_region_start,seq_region_end,seq
0,promoter_flanking_region,8,8421801,8422039,
1,promoter_flanking_region,5,103364401,103368000,
2,CTCF_binding_site,7,144935201,144936200,
3,CTCF_binding_site,19,24612601,24613000,
4,enhancer,8,125880801,125881000,


In [118]:
def which(self):
    try:
        self = list(iter(self))
    except TypeError as e:
        raise Exception("""'which' method can only be applied to iterables.
        {}""".format(str(e)))
    indices = [i for i, x in enumerate(self) if bool(x) == True]
    return(indices)

with gzip.open(fasta_file, "rt") as handle:
    for record in tqdm(SeqIO.parse(handle, "fasta"), total=24):
        sel_seqs = which(seqs.seq_region_name == record.id)
        for i in sel_seqs:
            seqs.loc[i, "seq"] = str(record.seq[(seqs.seq_region_start[i]-1):seqs.seq_region_end[i]])
        
        if record.id == "MT": 
            # stop, do not read small contigs
            break

HBox(children=(FloatProgress(value=0.0, max=24.0), HTML(value='')))

# Saving to fasta

In [109]:
def save_to_fasta(filename, seq_df):
  with open(filename + '.fa', 'w') as handle:
    for index, record in seq_df.iterrows():
      SeqIO.write(
          SeqRecord(
              Seq(record.seq), 
              record.so_name + '_' + record.seq_region_name + ":" + str(record.seq_region_start) + ".." + str(record.seq_region_end), 
              description=""
          ), 
          handle, 
          'fasta'
      )

In [110]:
features = seqs.so_name.unique()
for feature in features:
  feature_seqs = seqs[seqs.so_name == feature]
  save_to_fasta(organism + '_' + feature, feature_seqs)