In [69]:
%load_ext autoreload
%autoreload 2
%pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Pretty printing has been turned OFF


# Exploring available regulatory data in the Ensembl database

In [134]:
from ensembl_scraper.regulatory import Metadata

First we need to create an object for specific Ensembl release. We probably already know which release do we want but we can use the following command to get all the available Ensembl releases.

In [71]:
Metadata.get_releases()

['100', '101', '102', '103', '104', '105', '106', '19', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99']

But if we are not interested in some specific release we can use the following command to get the latest release.

In [135]:
m = Metadata('latest')

And we can verify that the release we are using really is the latest one.

In [130]:
m.get_release()

106

And now we can start exploring the data within the selected release. First thing we can do is to get the list of available species.

In [124]:
m.get_organisms()

{'mus_musculus', 'homo_sapiens'}

Then we can get the list of available regulatory feature classes for each species.

In [125]:
for organism in m.get_organisms():
    print(organism, m.get_feature_classes(organism))

mus_musculus {'regulatory_feature', 'external_feature', 'peak', 'mirna_target_feature'}
homo_sapiens {'regulatory_feature', 'external_feature', 'peak', 'mirna_target_feature'}


And sometimes it might come in handy to translate organism names into human readable names.

In [126]:
for organism in m.get_organisms():
    print(organism, "-", m.get_full_name(organism))

mus_musculus - Mouse
homo_sapiens - Human


In [139]:
m.get_features('mus_musculus', 'peak')

ftp://ftp.ensembl.org/pub/release-106/mysql/regulation_mart_106/mmusculus_peak__peak__main.txt.gz
█

In [115]:
m.feature_data['mus_musculus']['external_feature']

Unnamed: 0,seq_region_strand,seq_region_start,seq_region_end,feature_type_description,seq_region_name
0,+,113072870,113077010,Enhancer identified by positive VISTA assay,chr11
1,+,55356208,55358345,Enhancer identified by positive VISTA assay,chr9
2,+,18950903,18952943,Enhancer identified by positive VISTA assay,chr7
3,+,13672328,13676370,Enhancer identified by positive VISTA assay,chr8
4,+,143444327,143446382,Enhancer identified by positive VISTA assay,chr6
...,...,...,...,...,...
628,+,96300066,96302831,Enhancer identified by positive VISTA assay,chr7
629,+,135639643,135641905,Enhancer identified by positive VISTA assay,chr7
630,+,151734158,151735565,Enhancer identified by positive VISTA assay,chr3
631,+,83062627,83064298,Enhancer identified by positive VISTA assay,chr12


In [132]:
columns = m._get_column_names('mus_musculus', 'external_feature')
columns

['feature_type_class', 'so_accession', 'seq_region_strand', 'seq_region_start', 'display_label', 'seq_region_end', 'fs_display_label', 'so_term', 'external_feature_id', 'feature_type_description', 'seq_region_name']

In [133]:
import pandas as pd

pd.read_csv('~/.ensembl_scraper/106/data/mmusculus_external_feature__external_feature__main.txt', sep='\t', names=columns)

Unnamed: 0,feature_type_class,so_accession,seq_region_strand,seq_region_start,display_label,seq_region_end,fs_display_label,so_term,external_feature_id,feature_type_description,seq_region_name
0,Enhancer,SO:0000165,1,113072871,1766,113077010,VISTA Enhancers,enhancer,251727,Enhancer identified by positive VISTA assay,11
1,Enhancer,SO:0000165,1,55356209,1754,55358345,VISTA Enhancers,enhancer,251725,Enhancer identified by positive VISTA assay,9
2,Enhancer,SO:0000165,1,18950904,1753,18952943,VISTA Enhancers,enhancer,251724,Enhancer identified by positive VISTA assay,7
3,Enhancer,SO:0000165,1,13672329,1751,13676370,VISTA Enhancers,enhancer,251722,Enhancer identified by positive VISTA assay,8
4,Enhancer,SO:0000165,1,143444328,1750,143446382,VISTA Enhancers,enhancer,251721,Enhancer identified by positive VISTA assay,6
...,...,...,...,...,...,...,...,...,...,...,...
628,Enhancer,SO:0000165,1,96300067,2003,96302831,VISTA Enhancers,enhancer,251820,Enhancer identified by positive VISTA assay,7
629,Enhancer,SO:0000165,1,135639644,2007,135641905,VISTA Enhancers,enhancer,251824,Enhancer identified by positive VISTA assay,7
630,Enhancer,SO:0000165,1,151734159,2010,151735565,VISTA Enhancers,enhancer,251827,Enhancer identified by positive VISTA assay,3
631,Enhancer,SO:0000165,1,83062628,2011,83064298,VISTA Enhancers,enhancer,251828,Enhancer identified by positive VISTA assay,12
