In [1]:
# This notebook gives an example of how to build relatively simple data paths.
# It assumes that you understand the concepts presented in the example 2
# notebook.

# This example requires that you understand a little bit about the example
# catalog data model, which is based on the FaceBase project.

# Key tables:
#  - 'dataset' : represents a unit of data, usually a 'study' or 'experiment'
#  - 'sample' : a biosample
#  - 'assay' : a bioassay (typically RNA-seq or ChIP-seq assays)

# Relationships
#  - 'dataset <- sample': A dataset may have one to many samples. I.e., there 
#    is a foreign key reference from sample to dataset.
#  - 'sample <- assay': A sample may have one to many assays. I.e., there is a
#    foreign key reference from assay to sample.

In [2]:
# Import deriva modules
from deriva_common import ErmrestCatalog, get_credential

In [3]:
# Location of the deriva catalog to connect to.
protocol = 'https'
hostname = 'www.facebase.org'
catalog_number = 1

# No credential is supplied by default. If you need to authenticate, use the
# Deriva Auth agent and get the credential instead:
# credential = get_credential(hostname)
credential = None

# Connect with the deriva catalog.
catalog = ErmrestCatalog(protocol, hostname, catalog_number, credential)

In [4]:
# Get the path builder interface for this catalog. Schemas and their tables
# are then reached as attributes of `pb` (e.g. `pb.isa.dataset`).
pb = catalog.getPathBuilder()

In [5]:
# Bind short names to the three key tables of the 'isa' schema. These local
# variables are defined only for convenience.
dataset, sample, assay = pb.isa.dataset, pb.isa.sample, pb.isa.assay

In [6]:
# Link related tables together to form a data path:
# dataset -> sample -> assay.
# By default, a data path returns entities for the _last_ entity set linked
# into the path, so this one returns assays, not datasets.
assays_datapath = (
    dataset
    .link(sample)
    .link(assay)
)
print(assays_datapath.uri)  # the ERMrest URL

https://www.facebase.org/ermrest/catalog/1/entity/isa:dataset/isa:sample/isa:assay


In [7]:
# Get the entity set for this linked data path. Because 'assay' is the last
# table linked into the path, these entities are assays.
assays_entities = assays_datapath.entities()

In [8]:
# The entity set behaves like a container, so `len` reports how many
# entities it holds.
len(assays_entities)

171

In [9]:
# Building on the existing path, a filter can be added. The filter expression
# may reference the assay's attributes. In these binary comparisons the left
# operand must be an attribute, while the right operand must be a literal
# value.
assays_datapath_filtered = assays_datapath.filter(
    assay.molecule_type == 'mRNA'
)
print(assays_datapath_filtered.uri)  # the ERMrest URL
assays_datapath_filtered_entities = assays_datapath_filtered.entities()

https://www.facebase.org/ermrest/catalog/1/entity/isa:dataset/isa:sample/isa:assay/(molecule_type=mRNA)


In [10]:
# Count the entities in the filtered set
len(assays_datapath_filtered_entities)

6

In [None]:
# Slice the entity set to pull out a couple of entities, then print them
print(assays_datapath_filtered_entities[2:4])

[{'id': 15, 'dataset': 14068, 'sample': 2, 'replicate': '5', 'sample_composition': 'maxillary process', 'sample_type': 'RNA-seq', 'molecule_type': 'mRNA', 'sample_purification': 'excision', 'markers': 'histology', 'isolation_protocol': '', 'cell_count': 'NA', 'protocol': '', 'pretreatment': 'Trizol', 'fragmentation_method': 'Fragmentation Buffer from Illumina', 'reagent': 'TruSeq stranded total RNA kit', 'reagent_source': 'Illumina', 'reagent_catalog_number': '15032619.0', 'reagent_batch_number': '', 'selection': 'totalRNA', 'library_id': 75, 'alignment_id': 55, 'tracks_id': 35}, {'id': 20, 'dataset': 14068, 'sample': 4, 'replicate': '5', 'sample_composition': 'mandibular process', 'sample_type': 'RNA-seq', 'molecule_type': 'mRNA', 'sample_purification': 'excision', 'markers': 'histology', 'isolation_protocol': '', 'cell_count': 'NA', 'protocol': '', 'pretreatment': 'Trizol', 'fragmentation_method': 'Fragmentation Buffer from Illumina', 'reagent': 'TruSeq stranded total RNA kit', 'reag

In [None]:
# Convert to a DataFrame. Note that `dataframe` is accessed as an attribute,
# not called as a method.
# NOTE(review): presumably a pandas DataFrame, which would require pandas to
# be installed — confirm.
assays_datapath_filtered_entities.dataframe

In [None]:
# IMPORTANT: methods on data paths (e.g., link, filter, and others) return new
# data path objects. They do not alter the existing data path. Notice here that
# getting the entities for the original data path returns the same number as
# the earlier `len(assays_entities)` cell. It did not change when the filter
# was added, as that created and returned a new data path object.
len(assays_datapath.entities())

In [None]:
# Returning to the initial example, if we want to project additional attributes
# from other entities in the data path, we need to be able to reference the
# "instances" of linked entity sets at any point in the path. To do so, first
# we need to define a few table "aliases" that we can use in the paths.

# Start by defining an alias for the 'dataset' table. The string 'D' is the
# name given to `as_`; the Python variable `D` holds the resulting alias.
D = dataset.as_('D')

In [None]:
# Like the original table, an alias may be used to reference the columns of the
# original table. Here the 'accession' column is looked up by name through the
# alias's `columns` collection.
D.columns['accession']

In [None]:
# Repeat the dataset -> sample -> assay path, but start from the alias D in
# place of the table, then project attributes from the last referenced table
# (assay) and from any aliased tables (D).
datapath = (
    D.link(sample)
    .link(assay)
    .attributes(D.accession, assay.molecule_type, assay.sample_type)
)
print(datapath.uri)

In [None]:
# Get the entity set for the projected path, then print the first ten
# results, one per line
entities = datapath.entities()
for e in entities[0:10]:
    print(e)

In [None]:
# Now define another alias, this time for the 'sample' table, so that sample's
# columns may be projected as well.
S = sample.as_('S')

In [None]:
# Build a brand-new datapath instance. To link the sample table under an
# alias, first name the table being linked and then pass the alias to link it
# "as". This is similar in spirit to the SQL concept of joining tables and
# renaming them "as" a given table instance name.
datapath = (
    D.link(sample, as_=S)
    .link(assay)
    .attributes(D.accession, S.stage, assay.sample_type)
)
print(datapath.uri)

In [None]:
# Get a few entities and print them out, one per line.
# NOTE(review): `limit=5` presumably caps the number of entities fetched —
# confirm against the datapath API.
for e in datapath.entities(limit=5):
    print(e)