# Fireworks overview

This notebook demonstrates querying of Fireworks workflows and Filepad objects

## Initialization

### IPython magic

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
%aimport

### Imports

In [2]:
import ase.io # here used for reading pdb files
from ase.visualize import view
from ase.visualize.plot import plot_atoms # has nasty offset issues
from cycler import cycler # here used for cycling through colors in plots
import datetime
import fabric # for pythonic ssh connections
from fireworks import LaunchPad, Firework, Tracker, Workflow 
from fireworks import FileTransferTask, PyTask, ScriptTask

# FireWorks functionality 
from fireworks import Firework, LaunchPad, ScriptTask, Workflow
from fireworks.user_objects.firetasks.templatewriter_task import TemplateWriterTask
from fireworks.user_objects.firetasks.filepad_tasks import AddFilesTask, GetFilesTask, GetFilesByQueryTask
from imteksimfw.fireworks.user_objects.firetasks.cmd_tasks import CmdTask
from fireworks.utilities.filepad import FilePad # direct FilePad access, similar to the familiar LaunchPad

from collections.abc import Iterable
import glob
import gc # manually clean up memory with gc.collect()
import gromacs # GromacsWrapper, here used for evoking gmc commands, reading and writing .ndx files
# from io import StringIO, TextIOWrapper
import io
from IPython.display import display, Image #, Video # display image files within notebook
from ipywidgets import Video  # display video within notebook
import itertools # for products of iterables
import json # generic serialization of lists and dicts
import jinja2 # here used for filling packmol input script template
import jinja2.meta # for gathering variables in a jinja2 template
import logging 
import matplotlib.pyplot as plt
import MDAnalysis as mda # here used for reading and analyzing gromacs trajectories
import MDAnalysis.analysis.rdf as mda_rdf
import MDAnalysis.analysis.rms as mda_rms
from mpl_toolkits.mplot3d import Axes3D # here used for 3d point cloud scatter plot
import miniball # finds minimum bounding sphere of a point set
import nglview
import numpy as np
import os, os.path
import pandas as pd
import panedr # reads GROMACS edr into pandas df, requires pandas and pbr
import parmed as pmd # has quite a few advantages over ASE when it comes to parsing pdb
from pprint import pprint
import pymongo # for sorting in queries
import scipy.constants as sc
import subprocess # used for evoking external packmol
import sys
import tempfile
import yaml

NOTE: Some configuration directories are not set up yet: 
	/home/centos/.gromacswrapper
	/home/centos/.gromacswrapper/qscripts
	/home/centos/.gromacswrapper/templates
NOTE: You can create the configuration file and directories with:
	>>> import gromacs
	>>> gromacs.config.setup()




GromacsWrapper might need a file `~/.gromacswrapper.cfg` with content
```cfg
[Gromacs]
tools = gmx gmx_d 
# gmx_mpi_d gmx_mpi_d

# name of the logfile that is written to the current directory
logfilename = gromacs.log

# loglevels (see Python's logging module for details)
#   ERROR   only fatal errors
#   WARN    only warnings
#   INFO    interesting messages
#   DEBUG   everything

# console messages written to screen
loglevel_console = INFO

# file messages written to logfilename
loglevel_file = DEBUG
```
in order to know the GROMACS executables it is allowed to use. Otherwise,
calls to `gmx_mpi` or `gmx_mpi_d` without MPI wrapper might lead to MPI 
warnings in output that cause GromacsWrapper to fail.

### Logging

In [3]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.INFO)

ParmEd needs to know the GROMACS topology folder, usually get this from 
envionment variable `GMXLIB`:

### Function definitions

In [152]:
def highlight_bool(s):
    """color boolean values in pandas dataframe"""
    return ['background-color: green' if v else 'background-color: red' for v in s]

In [4]:
def find_undeclared_variables(infile):
    """identify all variables evaluated in a jinja 2 template file"""
    env = jinja2.Environment()
    with open(infile) as template_file:
        parsed = env.parse(template_file.read())

    undefined = jinja2.meta.find_undeclared_variables(parsed)
    return undefined

In [5]:
def memuse():
    """Quick overview on memory usage of objects in Jupyter notebook"""
    # https://stackoverflow.com/questions/40993626/list-memory-usage-in-ipython-and-jupyter
    # These are the usual ipython objects, including this one you are creating
    ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

    # Get a sorted list of the objects and their sizes
    return sorted([(x, sys.getsizeof(globals().get(x))) for x in dir(sys.modules['__main__']) if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

### Global settings

In [51]:
# pandas settings
# https://songhuiming.github.io/pages/2017/04/02/jupyter-and-pandas-display/
pd.options.display.max_rows = 200
pd.options.display.max_columns = 16
pd.options.display.max_colwidth = 256
pd.options.display.max_colwidth = None

In [52]:
os.environ['GMXLIB'] = '/gmx_top'

In [53]:
# pmd.gromacs.GROMACS_TOPDIR = os.environ['GMXLIB']
pmd.gromacs.GROMACS_TOPDIR = '/gmx_top'

In [54]:
# prefix = '/mnt/dat/work/testuser/indenter/sandbox/20191110_packmol'
prefix = '/mnt/dat/work'

In [55]:
work_prefix = '/mnt/dat/work/tmp'

In [56]:
try:
    os.mkdir(work_prefix)
except FileExistsError as exc:
    print(exc)

[Errno 17] File exists: '/mnt/dat/work/tmp'


In [57]:
os.chdir(work_prefix)

In [58]:
# the FireWorks LaunchPad
lp = LaunchPad.auto_load() #Define the server and database
# FilePad behaves analogous to LaunchPad
fp = FilePad.auto_load()

# Fireworks

In [13]:
project = '2020-09-28-ctab-on-au-111-substrate-passivation'

In [14]:
query={'spec.metadata.project': project}

In [15]:
fw_ids = lp.get_fw_ids(query)

In [16]:
len(fw_ids)

3479

In [17]:
wf_query = {'nodes': {'$in': fw_ids}}

In [18]:
lp.workflows.count_documents(wf_query)

1

In [19]:
wf = lp.workflows.find_one(wf_query)

In [20]:
wf.keys()

dict_keys(['_id', 'links', 'parent_links', 'nodes', 'metadata', 'state', 'name', 'created_on', 'updated_on', 'fw_states'])

In [21]:
fw_ids = wf['nodes']

In [22]:
fw = lp.fireworks.find_one()

In [23]:
fw

{'_id': ObjectId('5b76cead2512e1d04c868d05'),
 'spec': {'_tasks': [{'script': ['./replicate.sh 17 10 2 111'],
    'use_shell': True,
    '_fw_name': 'ScriptTask'},
   {'script': ['./replicate.sh 26 15 2 111'],
    'use_shell': True,
    '_fw_name': 'ScriptTask'},
   {'script': ['./replicate.sh 51 30 2 111'],
    'use_shell': True,
    '_fw_name': 'ScriptTask'},
   {'script': ['./replicate.sh 60 35 2 111'],
    'use_shell': True,
    '_fw_name': 'ScriptTask'}]},
 'fw_id': 2,
 'created_on': '2018-08-17T13:32:38.908251',
 'updated_on': '2018-08-17T13:39:44.092056',
 'launches': [2],
 'state': 'COMPLETED',
 'name': 'SDS on AU 111 Substrate Replication',
 'archived_launches': []}

In [24]:
query = {'fw_id': {'$in': fw_ids}}

In [25]:
lp.fireworks.count_documents(query)

3479

In [26]:
query = {'fw_id': {'$in': fw_ids}, 'name': {'$regex':'NPT'}}

In [27]:
lp.fireworks.count_documents(query)

528

In [28]:
query = {'fw_id': {'$in': fw_ids}, 'name': {'$regex':'NPT.*mdrun'}, 'state': 'COMPLETED'}

In [29]:
lp.fireworks.count_documents(query)

38

In [30]:
fw = lp.fireworks.find_one(query)

In [31]:
fw['fw_id']

50932

In [32]:
fw['name']

'n=1944, m=1944, s=bilayer, SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:GromacsNPTEquilibrationMain:gmx_mdrun'

In [33]:
fw['state']

'COMPLETED'

In [34]:
fw['spec']['metadata']['step_specific']

{'conversion': {'lmp_type_to_element_mapping': {'11': 'Au'},
  'element_to_pdb_atom_name_mapping': {'Au': 'AU'},
  'element_to_pdb_residue_name_mapping': {'Au': 'AUM'}},
 'packing': {'surfactant_substrate': {'tolerance': 1.5,
   'constraints': {'layers': [{'bounding_box': [[-98.572, -99.281, 101.617],
       [100.226, 100.354, 126.03802096329918]],
      'lower_constraint_plane': 105.37984599543711,
      'upper_constraint_plane': 122.27517496786207},
     {'bounding_box': [[-98.572, -99.281, 126.03802096329918],
       [100.226, 100.354, 150.45904192659833]],
      'lower_constraint_plane': 129.80086695873626,
      'upper_constraint_plane': 146.69619593116124}]}}},
 'dtool_push': {'dtool_target': '/p/project/chfr13/hoermann4/dtool/DATASETS',
  'remote_dataset': {'uri': 'file://juwels01.ib.juwels.fzj.de/p/project/chfr13/hoermann4/dtool/DATASETS/2020-09-28-02-11-42-503121-n-1944-m-1944-s-bilayer-substratepassivation',
   'uuid': 'b01be676-2a01-45ae-94b3-f2eaa96aa388',
   'name': '2020-

In [35]:
fw['spec']['metadata']['step_specific']

{'conversion': {'lmp_type_to_element_mapping': {'11': 'Au'},
  'element_to_pdb_atom_name_mapping': {'Au': 'AU'},
  'element_to_pdb_residue_name_mapping': {'Au': 'AUM'}},
 'packing': {'surfactant_substrate': {'tolerance': 1.5,
   'constraints': {'layers': [{'bounding_box': [[-98.572, -99.281, 101.617],
       [100.226, 100.354, 126.03802096329918]],
      'lower_constraint_plane': 105.37984599543711,
      'upper_constraint_plane': 122.27517496786207},
     {'bounding_box': [[-98.572, -99.281, 126.03802096329918],
       [100.226, 100.354, 150.45904192659833]],
      'lower_constraint_plane': 129.80086695873626,
      'upper_constraint_plane': 146.69619593116124}]}}},
 'dtool_push': {'dtool_target': '/p/project/chfr13/hoermann4/dtool/DATASETS',
  'remote_dataset': {'uri': 'file://juwels01.ib.juwels.fzj.de/p/project/chfr13/hoermann4/dtool/DATASETS/2020-09-28-02-11-42-503121-n-1944-m-1944-s-bilayer-substratepassivation',
   'uuid': 'b01be676-2a01-45ae-94b3-f2eaa96aa388',
   'name': '2020-

# Filepad

## Overview

### Overview on recent projects in database

In [36]:
query = {'metadata.datetime': {'$gt': '2020'} }

In [37]:
fp.filepad.count_documents(query)

11975

In [38]:
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 'project': '$metadata.project' },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # pull 'project' field up in hierarchy
        "$addFields": { 
            "project": "$_id.project",
        },
    },
    {  # drop nested '_id.project'
        "$project": { 
            "_id": False 
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

# sort_aggregation
#aggregation_pipeline = [ match_aggregation, group_aggregation, set_aggregation ]
cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [c for c in cursor]
res_df = pd.DataFrame(data=res) # pandas Dataframe is just nice for printing in notebook

In [59]:
res_df

Unnamed: 0,step,earliest,latest,object_count
0,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,2020-10-13 23:31:57.836878,2020-10-13 23:31:58.351891,206
1,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-10-13 23:31:57.830704,2020-10-13 23:31:58.346478,252
2,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNVTEquilibration:push_filepad,2020-10-13 23:31:57.825805,2020-10-13 23:31:58.642317,273
3,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,2020-10-13 23:31:57.819787,2020-10-13 23:31:58.871696,483
4,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,2020-10-13 23:31:57.815064,2020-10-13 23:31:58.866987,48
5,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,2020-10-13 23:31:57.814932,2020-10-13 23:31:58.866857,48
6,SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,2020-10-13 23:31:57.814621,2020-10-13 23:31:58.866480,24
7,SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,2020-10-13 23:31:57.814415,2020-10-13 23:31:58.866236,24
8,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,2020-10-13 23:31:57.814026,2020-10-13 23:31:58.865832,176
9,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-10-13 23:31:57.808419,2020-10-13 23:31:58.860104,315


### Overview on recent production projects in database

In [42]:
query = {
    'metadata.datetime': {'$gt': '2020'},
    'metadata.mode': 'production'
}

In [43]:
fp.filepad.count_documents(query)

9563

In [44]:
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 'project': '$metadata.project' },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # pull 'project' field up in hierarchy
        "$addFields": { 
            "project": "$_id.project",
        },
    },
    {  # drop nested '_id.project'
        "$project": { 
            "_id": False 
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

# sort_aggregation
#aggregation_pipeline = [ match_aggregation, group_aggregation, set_aggregation ]
cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [c for c in cursor]
res_df = pd.DataFrame(data=res) # pandas Dataframe is just nice for printing in notebook

In [45]:
res_df

Unnamed: 0,object_count,earliest,latest,project
0,53,2020-10-24 19:50:57.364604,2020-10-24 19:50:57.367249,2020-10-24-au-111-150x150x150-fcc-substrate-cr...
1,53,2020-10-20 00:36:42.139788,2020-10-20 00:36:42.145747,2020-10-19-au-111-150x150x150-fcc-substrate-cr...
2,2815,2020-10-13 23:31:57.716886,2020-10-13 23:31:58.871696,2020-10-13-ctab-on-au-111-substrate-passivation
3,2703,2020-09-28 02:11:38.861517,2020-09-28 02:11:42.558642,2020-09-28-ctab-on-au-111-substrate-passivation
4,53,2020-09-27 18:58:47.134481,2020-09-27 18:58:47.137826,2020-09-27-au-111-fcc-substrate-creation
5,53,2020-09-26 14:45:31.896748,2020-09-26 14:45:31.898524,2020-09-26-au-111-fcc-substrate-creation-trial
6,53,2020-09-26 00:33:42.497547,2020-09-26 00:33:42.499737,2020-09-25-au-111-fcc-substrate-creation
7,2844,2020-09-14 19:40:24.649384,2020-09-14 19:40:26.998397,2020-09-14-sds-on-au-111-substrate-passivation
8,537,2020-07-29 03:50:20.375090,2020-07-29 03:50:22.884335,2020-07-29-ctab-on-au-111-indenter-passivation
9,399,2020-07-29 03:47:40.301419,2020-07-29 03:47:42.358240,2020-07-29-sds-on-au-111-indenter-passivation


### Overview on steps in project

In [46]:
#project_id = '2020-09-28-ctab-on-au-111-substrate-passivation'
project_id = '2020-10-13-ctab-on-au-111-substrate-passivation'

In [47]:
# queries to the data base are simple dictionaries
query = {
    'metadata.project': project_id,
}

In [48]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

2824

In [49]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'step': '$metadata.step',
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['step', 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [60]:
res_df

Unnamed: 0,step,earliest,latest,object_count
0,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,2020-10-13 23:31:57.836878,2020-10-13 23:31:58.351891,206
1,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-10-13 23:31:57.830704,2020-10-13 23:31:58.346478,252
2,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNVTEquilibration:push_filepad,2020-10-13 23:31:57.825805,2020-10-13 23:31:58.642317,273
3,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,2020-10-13 23:31:57.819787,2020-10-13 23:31:58.871696,483
4,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,2020-10-13 23:31:57.815064,2020-10-13 23:31:58.866987,48
5,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,2020-10-13 23:31:57.814932,2020-10-13 23:31:58.866857,48
6,SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,2020-10-13 23:31:57.814621,2020-10-13 23:31:58.866480,24
7,SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,2020-10-13 23:31:57.814415,2020-10-13 23:31:58.866236,24
8,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,2020-10-13 23:31:57.814026,2020-10-13 23:31:58.865832,176
9,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-10-13 23:31:57.808419,2020-10-13 23:31:58.860104,315


#### Pivot overview on steps and parameters in project

In [64]:
project_id = '2020-10-13-ctab-on-au-111-substrate-passivation'

In [85]:
query = {
    'metadata.project': project_id,
    'metadata.system.surfactant.nmolecules': {'$exists': True},
    'metadata.system.surfactant.aggregates.shape': {'$exists': True},
}

In [87]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

2816

In [95]:
parameters = { 
    'nmolecules': 'metadata.system.surfactant.nmolecules',
    'shape': 'metadata.system.surfactant.aggregates.shape',
}

In [97]:
distinct_parameter_values = {}
for label, key in parameters.items():
    values = fp.filepad.distinct(key, query)
    if None in values:
        values.remove(None)
    distinct_parameter_values[label] = values

In [153]:
print(distinct_parameter_values)

{'nmolecules': [81, 162, 243, 324, 405, 486, 567, 648, 729, 810, 891, 972, 1053, 1134, 1215, 1296, 1377, 1458, 1539, 1620, 1701, 1782, 1863, 1944], 'shape': ['bilayer', 'cylinders']}


#### Refined aggregation for hemicylindrical systems

In [163]:
distinct_parameter_values['shape'].remove('bilayer')

In [164]:
query = {
    'metadata.project': project_id,
    **{parameters[label]: {'$in': values} for label, values in distinct_parameter_values.items()}
}

In [165]:
print(query)

{'metadata.project': '2020-10-13-ctab-on-au-111-substrate-passivation', 'metadata.system.surfactant.nmolecules': {'$in': [81, 162, 243, 324, 405, 486, 567, 648, 729, 810, 891, 972, 1053, 1134, 1215, 1296, 1377, 1458, 1539, 1620, 1701, 1782, 1863, 1944]}, 'metadata.system.surfactant.aggregates.shape': {'$in': ['cylinders']}}


In [166]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'step': '$metadata.step',
                **{label: '${}'.format(key) for label, key in parameters.items()},
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['step', *parameters.keys(), 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [167]:
res_pivot = res_df.pivot_table(values='object_count', index=['step'], columns=list(parameters.keys()), aggfunc=pd.notna, fill_value=False)
res_pivot.style.apply(highlight_bool)

nmolecules,81,162,243,324,405,486,567,648,729,810,891,972,1053,1134,1215,1296,1377,1458,1539,1620,1701,1782,1863,1944
shape,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders,cylinders
step,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,True,True,True,True,True,True,True,True,True,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNVTEquilibration:push_filepad,True,True,True,True,True,True,True,True,True,True,False,True,True,False,False,False,False,False,False,True,False,False,False,False
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,True,True,True,True,True,True,True,True,True,True,False,True,True,False,False,False,False,False,False,False,False,False,False,False
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


#### Refined aggregation for bilayer systems

In [173]:
distinct_parameter_values['shape'].remove('cylinders')
distinct_parameter_values['shape'].append('bilayer')

In [174]:
query = {
    'metadata.project': project_id,
    **{parameters[label]: {'$in': values} for label, values in distinct_parameter_values.items()}
}

In [175]:
print(query)

{'metadata.project': '2020-10-13-ctab-on-au-111-substrate-passivation', 'metadata.system.surfactant.nmolecules': {'$in': [81, 162, 243, 324, 405, 486, 567, 648, 729, 810, 891, 972, 1053, 1134, 1215, 1296, 1377, 1458, 1539, 1620, 1701, 1782, 1863, 1944]}, 'metadata.system.surfactant.aggregates.shape': {'$in': ['bilayer']}}


In [176]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'step': '$metadata.step',
                **{label: '${}'.format(key) for label, key in parameters.items()},
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['step', *parameters.keys(), 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [177]:
res_pivot = res_df.pivot_table(values='object_count', index=['step'], columns=list(parameters.keys()), aggfunc=pd.notna, fill_value=False)
res_pivot.style.apply(highlight_bool)

nmolecules,81,162,243,324,405,486,567,648,729,810,891,972,1053,1134,1215,1296,1377,1458,1539,1620,1701,1782,1863,1944
shape,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer,bilayer
step,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
SubstratePassivation:BilayerPackingAndEquilibartion:BilayerPacking:LayeredPackingMain:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:BilayerPackingAndEquilibartion:BilayerPacking:LayeredPackingMain:push_infiles,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,False,True,True
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,True,False,False,False,False,False,False,False,True
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNVTEquilibration:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,False,False,False,False,True,False,False,True
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsRelaxation:push_filepad,True,True,True,True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True
SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
SubstratePassivation:BilayerPackingAndEquilibartion:PDBCleanup:push_filepad,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [134]:
res_df.groupby(['step', *parameters.keys()])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7feb1d814d00>

In [131]:
res_df.set_index('step').stack()

step                                                                                                                                                                 
SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad  nmolecules                            1944
                                                                                                                                                         shape                            cylinders
                                                                                                                                                         earliest        2020-10-13 23:31:58.871647
                                                                                                                                                         latest          2020-10-13 23:31:58.871696
                                                  

In [None]:
(res_df.set_index('step').stack()
 .groupby(level=[0,1])
 .value_counts()
 .unstack(level=[1,2])
 .fillna(0)
 .sort_index(axis=1))

In [129]:
res_df.groupby()

Unnamed: 0,step,nmolecules,shape,earliest,latest,object_count
0,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,1944,cylinders,2020-10-13 23:31:58.871647,2020-10-13 23:31:58.871696,21
1,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,1944,cylinders,2020-10-13 23:31:58.866984,2020-10-13 23:31:58.866987,2
2,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,1944,cylinders,2020-10-13 23:31:58.866854,2020-10-13 23:31:58.866857,2
3,SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,1944,cylinders,2020-10-13 23:31:58.866480,2020-10-13 23:31:58.866480,1
4,SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,1944,cylinders,2020-10-13 23:31:58.866236,2020-10-13 23:31:58.866236,1
...,...,...,...,...,...,...
314,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,81,bilayer,2020-10-13 23:31:57.793538,2020-10-13 23:31:57.793541,2
315,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,81,bilayer,2020-10-13 23:31:57.793400,2020-10-13 23:31:57.793403,2
316,SubstratePassivation:BilayerPackingAndEquilibartion:PDBCleanup:push_filepad,81,bilayer,2020-10-13 23:31:57.793073,2020-10-13 23:31:57.793073,1
317,SubstratePassivation:BilayerPackingAndEquilibartion:BilayerPacking:LayeredPackingMain:push_filepad,81,bilayer,2020-10-13 23:31:57.792866,2020-10-13 23:31:57.792866,1


In [123]:
parameters.keys()

dict_keys(['nmolecules', 'shape'])

In [127]:
res_df.set_index(list(parameters.keys()))

Unnamed: 0_level_0,Unnamed: 1_level_0,step,earliest,latest,object_count
nmolecules,shape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1944,cylinders,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,2020-10-13 23:31:58.871647,2020-10-13 23:31:58.871696,21
1944,cylinders,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,2020-10-13 23:31:58.866984,2020-10-13 23:31:58.866987,2
1944,cylinders,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,2020-10-13 23:31:58.866854,2020-10-13 23:31:58.866857,2
1944,cylinders,SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,2020-10-13 23:31:58.866480,2020-10-13 23:31:58.866480,1
1944,cylinders,SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,2020-10-13 23:31:58.866236,2020-10-13 23:31:58.866236,1
...,...,...,...,...,...
81,bilayer,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,2020-10-13 23:31:57.793538,2020-10-13 23:31:57.793541,2
81,bilayer,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,2020-10-13 23:31:57.793400,2020-10-13 23:31:57.793403,2
81,bilayer,SubstratePassivation:BilayerPackingAndEquilibartion:PDBCleanup:push_filepad,2020-10-13 23:31:57.793073,2020-10-13 23:31:57.793073,1
81,bilayer,SubstratePassivation:BilayerPackingAndEquilibartion:BilayerPacking:LayeredPackingMain:push_filepad,2020-10-13 23:31:57.792866,2020-10-13 23:31:57.792866,1


In [None]:
pd.MultiIndex.from_frame()

In [124]:

res_df.pivot(index='step', columns='shape', values='object_count')

ValueError: Index contains duplicate entries, cannot reshape

In [112]:
res_df.multiply(*parameters.keys())

Unnamed: 0_level_0,step,shape,earliest,latest,object_count
nmolecules,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1944,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsEnergyMinimizationAfterSolvation:push_filepad,cylinders,2020-10-13 23:31:58.871647,2020-10-13 23:31:58.871696,21
1944,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,cylinders,2020-10-13 23:31:58.866984,2020-10-13 23:31:58.866987,2
1944,SubstratePassivation:CylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,cylinders,2020-10-13 23:31:58.866854,2020-10-13 23:31:58.866857,2
1944,SubstratePassivation:CylindricalPackingAndEquilibartion:PDBCleanup:push_filepad,cylinders,2020-10-13 23:31:58.866480,2020-10-13 23:31:58.866480,1
1944,SubstratePassivation:CylindricalPackingAndEquilibartion:CylindricalPacking:CylindricalPackingMain:push_filepad,cylinders,2020-10-13 23:31:58.866236,2020-10-13 23:31:58.866236,1
...,...,...,...,...,...
81,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsSolvate:push_filepad,bilayer,2020-10-13 23:31:57.793538,2020-10-13 23:31:57.793541,2
81,SubstratePassivation:BilayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsPrep:push_filepad,bilayer,2020-10-13 23:31:57.793400,2020-10-13 23:31:57.793403,2
81,SubstratePassivation:BilayerPackingAndEquilibartion:PDBCleanup:push_filepad,bilayer,2020-10-13 23:31:57.793073,2020-10-13 23:31:57.793073,1
81,SubstratePassivation:BilayerPackingAndEquilibartion:BilayerPacking:LayeredPackingMain:push_filepad,bilayer,2020-10-13 23:31:57.792866,2020-10-13 23:31:57.792866,1


### Overview on objects in project

In [112]:
# queries to the data base are simple dictionaries
query = {
    'metadata.project': project_id,
}

In [113]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

2854

In [116]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'type': '$metadata.type',
                'name': '$metadata.name',
                #'step': '$metadata.step',
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['type', 'name', 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [117]:
res_df

Unnamed: 0,type,name,earliest,latest,object_count
0,run_file,,2020-09-14 19:40:24.772796,2020-09-14 19:40:25.498132,14
1,compressed_traj_file,,2020-09-14 19:40:24.772789,2020-09-14 19:40:25.498125,15
2,checkpoint_file,,2020-09-14 19:40:24.772777,2020-09-14 19:40:25.543747,16
3,index_file,,2020-09-14 19:40:24.750985,2020-09-14 19:40:26.964702,80
4,mp4_file,,2020-09-14 19:40:24.740566,2020-09-14 19:40:26.998397,116
5,surfactant_tail_rmsd,,2020-09-14 19:40:24.740563,2020-09-14 19:40:26.998395,118
6,surfactant_head_rmsd,,2020-09-14 19:40:24.740561,2020-09-14 19:40:26.998392,120
7,substrate_rmsd,,2020-09-14 19:40:24.740559,2020-09-14 19:40:26.998390,122
8,counterion_rmsd,,2020-09-14 19:40:24.740556,2020-09-14 19:40:26.998388,123
9,surfactant_tail_surfactant_tail_rdf,,2020-09-14 19:40:24.740554,2020-09-14 19:40:26.998385,123


### Overview on images by distinct steps

In [99]:
query = {
    'metadata.project': project_id,
    'metadata.type': 'png_file',
}

In [100]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

1

In [101]:
# check files degenerate by 'metadata.type' ad 'metadata.name'

aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'type': '$metadata.type',
                'name': '$metadata.name',
                'step': '$metadata.step',
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['step', 'type', 'name', 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [102]:
res_df

Unnamed: 0,step,type,name,earliest,latest,object_count
0,SubstratePassivation:ComponentMeasures:SurfactantMoleculeMeasures:push_filepad,png_file,,2020-09-14 19:40:24.728604,2020-09-14 19:40:24.728604,1


In [103]:
res_df["step"][0]

'SubstratePassivation:ComponentMeasures:SurfactantMoleculeMeasures:push_filepad'

### Overview on objects in specific step

In [126]:
# queries to the data base are simple dictionaries
query = {
    'metadata.project': project_id,
    'metadata.step': {'$regex': 'GromacsNPTEquilibration:push_filepad'}
}

In [127]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

583

In [128]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'type': '$metadata.type',
                'name': '$metadata.name',
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['type', 'name', 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [129]:
res_df

Unnamed: 0,type,name,earliest,latest,object_count
0,mp4_file,,2020-09-14 19:40:24.761392,2020-09-14 19:40:26.794411,28
1,surfactant_tail_rmsd,,2020-09-14 19:40:24.761389,2020-09-14 19:40:26.794409,28
2,surfactant_head_rmsd,,2020-09-14 19:40:24.761387,2020-09-14 19:40:26.794407,28
3,substrate_rmsd,,2020-09-14 19:40:24.761385,2020-09-14 19:40:26.794404,28
4,counterion_rmsd,,2020-09-14 19:40:24.761383,2020-09-14 19:40:26.794402,28
5,surfactant_tail_surfactant_tail_rdf,,2020-09-14 19:40:24.761380,2020-09-14 19:40:26.794400,28
6,surfactant_head_surfactant_tail_rdf,,2020-09-14 19:40:24.761378,2020-09-14 19:40:26.794397,28
7,surfactant_head_surfactant_head_rdf,,2020-09-14 19:40:24.761376,2020-09-14 19:40:26.794395,29
8,substrate_surfactant_tail_rdf,,2020-09-14 19:40:24.761374,2020-09-14 19:40:26.794393,29
9,substrate_surfactant_head_rdf,,2020-09-14 19:40:24.761372,2020-09-14 19:40:26.794391,29


### Overview on specific objects in specific steps

In [131]:
# queries to the data base are simple dictionaries
query = {
    'metadata.project': project_id,
    'metadata.step': {'$regex': 'GromacsNPTEquilibration:push_filepad'},
    'metadata.type': 'log_file',
}

In [132]:
# use underlying MongoDB functionality to check total number of documents matching query
fp.filepad.count_documents(query)

31

In [133]:
# check files degenerate by 'metadata.type' ad 'metadata.name'
aggregation_pipeline = [
    {
        "$match": query
    },
    {  # group by unique project id
        "$group": { 
            "_id": { 
                'step': '$metadata.step',
            },
            "object_count": {"$sum": 1}, # count matching data sets
            "earliest":  {'$min': '$metadata.datetime' },
            "latest":  {'$max': '$metadata.datetime' },
        },
    },
    {  # sort by earliest date, descending
        "$sort": { 
            "earliest": pymongo.DESCENDING,
        }
    }
]

cursor = fp.filepad.aggregate(aggregation_pipeline)

res = [ {**c['_id'], **c} for c in cursor]
columns = ['step', 'earliest', 'latest', 'object_count', '_id']
res_df = pd.DataFrame(data=res, columns=columns) # pandas Dataframe is just nice for printing in notebook
del res_df["_id"]

In [134]:
res_df

Unnamed: 0,step,earliest,latest,object_count
0,SubstratePassivation:HemicylindricalPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-09-14 19:40:24.805409,2020-09-14 19:40:25.934700,10
1,SubstratePassivation:MonolayerPackingAndEquilibartion:GromacsMinimizationEquilibrationRelaxation:GromacsNPTEquilibration:push_filepad,2020-09-14 19:40:24.761346,2020-09-14 19:40:26.794365,21


### Inspect specific file

In [135]:
metadata = fp.filepad.find_one(query)

In [137]:
metadata.keys()

dict_keys(['_id', 'identifier', 'original_file_name', 'original_file_path', 'metadata', 'compressed', 'gfs_id'])

In [142]:
metadata['gfs_id']

'5f603fd8cd1d380ee68ea86a'

In [149]:
content, doc = fp.get_file_by_id(metadata['gfs_id'])

In [155]:
print(content.decode())

                      :-) GROMACS - gmx mdrun, 2019.3 (-:

                            GROMACS is written by:
     Emile Apol      Rossen Apostolov      Paul Bauer     Herman J.C. Berendsen
    Par Bjelkmar      Christian Blau   Viacheslav Bolnykh     Kevin Boyd    
 Aldert van Buuren   Rudi van Drunen     Anton Feenstra       Alan Gray     
  Gerrit Groenhof     Anca Hamuraru    Vincent Hindriksen  M. Eric Irrgang  
  Aleksei Iupinov   Christoph Junghans     Joe Jordan     Dimitrios Karkoulis
    Peter Kasson        Jiri Kraus      Carsten Kutzner      Per Larsson    
  Justin A. Lemkul    Viveca Lindahl    Magnus Lundborg     Erik Marklund   
    Pascal Merz     Pieter Meulenhoff    Teemu Murtola       Szilard Pall   
    Sander Pronk      Roland Schulz      Michael Shirts    Alexey Shvetsov  
   Alfons Sijbers     Peter Tieleman      Jon Vincent      Teemu Virolainen 
 Christian Wennberg    Maarten Wolf   
                           and the project leaders:
        Mark Abraham, Ber