## Import the data

This workbook relies on importing data as follows:

```
./epmt -v submit ./sample/query/18431
./epmt -v submit ./sample/query/30385
```

In [1]:
# import the query api module
import epmt_query as eq

{'host': 'localhost', 'password': 'example', 'user': 'postgres', 'dbname': 'EPMT', 'provider': 'postgres'}


## Basic Queries

The API has only a few queries: `get_jobs`, `get_procs` and `get_thread_metrics`.

Each of these operates at a distinct level: job, process and thread.

### Job Query

In [4]:
# Start with an unfiltered job query.
# fmt='terse' is chosen on purpose so that only the list of job ids is returned.
eq.get_jobs(fmt='terse')

[u'18431', u'30385', u'32046']

In [5]:
# The list of job ids above is not very informative on its own.
# Request a pandas dataframe instead: each row holds the fields of one job
# (these are job records, not process records).
eq.get_jobs(jobids = ['18431', '30385'], fmt='pandas')

Unnamed: 0,account,cpu_time,duration,end,env_changes_dict,exitcode,info_dict,jobid,jobname,jobscriptname,ppr,queue,sessionid,start,submit,tags,updated_at,user
0,,27058683,45905753.0,2019-06-04 20:10:26.838998,{},0,[],18431,18431,18431,,,,2019-06-04 20:09:40.933245,,{},2019-06-04 14:41:22.192377,tushar
1,,63523,97869.0,2019-06-04 20:14:24.878195,{},0,[],30385,30385,30385,,,,2019-06-04 20:14:24.780326,,{},2019-06-04 14:44:53.964737,tushar


In [6]:
# If you prefer plain python lists and dictionaries, set fmt='dict',
# or simply omit fmt, since 'dict' is the default.
# The call below returns a list of two dictionaries, one per job.
eq.get_jobs(jobids = ['18431', '30385'])

[{'account': None,
  'cpu_time': 27058683,
  'duration': 45905753.0,
  'end': datetime.datetime(2019, 6, 4, 20, 10, 26, 838998),
  'env_changes_dict': {},
  'exitcode': 0,
  'info_dict': [],
  'jobid': u'18431',
  'jobname': u'18431',
  'jobscriptname': u'18431',
  'ppr': None,
  'queue': None,
  'sessionid': None,
  'start': datetime.datetime(2019, 6, 4, 20, 9, 40, 933245),
  'submit': None,
  'tags': {},
  'updated_at': datetime.datetime(2019, 6, 4, 14, 41, 22, 192377),
  'user': u'tushar'},
 {'account': None,
  'cpu_time': 63523,
  'duration': 97869.0,
  'end': datetime.datetime(2019, 6, 4, 20, 14, 24, 878195),
  'env_changes_dict': {},
  'exitcode': 0,
  'info_dict': [],
  'jobid': u'30385',
  'jobname': u'30385',
  'jobscriptname': u'30385',
  'ppr': None,
  'queue': None,
  'sessionid': None,
  'start': datetime.datetime(2019, 6, 4, 20, 14, 24, 780326),
  'submit': None,
  'tags': {},
  'updated_at': datetime.datetime(2019, 6, 4, 14, 44, 53, 964737),
  'user': u'tushar'}]

### Process Query

In [8]:
# Fetch the processes that belong to a job.
# Each row of the resulting pandas dataframe describes one process of the job.
# As before, fmt='terse' would return only the list of database ids of the processes.
eq.get_procs(['18431'], fmt='pandas')

Unnamed: 0,PERF_COUNT_SW_CPU_CLOCK,args,cancelled_write_bytes,delayacct_blkio_time,duration,end,exclusive_cpu_time,exename,exitcode,gen,...,time_oncpu,time_waiting,timeslices,updated_at,user,user+system,usertime,vol_ctxsw,wchar,write_bytes
0,2595549,./test-process-tree.sh,0,0,45848263.0,2019-06-04 14:40:26.835416,8940,bash,0,0,...,8940761,96932,6,2019-06-04 14:41:41.103896,tushar,8940,4470,5,0,0
1,1046856,./test-process-tree.sh,0,0,35373741.0,2019-06-04 14:40:16.821898,3947,bash,0,0,...,3947356,4787,3,2019-06-04 14:41:59.645593,tushar,3947,3947,2,0,0
2,1208717443,/etc -exec stat {} ;,0,0,35350474.0,2019-06-04 14:40:16.807963,1291036,find,1,0,...,1291037628,3901716,3652,2019-06-04 14:41:24.858969,tushar,1291036,149926,3614,246,0
3,894723,/etc/ssl/certs/8b59b1ad.0,0,0,901.0,2019-06-04 14:39:44.848089,6790,stat,0,0,...,6790409,0,1,2019-06-04 14:41:22.483269,tushar,6790,3395,0,0,0
4,839139,/etc/speech-dispatcher/modules/epos-generic.conf,0,0,845.0,2019-06-04 14:40:16.337848,8589,stat,0,0,...,8589298,59372,3,2019-06-04 14:41:22.497194,tushar,8589,5726,0,0,0
5,667088,/etc/ssl/certs/455f1b52.0,0,0,672.0,2019-06-04 14:39:46.602664,4080,stat,0,0,...,4080777,0,1,2019-06-04 14:41:22.509718,tushar,4080,0,0,0,0
6,599278,/etc/alternatives/LOCK.7.gz,0,0,604.0,2019-06-04 14:40:06.241266,7440,stat,0,0,...,7441143,0,1,2019-06-04 14:41:22.522131,tushar,7440,3720,0,0,0
7,564519,/etc/glusterfs,0,0,569.0,2019-06-04 14:39:51.551017,8175,stat,0,0,...,8176194,0,1,2019-06-04 14:41:22.533441,tushar,8175,5450,0,0,0
8,955785,/etc/fonts/conf.d/40-nonlatin.conf,0,0,963.0,2019-06-04 14:39:53.886599,8658,stat,0,0,...,8658841,30614,1,2019-06-04 14:41:22.544705,tushar,8658,5772,0,0,0
9,1018917,/etc/brltty/Input/mm/common.kti,0,0,1027.0,2019-06-04 14:39:58.203046,9202,stat,0,0,...,9202984,0,1,2019-06-04 14:41:22.555840,tushar,9202,9202,0,0,0


In [9]:
# Processes can also be selected by tags: pass a dict of tag key/value pairs.
# With fmt='terse' only the database ids of the matching processes are returned.
eq.get_procs(tags = {'app':'w', 'phase': 'load'}, fmt='terse')

[3619, 3622]

In [10]:
# Using fmt='pandas' (or omitting fmt) returns the process metadata and metric sums
# rather than just the ids. Each row of the dataframe below is a single process.
# Note that thread-level metrics (such as usertime, systemtime) are already
# aggregated and available as columns.
eq.get_procs(tags = {'app':'w', 'phase': 'load'}, fmt='pandas')

Unnamed: 0,PERF_COUNT_SW_CPU_CLOCK,args,cancelled_write_bytes,delayacct_blkio_time,duration,end,exclusive_cpu_time,exename,exitcode,gen,...,time_oncpu,time_waiting,timeslices,updated_at,user,user+system,usertime,vol_ctxsw,wchar,write_bytes
0,264920,load,0,0,8583.0,2019-06-04 14:44:24.867268,9898,grep,0,0,...,9898934,0,2,2019-06-04 14:44:54.233242,tushar,9898,4949,1,71,0
1,7906450,,0,0,7911.0,2019-06-04 14:44:24.867006,17448,w.procps,0,0,...,17449098,0,1,2019-06-04 14:44:54.266024,tushar,17448,13086,0,0,0


### Thread Query

In [11]:
# Fetch the thread metrics for the two processes found above,
# identified by their database ids.
eq.get_thread_metrics([3619, 3622])

Unnamed: 0,tid,start,end,usertime,systemtime,rssmax,minflt,majflt,inblock,outblock,...,syscr,syscw,read_bytes,write_bytes,cancelled_write_bytes,time_oncpu,time_waiting,timeslices,rdtsc_duration,PERF_COUNT_SW_CPU_CLOCK
0,27608,1559659464858685,1559659464867268,4949,4949,5172,596,0,6,0,...,72,1,3328,0,0,9898934,0,2,22253488,264920
0,27607,1559659464859095,1559659464867006,13086,4362,6472,765,0,6,0,...,696,0,3328,0,0,17449098,0,1,20512108,7906450


## Getting familiar with useful metrics and keys

`get_jobs` and `get_procs` take `fltr` and `order` options that
filter and sort the output based on schema columns.

In [12]:
# Keep only those processes of the job whose wallclock time (duration) exceeds
# a threshold, then sort them by exclusive cpu time (user+system), highest first.
# duration is in microseconds, so the threshold of 100000 corresponds to 0.1 s.
# fltr can be a lambda function or a string.
eq.get_procs('18431', fltr = lambda p: p.duration > 100000, order = 'desc(p.exclusive_cpu_time)', fmt='pandas')

Unnamed: 0,PERF_COUNT_SW_CPU_CLOCK,args,cancelled_write_bytes,delayacct_blkio_time,duration,end,exclusive_cpu_time,exename,exitcode,gen,...,time_oncpu,time_waiting,timeslices,updated_at,user,user+system,usertime,vol_ctxsw,wchar,write_bytes
0,1208717443,/etc -exec stat {} ;,0,0,35350474.0,2019-06-04 14:40:16.807963,1291036,find,1,0,...,1291037628,3901716,3652,2019-06-04 14:41:24.858969,tushar,1291036,149926,3614,246,0
1,440452151,/usr,0,0,443123.0,2019-06-04 14:39:41.444421,453101,find,0,0,...,453102131,2715435,50,2019-06-04 14:41:32.842824,tushar,453101,194754,0,13661217,0
2,140609,10,0,0,10000270.0,2019-06-04 14:40:26.834114,10896,sleep,0,0,...,10896060,15084,2,2019-06-04 14:41:57.092744,tushar,10896,3632,1,0,0
3,2595549,./test-process-tree.sh,0,0,45848263.0,2019-06-04 14:40:26.835416,8940,bash,0,0,...,8940761,96932,6,2019-06-04 14:41:41.103896,tushar,8940,4470,5,0,0
4,1046856,./test-process-tree.sh,0,0,35373741.0,2019-06-04 14:40:16.821898,3947,bash,0,0,...,3947356,4787,3,2019-06-04 14:41:59.645593,tushar,3947,3947,2,0,0


### Useful metrics and keys

Below are some of the most useful keys in no particular order:

#### Job Keys
 - duration: this is the wallclock time in microseconds
 - cpu_time: user+system time aggregated across all processes of the job
 - start:    start time (returned as a `datetime` in the job output above)
 - end:      end time (returned as a `datetime` in the job output above)
 - jobid:    database id for job (unique)
 - exitcode: return code from job
 - tags:     dict of key/value pairs
 - processes:list of processes belonging to job
 
 #### Process Keys
 - duration: this is the wallclock time in microseconds
 - exclusive_cpu_time: user+system time for process (aggregated across its threads)
 - inclusive_cpu_time: user+system time for the process and *all its descendants*
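 - start:    start time (returned as a `datetime` in the process dict output; the thread dataframe reports it in microseconds since epoch)
 - end:      end time (returned as a `datetime` in the process dict output; the thread dataframe reports it in microseconds since epoch)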
 - tags:     dict of key/value pairs
 - threads_df: json serialized dataframe of process threads (ADVANCED)
 - threads_sums: key/value pairs consisting of sums of thread metrics (ADVANCED)
 - numtids:  number of threads
 - exename
 - args
 - pid
 - ppid
 - id:       database ID for process
 - exitcode
 - parent
 - children
 - ancestors
 - descendants
 
 #### Thread Keys
 - usertime
 - systemtime
 - user+system
 - rssmax
 - majflt
 - read_bytes
 - write_bytes

## Case Study

Let's walk through a contrived case study to get more familiar with the API.
Along the way we will touch on some advanced topics such as the ORM. You will
also see easy ways to navigate the process tree using the ORM.

Consider the shell script below:
```
$ cat sample/query/18431-job.sh 

#!/bin/bash
export PAPIEX_TAGS="prog:dircrawl;phase:/usr"
find /usr > /dev/null 2>&1

export PAPIEX_TAGS="prog:find;phase:stat"
(find /etc -exec stat {} \; ; ls -l /) > /dev/null 2>&1
sleep 10
```

In [26]:
# Ordinarily we would first locate the job and then probe downwards.
# Either the tags or the fltr argument can be used to find a job.
# The script sets no job tags, so we simply look the job up by its job id.
# get_jobs returns a list, hence the [0] to take the single matching job.
job = eq.get_jobs(jobids = ['18431'])[0]
job

{'account': None,
 'cpu_time': 27058683,
 'duration': 45905753.0,
 'end': datetime.datetime(2019, 6, 4, 20, 10, 26, 838998),
 'env_changes_dict': {},
 'exitcode': 0,
 'info_dict': [],
 'jobid': u'18431',
 'jobname': u'18431',
 'jobscriptname': u'18431',
 'ppr': None,
 'queue': None,
 'sessionid': None,
 'start': datetime.datetime(2019, 6, 4, 20, 9, 40, 933245),
 'submit': None,
 'tags': {},
 'updated_at': datetime.datetime(2019, 6, 4, 14, 41, 22, 192377),
 'user': u'tushar'}

In [29]:
# Now get the processes that are part of this job, sorted by inclusive cpu time.
# The job id is passed in to restrict the query to that particular job.
# inclusive_cpu_time sums the cpu times of the process and all its descendants.
# In the result, right after the top-level 'bash' entries, the 'find' with
# -exec stat shows up.
procs = eq.get_procs(['18431'], order = 'desc(p.inclusive_cpu_time)', fmt='pandas')
procs

Unnamed: 0,PERF_COUNT_SW_CPU_CLOCK,args,cancelled_write_bytes,delayacct_blkio_time,duration,end,exclusive_cpu_time,exename,exitcode,gen,...,time_oncpu,time_waiting,timeslices,updated_at,user,user+system,usertime,vol_ctxsw,wchar,write_bytes
0,2595549,./test-process-tree.sh,0,0,45848263.0,2019-06-04 14:40:26.835416,8940,bash,0,0,...,8940761,96932,6,2019-06-04 14:41:41.103896,tushar,8940,4470,5,0,0
1,1046856,./test-process-tree.sh,0,0,35373741.0,2019-06-04 14:40:16.821898,3947,bash,0,0,...,3947356,4787,3,2019-06-04 14:41:59.645593,tushar,3947,3947,2,0,0
2,1208717443,/etc -exec stat {} ;,0,0,35350474.0,2019-06-04 14:40:16.807963,1291036,find,1,0,...,1291037628,3901716,3652,2019-06-04 14:41:24.858969,tushar,1291036,149926,3614,246,0
3,440452151,/usr,0,0,443123.0,2019-06-04 14:39:41.444421,453101,find,0,0,...,453102131,2715435,50,2019-06-04 14:41:32.842824,tushar,453101,194754,0,13661217,0
4,1242723,/etc/sane.d/artec.conf,0,0,1252.0,2019-06-04 14:40:04.388283,13964,stat,0,0,...,13964945,0,1,2019-06-04 14:41:29.765744,tushar,13964,6982,0,0,0
5,1001789,/etc/ld.so.conf.d/x86_64-linux-gnu_GL.conf,0,0,1009.0,2019-06-04 14:39:47.912468,13645,stat,0,0,...,13645979,0,1,2019-06-04 14:41:38.258388,tushar,13645,10234,0,0,0
6,963668,/etc/apparmor.d/abstractions/dconf,0,0,970.0,2019-06-04 14:39:51.420505,13632,stat,0,0,...,13632351,0,1,2019-06-04 14:41:45.702222,tushar,13632,6816,0,0,0
7,998211,/etc/alternatives/LISTEN.7.gz,0,0,1005.0,2019-06-04 14:40:08.320471,13608,stat,0,0,...,13608832,0,1,2019-06-04 14:41:33.997043,tushar,13608,0,0,0,0
8,864990,/etc/ppp/ip-down.d/postfix,0,0,872.0,2019-06-04 14:39:47.480549,13426,stat,0,0,...,13426061,0,1,2019-06-04 14:41:30.960891,tushar,13426,0,0,0,0
9,982789,/etc/ssl/certs/e536d871.0,0,0,990.0,2019-06-04 14:39:46.504476,13410,stat,0,0,...,13411581,32228,1,2019-06-04 14:41:53.508495,tushar,13410,6705,0,0,0


In [30]:
# Check whether a single process was responsible for spawning too many processes.
# Here fltr is given as a string expression: keep processes with more than 100 children.
eq.get_procs(['18431'], fltr = 'count(p.children) > 100')

[{u'PERF_COUNT_SW_CPU_CLOCK': 1208717443,
  'args': u'/etc -exec stat {} ;',
  u'cancelled_write_bytes': 0,
  u'delayacct_blkio_time': 0,
  'duration': 35350474.0,
  'end': datetime.datetime(2019, 6, 4, 14, 40, 16, 807963),
  'exclusive_cpu_time': 1291036,
  'exename': u'find',
  'exitcode': 1,
  'gen': 0,
  'group': None,
  u'guest_time': 0,
  'host': u'earth',
  'id': 3,
  u'inblock': 6,
  'inclusive_cpu_time': 26570768,
  u'invol_ctxsw': 37,
  'job': u'18431',
  u'majflt': 0,
  u'minflt': 56865,
  'numtids': 1,
  u'outblock': 0,
  'parent': 2,
  'path': u'/usr/bin/find',
  'pgid': 19161,
  'pid': 19166,
  'ppid': 19165,
  u'processor': 0,
  u'rchar': 43842,
  u'rdtsc_duration': 91702043696,
  u'read_bytes': 3328,
  u'rssmax': 6008,
  'sid': 18431,
  'start': datetime.datetime(2019, 6, 4, 14, 39, 41, 457489),
  u'starttime': 324648960000,
  u'syscr': 76,
  u'syscw': 20,
  u'systemtime': 1141110,
  'tags': {u'phase': u'stat', u'prog': u'find'},
  u'time_oncpu': 1291037628,
  u'time_wa

In [51]:
# Now let's walk through the process tree. The 'orm' format makes this easy.
# Sort the processes by exclusive cpu time (descending) to get a list of ORM
# objects, and keep the top 10 via the [:10] slice.
# NOTE: this rebinds `procs`, which earlier in the notebook held a pandas dataframe.
procs = eq.get_procs(['18431'], order = 'desc(p.exclusive_cpu_time)', fmt='orm')[:10]
procs

[Process[3],
 Process[929],
 Process[654],
 Process[1392],
 Process[2021],
 Process[1032],
 Process[761],
 Process[2575],
 Process[786],
 Process[2341]]

In [32]:
# Let's pick the first process in the list, i.e. the one with the highest
# exclusive cpu time given the sort order used above.
p = procs[0]
p

Process[3]

In [33]:
# executable name of the selected process
p.exename

u'find'

In [44]:
# a few attributes at a glance: executable, arguments, duration,
# number of children and number of threads
p.exename, p.args, p.duration, len(p.children), p.numtids

(u'find', u'/etc -exec stat {} ;', 35350474.0, 3610, 1)

In [40]:
# step up the process tree: the ORM object exposes the parent process directly
parent = p.parent
parent

Process[2]

In [43]:
# inspect the parent: executable, arguments, pid, and the sizes of its
# children and descendants collections
parent.exename, parent.args, parent.pid, len(parent.children), len(parent.descendants)

(u'bash', u'./test-process-tree.sh', 19165, 2, 3612)

In [46]:
# Look at p's thread sums: key/value pairs holding the sums of its thread metrics.
p.threads_sums

{u'PERF_COUNT_SW_CPU_CLOCK': 1208717443,
 u'cancelled_write_bytes': 0,
 u'delayacct_blkio_time': 0,
 u'guest_time': 0,
 u'inblock': 6,
 u'invol_ctxsw': 37,
 u'majflt': 0,
 u'minflt': 56865,
 u'outblock': 0,
 u'processor': 0,
 u'rchar': 43842,
 u'rdtsc_duration': 91702043696,
 u'read_bytes': 3328,
 u'rssmax': 6008,
 u'starttime': 324648960000,
 u'syscr': 76,
 u'syscw': 20,
 u'systemtime': 1141110,
 u'time_oncpu': 1291037628,
 u'time_waiting': 3901716,
 u'timeslices': 3652,
 u'user+system': 1291036,
 u'usertime': 149926,
 u'vol_ctxsw': 3614,
 u'wchar': 246,
 u'write_bytes': 0}

In [48]:
# Get the thread metrics dataframe for p; here the ORM object itself is passed
# instead of a list of process database ids.
eq.get_thread_metrics(p)

Unnamed: 0,tid,start,end,usertime,systemtime,rssmax,minflt,majflt,inblock,outblock,...,syscr,syscw,read_bytes,write_bytes,cancelled_write_bytes,time_oncpu,time_waiting,timeslices,rdtsc_duration,PERF_COUNT_SW_CPU_CLOCK
0,19166,1559659181457489,1559659216807963,149926,1141110,6008,56865,0,6,0,...,76,20,3328,0,0,1291037628,3901716,3652,91702043696,1208717443
