# Setup 

This notebook requires the following job(s) to be present in the database. 
```
615503
```

If you haven't done so yet, use **epmt submit** from the shell to load the job archive into the database of your choice. See **README.md**:

```
$ epmt -v submit sample/615503.tgz
INFO:epmt_cmds:submit_to_db(sample/615503.tgz,*-papiex-[0-9]*-[0-9]*.csv,False)
INFO:epmt_cmds:./job_metadata is 6772 bytes in archive
INFO:epmt_cmds:1 files to submit
INFO:epmt_cmds:1 hosts found: ['pp053-collated-']
INFO:epmt_cmds:host pp053-collated-: 1 files to import
INFO:epmt_job:Binding to DB: {'filename': 'database.sqlite', 'create_db': True, 'provider': 'sqlite'}
INFO:epmt_job:Generating mapping from schema...
INFO:epmt_job:job_tags: {'atm_res': 'c96l49', 'exp_component': 'atmos', 'exp_name': 'ESM4_historical_D151', 'script_name': 'ESM4_historical_D151_atmos_18540101', 'ocn_res': '0.5l75', 'exp_time': '18540101'}
INFO:epmt_job:Processing job id 615503
INFO:epmt_job:Creating user Jeffrey.Durachta
INFO:epmt_job:Creating job 615503
INFO:epmt_job:Creating host pp053
INFO:epmt_job:Did 1000 (37734 in file)...236.49/sec
INFO:epmt_job:Did 2000 (37734 in file)...279.28/sec
INFO:epmt_job:Did 3000 (37734 in file)...293.66/sec
```

In [2]:
# Import the needed modules
import pandas as pd
import epmt_query as eq
import epmt_outliers as eod

{'host': 'localhost', 'password': 'example', 'user': 'postgres', 'dbname': 'EPMT', 'provider': 'postgres'}


In [2]:
# Load every job in the database as a DataFrame and list the metrics
# (columns) that are available for each of them.
jobs = eq.get_jobs(fmt='pandas')
print(jobs.columns.values)

# Show just the headline metrics rather than the full, very wide frame.
display(jobs.loc[:, ['jobid', 'duration', 'cpu_time', 'num_procs']])

# Run outlier detection over the jobs; the result displayed below holds one
# boolean per job and feature.
newdf = eod.detect_outlier_jobs(jobs)
print("Outlier jobs")
display(newdf)

[u'PERF_COUNT_SW_CPU_CLOCK' 'account' u'all_proc_tags'
 u'cancelled_write_bytes' 'cpu_time' u'delayacct_blkio_time' 'duration'
 'end' 'env_changes_dict' 'env_dict' 'exitcode' u'guest_time' u'inblock'
 'info_dict' u'invol_ctxsw' 'jobid' 'jobname' 'jobscriptname' u'majflt'
 u'minflt' u'num_procs' u'num_threads' u'outblock' 'ppr' u'processor'
 'queue' u'rchar' u'rdtsc_duration' u'read_bytes' u'rssmax' 'sessionid'
 'start' 'submit' u'syscr' u'syscw' u'systemtime' 'tags' u'time_oncpu'
 u'time_waiting' u'timeslices' 'updated_at' 'user' u'user+system'
 u'usertime' u'vol_ctxsw' u'wchar' u'write_bytes']


Unnamed: 0,jobid,duration,cpu_time,num_procs
0,615503,2266020000.0,1327237000.0,35483
1,kernel-build-20190610-081150,232336400.0,591514700.0,10601


Outlier jobs


Unnamed: 0,jobid,duration,cpu_time,num_procs
0,615503,False,False,False
1,kernel-build-20190610-081150,False,False,False


In [4]:
# Re-fetch the jobs in 'terse' format: a plain list of job ids, which is
# what gets passed as `jobs` to the aggregation call below.
# NOTE: this rebinds `jobs` (a DataFrame in the previous cell) to a list.
jobs = eq.get_jobs(fmt='terse')
print('jobs: ' + str(jobs))

# Collect the distinct process tags, excluding the per-instance bookkeeping
# keys. fold=False keeps the result as a list of tag dicts, the form that
# agg_metrics_by_tags is given below.
tags = eq.get_unique_process_tags(exclude=['op_instance', 'op_sequence', 'operation_count', 'instance'], fold=False)
print('tags:' + str(tags))
# Blank separator line. A bare `print` only emits one on Python 2 (on
# Python 3 it is a silent no-op); print('') behaves the same on both.
print('')

# Aggregate the metrics per (job, tag) pair into a DataFrame.
ops = eq.agg_metrics_by_tags(jobs=jobs, tags=tags, fmt='pandas')
print(ops.columns.values)

display(ops[['job', 'tags', 'duration', 'exclusive_cpu_time', 'num_procs']])


# Run outlier detection over the aggregated operations; the result displayed
# below holds one boolean per operation and feature.
newdfo = eod.detect_outlier_ops(ops)
print("Outlier operations")
display(newdfo)

jobs: [u'615503', u'kernel-build-20190610-081150']
tags:[{u'op': u'cp'}, {u'op': u'dmput'}, {u'op': u'fregrid'}, {u'op': u'hsmget'}, {u'op': u'mv'}, {u'op': u'ncatted'}, {u'op': u'ncks'}, {u'op': u'ncrcat'}, {u'op': u'plevel'}, {u'op': u'rm'}, {u'op': u'splitvars'}, {u'op': u'timavg'}, {u'op': u'untar'}, {u'operation': u'build'}, {u'operation': u'configure'}, {u'operation': u'download'}, {u'operation': u'extract'}]

[u'PERF_COUNT_SW_CPU_CLOCK' u'cancelled_write_bytes'
 u'delayacct_blkio_time' 'duration' 'exclusive_cpu_time' u'guest_time'
 u'inblock' u'invol_ctxsw' 'job' u'majflt' u'minflt' 'num_procs'
 'num_tids' u'outblock' u'processor' u'rchar' u'rdtsc_duration'
 u'read_bytes' u'rssmax' u'syscr' u'syscw' u'systemtime' 'tags'
 u'time_oncpu' u'time_waiting' u'timeslices' u'user+system' u'usertime'
 u'vol_ctxsw' u'wchar' u'write_bytes']


Unnamed: 0,job,tags,duration,exclusive_cpu_time,num_procs
0,615503,{u'op': u'cp'},122453800.0,40980648.0,3902
1,615503,{u'op': u'dmput'},2286065000.0,12167869.0,272
2,615503,{u'op': u'fregrid'},399781300.0,396146734.0,38
3,615503,{u'op': u'hsmget'},2863656000.0,270022419.0,15948
4,615503,{u'op': u'mv'},1142269000.0,129375864.0,3393
5,615503,{u'op': u'ncatted'},33003460.0,33910501.0,3210
6,615503,{u'op': u'ncks'},51725280.0,45118505.0,720
7,615503,{u'op': u'ncrcat'},107993500.0,93014605.0,282
8,615503,{u'op': u'plevel'},208466600.0,105212285.0,2592
9,615503,{u'op': u'rm'},59274200.0,27965861.0,2772


Outlier operations


Unnamed: 0,jobid,tags,duration,exclusive_cpu_time,num_procs
0,615503,{u'op': u'cp'},False,False,False
1,615503,{u'op': u'dmput'},False,False,False
2,615503,{u'op': u'fregrid'},False,False,False
3,615503,{u'op': u'hsmget'},False,False,True
4,615503,{u'op': u'mv'},False,False,False
5,615503,{u'op': u'ncatted'},False,False,False
6,615503,{u'op': u'ncks'},False,False,False
7,615503,{u'op': u'ncrcat'},False,False,False
8,615503,{u'op': u'plevel'},False,False,False
9,615503,{u'op': u'rm'},False,False,False


In [16]:
# Fetch every process of the selected jobs as a DataFrame, then report the
# available columns and the number of processes with a non-null exename.
procs = eq.get_procs(jobs, fmt='pandas')
print(procs.columns.values)
print(procs['exename'].count())


# Run outlier detection over the individual processes.
olps = eod.detect_outlier_processes(procs)

# Show the same columns for the raw metrics and for the outlier flags so the
# two tables can be read side by side; only the last 20 rows are shown.
proc_cols = ['id', 'exename', 'tags', 'duration', 'exclusive_cpu_time']
display(procs.loc[:, proc_cols].tail(20))
olps.loc[:, proc_cols].tail(20)


[u'PERF_COUNT_SW_CPU_CLOCK' 'args' u'cancelled_write_bytes'
 u'delayacct_blkio_time' 'duration' 'end' 'exclusive_cpu_time' 'exename'
 'exitcode' 'gen' 'group' u'guest_time' 'host' 'id' u'inblock'
 'inclusive_cpu_time' u'invol_ctxsw' 'job' u'majflt' u'minflt' 'numtids'
 u'outblock' 'parent' 'path' 'pgid' 'pid' 'ppid' u'processor' u'rchar'
 u'rdtsc_duration' u'read_bytes' u'rssmax' 'sid' 'start' u'syscr' u'syscw'
 u'systemtime' 'tags' u'time_oncpu' u'time_waiting' u'timeslices'
 'updated_at' 'user' u'user+system' u'usertime' u'vol_ctxsw' u'wchar'
 u'write_bytes']
46084
('duration', array([   13,    71,    80, ..., 46081, 46082, 46083]))
('exclusive_cpu_time', array([   19,    71,    80, ..., 46074, 46081, 46082]))


Unnamed: 0,id,exename,tags,duration,exclusive_cpu_time
46064,35503,cat,"{u'instance': u'1', u'operation': u'configure'...",203.0,9616.0
46065,35502,bash,"{u'instance': u'1', u'operation': u'configure'...",926370.0,55178.0
46066,35501,make,"{u'instance': u'1', u'operation': u'configure'...",3926996.0,11593.0
46067,35500,make,"{u'instance': u'1', u'operation': u'configure'...",5645335.0,21705.0
46068,35499,dash,"{u'instance': u'1', u'operation': u'configure'...",11514012.0,11231.0
46069,35498,make,"{u'instance': u'1', u'operation': u'configure'...",11527394.0,10558.0
46070,35497,make,"{u'instance': u'1', u'operation': u'configure'...",11540720.0,12886.0
46071,35496,make,"{u'instance': u'1', u'operation': u'configure'...",13668007.0,28996.0
46072,35495,bash,"{u'instance': u'1', u'operation': u'build', u'...",134.0,3711.0
46073,35494,bash,"{u'instance': u'1', u'operation': u'build', u'...",12355.0,4073.0


Unnamed: 0,id,exename,tags,duration,exclusive_cpu_time
46064,35503,cat,"{u'instance': u'1', u'operation': u'configure'...",False,False
46065,35502,bash,"{u'instance': u'1', u'operation': u'configure'...",True,False
46066,35501,make,"{u'instance': u'1', u'operation': u'configure'...",True,False
46067,35500,make,"{u'instance': u'1', u'operation': u'configure'...",True,False
46068,35499,dash,"{u'instance': u'1', u'operation': u'configure'...",True,False
46069,35498,make,"{u'instance': u'1', u'operation': u'configure'...",True,False
46070,35497,make,"{u'instance': u'1', u'operation': u'configure'...",True,False
46071,35496,make,"{u'instance': u'1', u'operation': u'configure'...",True,False
46072,35495,bash,"{u'instance': u'1', u'operation': u'build', u'...",False,False
46073,35494,bash,"{u'instance': u'1', u'operation': u'build', u'...",False,False


## Identifying outlier jobs using aggregates based on tags

This example walks through identifying job outliers by aggregating metrics
based on tags. For this experiment you should import the following jobs:

- 633109
- 625142
- 627902
- 629314

In [3]:
# Outlier comparison is only meaningful between jobs that share BOTH the
# same exp_name and the same exp_component, so start by selecting such a
# group of jobs.

# Ask for the ids of the jobs carrying the exp_component and exp_name
# given in the tag string below:
eq.get_jobs(
    tags='exp_component:ice_1x1deg;exp_name:ESM4_historical_D151',
    fmt='terse',
)

[u'633109', u'625142', u'627902', u'629314']

In [4]:
# Strictly speaking this study does not need the set of unique process
# tags across these jobs, but computing it gives a feel for the data.
# fold=True compacts the output: one entry per tag key, listing its values.
tags = eq.get_unique_process_tags(
    [u'633109', u'625142', u'627902', u'629314'],
    fold=True,
)
tags

{u'op': [u'ncatted',
  u'splitvars',
  u'fregrid',
  u'hsmput',
  u'untar',
  u'mv',
  u'dmput',
  u'hsmget',
  u'ncrcat',
  u'timavg',
  u'rm',
  u'cp'],
 u'op_instance': [u'11',
  u'10',
  u'13',
  u'12',
  u'15',
  u'14',
  u'16',
  u'19',
  u'18',
  u'20',
  u'1',
  u'3',
  u'2',
  u'5',
  u'4',
  u'7',
  u'6',
  u'9',
  u'8'],
 u'op_sequence': [u'216',
  u'217',
  u'214',
  u'215',
  u'212',
  u'213',
  u'210',
  u'211',
  u'218',
  u'219',
  u'133',
  u'132',
  u'131',
  u'130',
  u'137',
  u'136',
  u'135',
  u'134',
  u'139',
  u'138',
  u'166',
  u'24',
  u'25',
  u'26',
  u'92',
  u'20',
  u'21',
  u'22',
  u'23',
  u'160',
  u'28',
  u'29',
  u'94',
  u'4',
  u'8',
  u'163',
  u'13',
  u'120',
  u'121',
  u'122',
  u'123',
  u'124',
  u'125',
  u'126',
  u'127',
  u'128',
  u'129',
  u'91',
  u'59',
  u'58',
  u'55',
  u'54',
  u'57',
  u'56',
  u'51',
  u'50',
  u'53',
  u'52',
  u'90',
  u'201',
  u'199',
  u'179',
  u'147',
  u'195',
  u'194',
  u'197',
  u'178',
  u'191'

In [5]:
# The folded form of the tags cannot be passed on to the subsequent
# functions, so run the same query again, this time with fold=False.
tags = eq.get_unique_process_tags(
    [u'633109', u'625142', u'627902', u'629314'],
    fold=False,
)

In [10]:
# Aggregate the metrics across the jobs, tag by tag. This step is optional
# (there is a direct outlier call that takes a job list), but it shows what
# the outlier detection works on.
profile = eq.agg_metrics_by_tags(
    [u'633109', u'625142', u'627902', u'629314'],
    tags=tags,
    fmt='pandas',
)

# The result already comes grouped by job and tags; sort by tags anyway so
# that, for a given tag, the rows of the different jobs sit next to each
# other. For instance, rows 3, 4 and 5 all belong to op_sequence 51.
profile.sort_values(by=['tags']).loc[
    :, ['job', 'tags', 'duration', 'exclusive_cpu_time', 'num_procs', 'majflt']
]

Unnamed: 0,job,tags,duration,exclusive_cpu_time,num_procs,majflt
0,625142,"{u'op_instance': u'1', u'op_sequence': u'17', ...",8.937046e+06,1725685.0,49,0
1,625142,"{u'op_instance': u'1', u'op_sequence': u'18', ...",7.543444e+06,2153495.0,170,0
2,625142,"{u'op_instance': u'1', u'op_sequence': u'20', ...",7.565734e+06,2178504.0,170,0
3,627902,"{u'op_instance': u'11', u'op_sequence': u'51',...",6.798583e+06,1587560.0,186,0
4,629314,"{u'op_instance': u'11', u'op_sequence': u'51',...",6.381716e+06,1557599.0,186,0
5,633109,"{u'op_instance': u'11', u'op_sequence': u'51',...",7.840639e+06,2819381.0,186,0
6,625142,"{u'op_instance': u'11', u'op_sequence': u'66',...",7.851022e+06,2723400.0,186,0
7,627902,"{u'op_instance': u'15', u'op_sequence': u'64',...",7.135950e+06,1753529.0,186,0
8,629314,"{u'op_instance': u'15', u'op_sequence': u'64',...",6.596609e+06,1539602.0,186,0
9,633109,"{u'op_instance': u'15', u'op_sequence': u'64',...",8.729731e+06,2854389.0,186,0


In [12]:
# Outliers can be spotted by eye in the table above, but it is far better
# to let the computer do it: a single API call takes the job list and the
# set of features to examine when identifying outliers.
outliers = eod.detect_outlier_jobs_using_tags(
    [u'633109', u'625142', u'627902', u'629314'],
    features=['duration', 'exclusive_cpu_time', 'num_procs'],
)
outliers

Unnamed: 0,jobid,tags,duration,exclusive_cpu_time,num_procs
0,625142,"{u'op_instance': u'1', u'op_sequence': u'17', ...",False,False,True
1,625142,"{u'op_instance': u'1', u'op_sequence': u'18', ...",False,False,True
2,625142,"{u'op_instance': u'1', u'op_sequence': u'20', ...",False,False,True
3,627902,"{u'op_instance': u'11', u'op_sequence': u'51',...",False,False,False
4,629314,"{u'op_instance': u'11', u'op_sequence': u'51',...",False,False,False
5,633109,"{u'op_instance': u'11', u'op_sequence': u'51',...",False,False,False
6,625142,"{u'op_instance': u'11', u'op_sequence': u'66',...",False,False,False
7,627902,"{u'op_instance': u'15', u'op_sequence': u'64',...",False,False,False
8,629314,"{u'op_instance': u'15', u'op_sequence': u'64',...",False,False,False
9,633109,"{u'op_instance': u'15', u'op_sequence': u'64',...",False,False,False


In [18]:
# Keep only the rows flagged as an outlier for at least one feature.
# any(axis=1) states that directly, instead of relying on `|` binding
# tighter than `==` in `a | b | c == True`.
outliers.loc[outliers[['duration', 'num_procs', 'exclusive_cpu_time']].any(axis=1)]

Unnamed: 0,jobid,tags,duration,exclusive_cpu_time,num_procs
0,625142,"{u'op_instance': u'1', u'op_sequence': u'17', ...",False,False,True
1,625142,"{u'op_instance': u'1', u'op_sequence': u'18', ...",False,False,True
2,625142,"{u'op_instance': u'1', u'op_sequence': u'20', ...",False,False,True


In [None]:
# In this case the result seems to be a red herring, because 625142 was the
# only job that had these op_sequences. Looking at the aggregate metrics
# profile, you will notice that op_sequences [17, 18, 20] appear only for
# job 625142. It probably deserves some investigation as to why outlier
# detection flags a row as an outlier when only a single job contributes to it.