In [27]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import numpy
import scipy
import pandas
import math
import matplotlib
import matplotlib.dates as mpd
import pylab as plt
import datetime as dtm
import pytz
import multiprocessing as mpp
import pickle
import sys
import os
import subprocess
#import lmod
# lmod.load('system')
# lmod.load('texlive')
# lmod.
#
import hpc_lib
#
# TODO: phase out unreferenced hpc_lib calls...
#import hpc_lib
#import hpc_reports
#
def running_mean(X,n=10):
    return (numpy.cumsum(numpy.insert(X,0,0))[n:] - numpy.cumsum(numpy.insert(X,0,0))[:-n])/n
#

## Realtime Monitoring tools

### General objective:
- Some tools to do realtime monitoring of SERC usage
- Nominally more versatile than that; could be SERC, users, groups, etc. 
- Design some metrics to indicate excessive usage; send alerts; make pretty graphs.

Possible starting query:
        squeue -p serc --Format=jobid,jobarrayid,partition,name,userid,timeused,timeleft,numnodes,nodelist,numcpus,state

Use this to identify:
- An individual user or group using >1/3 of the cluster for >1 hour (or so)
- Generally, times of high impact (lots of running and pending jobs)

Note that it is not trivial to estimate this. Loads of jobs pending is not significant, for example, if they have short runtimes. What we really want to catch is a scenario where a job array (or similar) picks up a bunch of idle resources (eg, overnight when usage is lower) for >24 hour runtime jobs.


In [33]:
# format_fields = {'jobid':str, 'jobarrayid':str, 'partition':str, 'name':str, 'username':str, 'timeused':str, 
#                     'timeleft':str, 'numnodes':int, 'numcpus':int, 'numtasks':int, 'nodelist':str, 'state':str}

class SQUEUE_obj(object):
    # SQUEUE manager, principally to estimate realtime SLURM activity. Like SACCT_obj, but
    #  uses squeue. We *could* use SACCT_obj and just limit to --State=running,pending
    #. but it seems that sacct is much slower than squeue.
    #
    def __init__(self, partition='serc', format_fields_dict=None, squeue_prams=None, verbose=False):
        #
        # @squeue_prams: additional or replacement fields for squeue_fields variable, eg parameters
        #. to pass to squeue. Presently, --Format and --partition are specified. some options might also
        #. be allowed as regular inputs. Probably .update(squeue_prams) will be the last thing done, so 
        #  will overried other inputs.
        #
        # TODO: add ***kwargs and handle syantax like SQU_{something} to add to squeue_fields, etc.
        
        if format_fields_dict is None:
            format_fields_dict = hpc_lib.default_SLURM_types_dict
#             format_fields.update({ky:int for ky in ['NODES', 'CPUS', 'TASKS', 'numnodes', 'numtasks', 'numcpus']})
#             #
#             ff_l = {ky.lower():val for ky,val in format_fields.items()}
#             ff_u = {ky.upper():val for ky,val in format_fields.items()}
#             format_fields.update(ff_l)
#             format_fields.update(ff_u)
#             del ff_l
#             del ff_u

        squeue_fields = {'--Format': ['jobid', 'jobarrayid', 'partition', 'name', 'username', 'timeused', 
                                      'timeleft', 'numnodes', 'numcpus', 'numtasks', 'state', 'nodelist'],
                      '--partition': [partition]
                      }
        if isinstance(squeue_prams, dict):
            squeue_fields.update(squeue_prams)
        #
        squeue_delim='%'
        sinfo_str = 'squeue '
        for ky,vl in squeue_fields.items():
            delim=' '
            if ky.startswith('--'):
                # long format
                delim='='
            #
            sinfo_str = '{} {}{}{}'.format(sinfo_str, ky, delim, f':{squeue_delim},'.join(vl))
        #
        if verbose:
            print('*** sinfo_str: {}'.format(sinfo_str))
            print('*** sinfo_ary: {}'.format(sinfo_str.split()))
        #
        # TODO:
        # port some of these bits to class-scope function calls, for class portability
        #
        self.__dict__.update({ky:vl for ky,vl in locals().items() if not ky in ('self', '__class__')})
        self.set_squeue_data()
    #
    def set_squeue_data(self):
        self.squeue_data = self.get_squeue_data()
        self.dtype       = self.squeue_data.dtype
    #
    def __getitem__(self, *args, **kwargs):
        return self.squeue_data.__getitem__(*args, **kwargs)
    def __setitem__(self, *args, kwargs):
        return self.squeue_data.__setitem__
    #
    def get_squeue_data(self, sinfo_str=None, squeue_delim=None, verbose=False):
        sinfo_str = sinfo_str or self.sinfo_str
        squeue_delim = squeue_delim or self.squeue_delim
        #
        squeue_output = subprocess.run(sinfo_str.split(), stdout=subprocess.PIPE).stdout.decode().split('\n')
        #cols = squeue_output[0].split(squeue_delim)
        #
        # there is a smarter way to do this, eg:
        cols = squeue_output[0].split(squeue_delim)
        for k,cl in enumerate(cols):
            cl_0 = cl
            k_rep = 0
            while cols[k] in cols[0:k]:
                cols[k] = f'{cl}_{k_rep}'
        if verbose:
            print('** cols: ', cols)    
        #
        return pandas.DataFrame(data=[[self.format_fields_dict.get(cl.lower(),str)(x) 
                                  for x, cl in zip(rw.split(squeue_delim),
                                self.squeue_fields['--Format']) ]
                                 for rw in squeue_output[1:]],
                                   columns=cols).to_records()
    #
    def get_active_jobs(self, *args, **kwargs):
        # print('** DEBUG: args: {}'.format(args))
        if len(args)>=6:
            args[5]
        kwargs['do_jobs'] = True
        return get_active_cpus(*args, **kwargs)
    #
    def get_active_cpus(self, state='running,pending', do_refresh=False, state_data=None, ncpus=None, do_cpus=True, do_jobs=False):
        if do_refresh:
            self.set_squeue_data()
        #
        if isinstance(state,bytes):
            state=state.decode()
        if isinstance(state,str):
            state=state.split(',')
        #
        for k,s in enumerate(state):
            state[k] = s.upper()
        #
        if state_data is None:
            state_data = self['STATE']
        if ncpus is None:
            ncpus = self['CPUS']
        #
        ix = numpy.isin(state_data, state)
        n_jobs, n_cpus = numpy.sum(ix), numpy.sum(ncpus[ix])
        #
        if do_cpus and do_jobs:
            return (n_jobs, n_cpus)
        if do_cpus:
            return n_cpus
        if do_jobs:
            return n_jobs
    #
    def simple_wait_estimate(self, ncpus=1, max_cpus=4600, do_refresh=False):
        # TODO: figure out the right way(s) to get max_cpus from system.
        active_cpus = self.get_active_cpus(state='running,pending', do_refresh=do_refresh)
        avail_cpus = max_cpus - active_cpus
        #
        if ncpus <= avail_cpus:
            return 0
        #
        cpus_needed = ncpus - avail_cpus
        #
        # now, spin down self['TIME_LEFT'] until we have enough CPUs to do our job. that TIME_LEFT is
        #. when our job should be available.
        #
        return None
        
          
        
SQ = SQUEUE_obj()      
#

In [36]:
#SQ.set_squeue_data()
#
print('** ', SQ['TIME'][0:20])
print('** ', SQ['TIME_LEFT'][0:20])
print('** ', hpc_lib.elapsed_time_2_day('13:14:10'))

**  [0.         0.         0.         0.         0.         0.36340278
 0.98663194 0.69319444 0.20980324 0.55547454 0.55547454 0.55547454
 0.55547454 0.07512731 0.03239583 0.03239583 0.03239583 0.00305556
 0.08740741 0.08740741]
**  [3.47222222e-03 2.00000000e+00 2.00000000e+00 2.00000000e+00
 2.00000000e+00 1.38659722e+00 6.01336806e+00 3.06805556e-01
 7.90196759e-01 1.44452546e+00 1.44452546e+00 1.44452546e+00
 1.44452546e+00 9.15393519e-02 9.67604167e-01 9.67604167e-01
 9.67604167e-01 1.21944444e-01 3.91259259e+00 3.91259259e+00]
**  0.5515046296296297


In [31]:
print('** ', SQ.format_fields_dict['time_left'])
print('** ', SQ.dtype)
SQ.get_active_cpus(do_jobs=True, do_cpus=True)

**  <function elapsed_time_2_day at 0x7ff33c37eca0>
**  (numpy.record, [('index', '<i8'), ('JOBID', 'O'), ('JOBID_0', 'O'), ('PARTITION', 'O'), ('NAME', 'O'), ('USER', 'O'), ('TIME', 'O'), ('TIME_LEFT', 'O'), ('NODES', '<f8'), ('CPUS', '<f8'), ('TASKS', '<f8'), ('STATE', 'O'), ('NODELIST            ', 'O')])


(65, 789.0)

In [10]:
print('** ', SQ['STATE'][0:20])
ix = [s in ['RUNNING'] for s in SQ['STATE'] ]

#print('** ', ix)


**  ['PENDING' 'PENDING' 'PENDING' 'PENDING' 'PENDING' 'RUNNING' 'RUNNING'
 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING'
 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING' 'RUNNING']


In [45]:


# squeue_output = subprocess.run(sinfo_str.split(), stdout=subprocess.PIPE).stdout.decode().split('\n')
# cols = squeue_output[0].split(squeue_delim)

#
#cols = squeue_output.readline()
# cols = squeue_output[0]
# data_dct = {cl:[] for cl in cols}
#
# for now, let's just make a dict with the columns... or use a f***ing PANDAs DF? most of the columns
#. are text-like values, so a recarray or structured numpy array is probably not a good idea... though
#  it's not that difficult to format columns...
#data = []
#for rw in squeue_output:
#    data += [rw.split()]
#
#data_df = pandas.dataframe(data=data[1:], )
#cols = squeue_output.readline()
#data = squeue_output[1:]


In [46]:
print('*** ', squeue_output[0].split(squeue_delim), ' ** ', len(squeue_output[0].split(squeue_delim)))
print('*** ', squeue_output[1].split(squeue_delim), ' ** ', len(squeue_output[1].split(squeue_delim)))

***  ['JOBID', 'JOBID', 'PARTITION', 'NAME', 'USER', 'TIME', 'TIME_LEFT', 'NODES', 'CPUS', 'TASKS', 'STATE', 'NODELIST            ']  **  12
***  ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', '1', '1', '1', 'PENDING', '                    ']  **  12


In [47]:
# print(f'** cols: {squeue_fields["--Format"]}')
# print('** ', squeue_output[0])
# for rw in squeue_output[1:15]:
#     rws = rw.split(squeue_delim)
#     #
#     for x, cl in zip(rws, squeue_fields['--Format']):
#         print('** **', rws)
#         print('* * *', [format_fields_dict.get(cl.lower(),str)(x) for x,cl in zip(rws,squeue_fields['--Format'])] )
    

** cols: ['jobid', 'jobarrayid', 'partition', 'name', 'username', 'timeused', 'timeleft', 'numnodes', 'numcpus', 'numtasks', 'state', 'nodelist']
**  JOBID%JOBID%PARTITION%NAME%USER%TIME%TIME_LEFT%NODES%CPUS%TASKS%STATE%NODELIST            
** ** ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', '1', '1', '1', 'PENDING', '                    ']
* * * ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', 1, 1, 1, 'PENDING', '                    ']
** ** ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', '1', '1', '1', 'PENDING', '                    ']
* * * ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', 1, 1, 1, 'PENDING', '                    ']
** ** ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', '1', '1', '1', 'PENDING', '                    ']
* * * ['60107994', '60107994', 'serc', 'check_scratch_cron', 'cooper96', '0:

In [52]:
sq_data = pandas.DataFrame(data=[[format_fields_dict.get(cl.lower(),str)(x) 
                                  for x, cl in zip(rw.split(squeue_delim),
                                squeue_fields['--Format']) ]
                                 for rw in squeue_output[1:]], columns=squeue_output[0].lower().split(squeue_delim))

In [57]:
print('** ', sq_data.dtypes)
#
ary_sq_data = sq_data.to_records()
print('** ', ary_sq_data.dtype)

**  jobid                    object
jobid                    object
partition                object
name                     object
user                     object
time                     object
time_left                object
nodes                   float64
cpus                    float64
tasks                   float64
state                    object
nodelist                 object
dtype: object


ValueError: name already used as a name or title

In [24]:
for rw in squeue_output[0:10]:
    print('** ', rw.split())

**  ['JOBID', 'JOBID', 'PARTITION', 'NAME', 'USER', 'TIME', 'TIME_LEFT', 'NODES', 'CPUS', 'TASKS', 'NODELIST', 'STATE']
**  ['60052471', '60052471', 'serc', 'pfor_matlab', 'lpulvi', '0:00', '5-00:00:00', '1', '120', '1', 'PENDING']
**  ['60043735', '60043735', 'serc', 'check_scratch_cron', 'cooper96', '0:00', '5:00', '1', '1', '1', 'PENDING']
**  ['60030631', '60030631', 'serc', 'run_realistic_perturaguilars', '0:00', '6-21:00:00', '1', '64', '64', 'PENDING']
**  ['60071705', '60071705', 'serc', 'RCE_299_5_2CO2', 'regirock', '4:03:39', '3-19:56:21', '10', '240', '240', 'sh03-04n[70,72],sh03RUNNING']
**  ['59982531', '59982531', 'serc', 'spg_sa_sal_ncpus250_earlew', '1-07:08:12', '1-02:51:48', '38', '250', '250', 'sh03-04n[45-46,49-51RUNNING']
**  ['60084542', '60084542', 'serc', 'all_iqt', 'mcshea', '30:03', '23:29:57', '1', '8', '8', 'sh03-04n42', 'RUNNING']
**  ['59736211', '59736211_23', 'serc', 'rot_RCE', 'haofu', '5-19:55:34', '1-03:04:26', '9', '144', '144', 'sh03-04n[50-52],sh03