#!/usr/bin/env python
#
# perfmon - a daemon for monitoring performance of the host on which it is run
# and of all the local VMs, and for generating events based on configurable
# triggers
#
# Notes:
# ======
# The XAPI instance running on localhost monitors a number of variables
# for each VM running locally (i.e. not on other pool members) and
# for the host itself. Each variable is stored in 16 RRDs (Round Robin Databases).
#
# Consolidation        Number of samples in RRD
#   function      5s/sample   1m/sample   1hr/sample   1day/sample
# AVERAGE          120 (10m)   120 (2h)       ?             ?
# MIN              120 (10m)   120 (2h)       ?             ?
# MAX              120 (10m)   120 (2h)       ?             ?
# LAST             120 (10m)   120 (2h)       ?             ?
#
# The "Consolidation function" tells how that RRD is built up from the
# one with the next highest sample rate. E.g. in the 1m/sample "AVERAGE" RRD
# each sample is the average of 12 from the 5s/sample "AVERAGE" RRD, whereas
# in the 1m/sample "MIN" RRD each sample is the minimum of 12 from the 5s/sample
# "AVERAGE" RRD.
#
# When XAPI is queried over http it selects the column (e.g. "1hr/sample")
# based on the "start" CGI param. It will return the highest level of granularity
# available for the period requested.
#
# The "cf" CGI param specfies the row. (All rows are returned if it's missing.)
import sys
import os
import getopt
import traceback
import XenAPI
import urllib
from xml import sax # used to parse rrd_updates because this may be large and sax is more efficient
from xml.dom import minidom # used to parse other-config:perfmon. Efficiency is less important than reliability here
from xml.parsers.expat import ExpatError
import time
import re
import random
import syslog
import socket
import gc
import signal
import commands
def print_debug(string):
if debug:
print >>sys.stderr, "DEBUG:", string
syslog.syslog(syslog.LOG_USER | syslog.LOG_INFO, "PERFMON(DEBUG): %s" % string)
def log_err(string):
print >>sys.stderr, string
syslog.syslog(syslog.LOG_USER | syslog.LOG_ERR, "PERFMON: %s" % string)
pass
def log_info(string):
print >>sys.stderr, string
syslog.syslog(syslog.LOG_USER | syslog.LOG_INFO, "PERFMON: %s" % string)
pass
def debug_mem():
objCount = {}
gc.collect()
objList = gc.get_objects()
for obj in objList:
if getattr(obj, "__class__", None):
name = obj.__class__.__name__
else:
name = type(obj)
if objCount.has_key(name):
objCount[name] += 1
else:
objCount[name] = 1
output = []
for name in objCount:
output.append("%s :%s" % (name, objCount[name]))
log_info("\n".join(output))
class PerfMonException(Exception):
pass
class XmlConfigException(PerfMonException):
pass
class UsageException(Exception):
pass
# Start a session with the master of a pool.
# Note: when calling http://localhost/rrd_updates we must pass the session
# ID as a param. The host then uses this to verify our validity with
# the master before responding.
# If the verification fails we should get a 401 response
class XapiSession(XenAPI.Session):
""" Object that represents a XenAPI session with the pool master
One of these is needed to refresh a VMMonitor or HOSTMonitor config, or
to refresh an RRDUpdates object
"""
def __init__(self):
XenAPI.Session.__init__(self, "http://_var_xapi_xapi", transport=XenAPI.UDSTransport())
self.xenapi.login_with_password("", "")
def __del__ (self):
self.xenapi.session.logout()
def id(self):
return self._session
class ObjectReport:
def __init__(self, objtype, uuid):
self.objtype = objtype # a string like "vm", or "host" taken from an <entry> tag
self.uuid = uuid # the object's uuid
self.vars = {} # maps rrd variable name to array of floats
def get_uuid(self):
return self.uuid
def get_var_names(self):
return self.vars.keys()
def get_value(self, var_name, row):
try:
return (self.vars[var_name])[row]
except:
return 0.0
def insert_value(self, var_name, index, value):
if not self.vars.has_key(var_name):
self.vars[var_name] = []
self.vars[var_name].insert(index, value)
class RRDReport:
"This is just a data structure passed that is completed by RRDContentHandler"
def __init__(self):
self.reset()
def reset(self):
self.columns = 0 # num xapi vars in xml
self.rows = 0 # num samples in xml
self.start_time = 0 # timestamp of 1st sample in xml
self.end_time = 0 # timestamp of last sample in xml
self.step_time = 0 # seconds between each pair of samples
self.obj_reports = {} # maps uuids to ObjectReports, built from xml
class RRDColumn:
"class used internally by RRDContentHandler"
def __init__(self, paramname, obj_report):
self.paramname = paramname
self.obj_report = obj_report
class RRDContentHandler(sax.ContentHandler):
""" Handles data in this format:
<xport>
<meta>
<start>INTEGER</start>
<step>INTEGER</step>
<end>INTEGER</end>
<rows>INTEGER</rows>
<columns>INTEGER</columns>
<legend>
<entry>IGNOREME:(host|vm):UUID:PARAMNAME</entry>
... another COLUMNS-1 entries ...
</legend>
</meta>
<data>
<row>
<t>INTEGER(END_TIME)</t>
<v>FLOAT</v>
... another COLUMNS-1 values ...
</row>
... another ROWS-2 rows
<row>
<t>INTEGER(START_TIME)</t>
<v>FLOAT</v>
... another COLUMNS-1 values ...
</row>
</data>
</xport>
"""
def __init__(self, report):
"report is saved and later updated by this object. report should contain defaults already"
self.report = report
self.in_start_tag = False
self.in_step_tag = False
self.in_end_tag = False
self.in_rows_tag = False
self.in_columns_tag = False
self.in_entry_tag = False
self.in_row_tag = False
# initialise these too: characters() may check them before the first <t>/<v> is seen
self.in_t_tag = False
self.in_v_tag = False
self.column_details = []
self.row = 0
def startElement(self, name, attrs):
self.raw_text = ""
if name == 'start':
self.in_start_tag = True
elif name == 'step':
self.in_step_tag = True
elif name == 'end':
self.in_end_tag = True
elif name == 'rows':
self.in_rows_tag = True
elif name == 'columns':
self.in_columns_tag = True
elif name == 'entry':
self.in_entry_tag = True
elif name == 'row':
self.in_row_tag = True
self.col = 0
if self.in_row_tag:
if name == 't':
self.in_t_tag = True
elif name == 'v':
self.in_v_tag = True
def characters(self, chars):
if (self.in_start_tag or
self.in_step_tag or
self.in_end_tag or
self.in_rows_tag or
self.in_columns_tag or
self.in_entry_tag or
#self.in_row_tag # ignore text under row tag, <row>s are just for holding <t> and <v> nodes
self.in_t_tag or
self.in_v_tag):
self.raw_text += chars
def endElement(self, name):
if name == 'start':
# This is overwritten later if there are any rows
self.report.start_time = int(self.raw_text)
self.in_start_tag = False
elif name == 'step':
self.report.step_time = int(self.raw_text)
self.in_step_tag = False
elif name == 'end':
# This is overwritten later if there are any rows
self.report.end_time = int(self.raw_text)
self.in_end_tag = False
elif name == 'rows':
self.report.rows = int(self.raw_text)
self.in_rows_tag = False
elif name == 'columns':
self.report.columns = int(self.raw_text)
self.in_columns_tag = False
elif name == 'entry':
(_, objtype, uuid, paramname) = self.raw_text.split(':')
# lookup the obj_report corresponding to this uuid, or create if it does not exist
if not self.report.obj_reports.has_key(uuid):
self.report.obj_reports[uuid] = ObjectReport(objtype, uuid)
obj_report = self.report.obj_reports[uuid]
# save the details of this column
self.column_details.append(RRDColumn(paramname, obj_report))
self.in_entry_tag = False
elif name == 'row':
self.in_row_tag = False
self.row += 1
elif name == 't':
# Extract start and end time from row data as it's more reliable than the values in the meta data
t = int(self.raw_text)
# Last row corresponds to start time
self.report.start_time = t
if self.row == 0:
# First row corresponds to end time
self.report.end_time = t
self.in_t_tag = False
elif name == 'v':
v = float(self.raw_text)
# Find object report and paramname for this col
col_details = self.column_details[self.col]
obj_report = col_details.obj_report
paramname = col_details.paramname
# Update object_report
obj_report.insert_value(paramname, index=0, value=v) # use index=0 as this is the earliest sample so far
# Update position in row
self.col += 1
self.in_v_tag = False
# An object of this class should persist for the lifetime of the program
class RRDUpdates:
""" Object used to get and parse the output the http://localhost/rrd_udpates?...
"""
def __init__(self):
# params are what get passed to the CGI executable in the URL
self.params = dict()
self.params['start'] = int(time.time()) - interval # interval seconds ago
self.params['host'] = 'true' # include data for host (as well as for VMs)
self.params['cf'] = 'AVERAGE' # consolidation function, each sample averages 12 from the 5 second RRD
self.params['interval'] = str(rrd_step)
self.report = RRDReport() # data structure updated by RRDContentHandler
def __repr__(self):
return '<RRDUpdates object: params=%s>' % str(self.params)
def refresh(self, session, override_params = {}):
"reread the rrd_updates over CGI and parse"
# take a copy so we never mutate the caller's dict (or the shared default argument)
params = dict(override_params)
params['session_id'] = session.id()
params.update(self.params)
paramstr = "&".join(["%s=%s" % (k,params[k]) for k in params])
print_debug("Calling http://localhost/rrd_updates?%s" % paramstr)
# this is better than urllib.urlopen() as it raises an Exception on http 401 'Unauthorised' error
# rather than drop into interactive mode
sock = urllib.URLopener().open("http://localhost/rrd_updates?%s" % paramstr)
xmlsource = sock.read()
sock.close()
# Use sax rather than minidom and save vast amounts of time and memory.
self.report.reset()
sax.parseString(xmlsource, RRDContentHandler(self.report))
# Update the time used on the next run
self.params['start'] = self.report.end_time + 1 # avoid retrieving same data twice
print_debug("Refreshed rrd_updates, start = %d, end = %d, rows = %d" % \
(self.report.start_time, self.report.end_time, self.report.rows))
def get_num_rows(self):
"Return the number of samples of each parameter"
return self.report.rows
def get_obj_report_by_uuid(self, uuid):
"Return an ObjectReport for the object with this uuid"
try:
return self.report.obj_reports[uuid]
except:
return None
def get_uuid_list_by_objtype(self, objtype):
"Return a list of uuids corresonding to the objects of this type for which we have ObjectReports"
return [ objrep.uuid
for objrep in self.report.obj_reports.values()
if objrep.objtype == objtype ]
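# Illustrative usage only, mirroring what main() below actually does:
#   session = XapiSession()
#   rrd_updates = RRDUpdates()
#   rrd_updates.refresh(session)
#   for uuid in rrd_updates.get_uuid_list_by_objtype('vm'):
#       obj_report = rrd_updates.get_obj_report_by_uuid(uuid)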
# Consolidation functions:
supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage' ]
def average(mylist):
return sum(mylist)/float(len(mylist))
def get_percent_fs_usage(ignored):
"Get the percent usage of the host filesystem. Input list is ignored and should be empty"
# this file is on the filesystem of interest in both OEM and Retail
output = commands.getoutput('df /etc/passwd')
output = ' '.join(output.splitlines()[1:]) # remove header line and rewrap on single line
percentage = output.split()[4]
# strip off the % character and convert to a fraction (0.0 - 1.0)
return float(percentage[0:-1])/100.0
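# Illustrative only: 'df /etc/passwd' (used above) prints something like
#   Filesystem   1K-blocks    Used Available Use% Mounted on
#   /dev/sda1      4193130 2339952   1853178  56% /
# so after dropping the header, output.split()[4] is "56%" and the function returns 0.56.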
class VariableConfig:
"""Object storing the configuration of a Variable
Initialisation parameters:
xmldoc = dom object representing the <variable> nodes in the ObjectMonitor config strings.
See VMMonitor.__doc__ and HOSTMonitor.__doc__
alarm_create_callback =
callback called by Variable.update() to create and send an alarm
get_default_variable_config =
a function that VariableConfig.__init__() uses to lookup default tag values
by variable name
"""
def __init__(self, xmldoc, alarm_create_callback, get_default_variable_config):
try: name = xmldoc.getElementsByTagName('name')[0].getAttribute('value')
except IndexError: raise XmlConfigException, "variable missing 'name' tag"
def get_value(tag):
try:
return xmldoc.getElementsByTagName(tag)[0].getAttribute('value')
except:
return get_default_variable_config(name, tag)
rrd_regex = get_value('rrd_regex')
consolidation_fn = get_value('consolidation_fn')
alarm_trigger_level = get_value('alarm_trigger_level')
alarm_trigger_period = get_value('alarm_trigger_period')
alarm_auto_inhibit_period = get_value('alarm_auto_inhibit_period')
alarm_trigger_sense = get_value('alarm_trigger_sense')
alarm_priority = get_value('alarm_priority')
# Save xmldoc: we need this when creating the body of the alarms
self.xmldoc = xmldoc
self.name = name
try:
self.rrd_regex = re.compile(rrd_regex)
except:
raise XmlConfigException, "variable %s: regex %s does not compile" % (name, rrd_regex)
if consolidation_fn not in supported_consolidation_functions:
raise XmlConfigException, "variable %s: consolidation function %s not supported" \
% (name, consolidation_fn)
self.consolidation_fn = eval(consolidation_fn)
try:
self.alarm_trigger_period = int(alarm_trigger_period)
except:
raise XmlConfigException, "variable %s: alarm_trigger_level %s not an int" % \
(name, alarm_trigger_period)
try:
self.alarm_auto_inhibit_period = int(alarm_auto_inhibit_period)
except:
raise XmlConfigException, "variable %s: alarm_auto_inhibit_period %s not an int" % \
(name, alarm_auto_inhibit_period)
try:
trigger_level = float(alarm_trigger_level)
except:
raise XmlConfigException, "variable %s: alarm_trigger_level %s not a float" % \
(name, alarm_trigger_level)
self.alarm_priority = alarm_priority
if alarm_trigger_sense == "high":
self.test_level = lambda : (self.value > trigger_level)
else:
self.test_level = lambda : (self.value < trigger_level)
self.alarm_create_callback = alarm_create_callback
def variable_configs_differ(vc1, vc2):
"Say whether configuration of one variable differs from that of another"
return vc1.xmldoc.toxml() != vc2.xmldoc.toxml()
class VariableState:
""" Object storing the state of a Variable
"""
def __init__(self):
self.value = None
self.timeof_last_alarm = time.time() - self.alarm_auto_inhibit_period
self.trigger_down_counter = self.alarm_trigger_period
class Variable(VariableConfig, VariableState):
""" Variable() is used by ObjectMonitor to create one Variable object for each
variable specified in its config string
"""
def __init__(self, *args):
VariableConfig.__init__(self, *args)
VariableState.__init__(self)
print_debug("Created Variable %s" % self.name)
def __generate_alarm(self, session):
""" Generate an alarm using callback provided by creator
... provided that one has not been generated in the last
self.alarm_auto_inhibit_period seconds
"""
t = time.time()
delta = t - self.timeof_last_alarm
if delta < self.alarm_auto_inhibit_period:
return # we are in the auto inhibit period - do nothing
self.timeof_last_alarm = t
message = "value: %f\nconfig:\n%s" % (self.value, self.xmldoc.toprettyxml())
self.alarm_create_callback(self, session, message)
def update(self, value, session):
"""Update the value of the variable using an RRDUpdates object
Calls self.__generate_alarm() if level has been 'bad' for more than
self.alarm_trigger_period seconds
"""
self.value = value
print_debug("Variable %s set to %f" % (self.name, value))
if self.test_level():
# level is bad
self.trigger_down_counter -= rrd_step
if self.trigger_down_counter <= 0:
self.__generate_alarm(session)
# reset trigger counter
self.trigger_down_counter = self.alarm_trigger_period
else:
# level good - reset trigger counter
self.trigger_down_counter = self.alarm_trigger_period
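# Worked example (using the defaults defined below): with alarm_trigger_period=60
# and rrd_step=60, a single 'bad' sample drives trigger_down_counter from 60 to 0,
# so one bad reading is enough to raise an alarm (subject to the auto-inhibit period).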
class ObjectMonitor:
"""Abstract class, used as base for VMMonitor and HOSTMonitor
Public interface: uuid, variables, refresh_config() and process_rrd_updates()
Subclasses must set self.monitortype and implement get_default_variable_config()
"""
def __init__(self, uuid):
self.uuid = uuid
self.xmlconfig = None
# "variables" is the public attribute of interest
self.variables = []
self.refresh_config()
def refresh_config(self):
if self.__update_xmlconfig():
# config has changed - reparse it
try:
self.__parse_xmlconfig()
except XmlConfigException, e:
errmsg = "\n".join([ str(x) for x in e.args ])
log_err("%s %s config error: %s" % (self.monitortype, self.uuid, errmsg))
except ExpatError, e:
errmsg = "\n".join([ str(x) for x in e.args ])
log_err("%s %s XML parse error: %s" % (self.monitortype, self.uuid, errmsg))
def __update_xmlconfig(self):
if not all_xmlconfigs.has_key(self.uuid):
xmlconfig = None
else:
xmlconfig = all_xmlconfigs[self.uuid]
if xmlconfig != self.xmlconfig:
self.xmlconfig = xmlconfig
return True
def __parse_xmlconfig(self):
if not self.xmlconfig:
# Possible if this VM/host is not configured yet
self.variables = []
return
xmldoc = minidom.parseString(self.xmlconfig)
variable_nodes = xmldoc.getElementsByTagName('variable')
variable_names = []
for vn in variable_nodes:
# create a variable using the config in vn
var = Variable(vn, self.alarm_create, self.get_default_variable_config)
# Update list of variable names
if var.name not in variable_names:
variable_names.append(var.name)
# build list of variables already present with same name
vars_with_same_name = [ v for v in self.variables if v.name == var.name ]
count = 0
append_var = True
for v in vars_with_same_name:
# this list should be 0 or 1 long!
if count > 0:
log_err("programmer error: found duplicate variable %s (uuid %s)" % (var.name, self.uuid))
self.variables.remove(v)
continue
count += 1
# only replace variable in self.variables if its config has changed.
# This way we don't reset its state
if variable_configs_differ(var, v):
self.variables.remove(v)
else:
append_var = False
if append_var:
print_debug("Appending %s to list of variables for UUID=%s" % (var.name, self.uuid))
self.variables.append(var)
# Now delete any old variables that do not appear in the new variable_nodes
variables_to_remove = [ v for v in self.variables if v.name not in variable_names ]
for v in variables_to_remove:
print_debug("Deleting %s from list of variables for UUID=%s" % (v.name, self.uuid))
self.variables.remove(v)
def process_rrd_updates(self, rrd_updates, session):
print_debug("%sMonitor processing rrd_updates for %s" % (self.monitortype, self.uuid))
obj_report = rrd_updates.get_obj_report_by_uuid(self.uuid)
num_rows = rrd_updates.get_num_rows()
if not obj_report:
return
params_in_obj_report = obj_report.get_var_names()
for var in self.variables:
# find the subset of the params returned for this object that we need to consolidate into var
params_to_consolidate = filter(var.rrd_regex.search, params_in_obj_report)
for row in range(num_rows):
# Get the values to consolidate
values_to_consolidate = map(lambda param: obj_report.get_value(param, row), params_to_consolidate)
# Consolidate them
value = var.consolidation_fn(values_to_consolidate)
# Pass result on to the variable object - this may result in an alarm being generated
var.update(value, session)
def alarm_create(self, var, session, message):
"Callback used by Variable var to actually send an alarm"
print_debug("Creating an alarm for %s %s, message: %s" % (self.monitortype, self.uuid, message))
session.xenapi.message.create("ALARM", var.alarm_priority, self.monitortype, self.uuid, message)
class VMMonitor(ObjectMonitor):
"""Object that maintains state of one VM
Configured by writing an xml string into an other-config key, e.g.
xe vm-param-set uuid=$vmuuid other-config:perfmon=\
'<config><variable><name value="cpu_usage"/><alarm_trigger_level value="0.5"/></variable></config>'
Notes:
- Multiple <variable> nodes allowed
- full list of child nodes is
* name: what to call the variable (no default)
* alarm_priority: the priority of the messages generated (default '5')
* alarm_trigger_level: level of value that triggers an alarm (no default)
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high')
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60')
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600')
* consolidation_fn: how to combine variables from rrd_updates into one value
(default is 'average' for 'cpu_usage' & 'sum' for everything else)
* rrd_regex matches the names of variables from (xe vm-data-source-list uuid=$vmuuid) used to compute value
(only has defaults for "cpu_usage", "network_usage", and "disk_usage")
"""
def __init__(self, *args):
self.monitortype = "VM"
ObjectMonitor.__init__(self, *args)
print_debug("Created VMMonitor with uuid %s" % self.uuid)
def get_default_variable_config(self, variable_name, config_tag):
"This allows user to not specify full set of tags for each variable in xml config"
if config_tag == 'consolidation_fn':
if variable_name == "cpu_usage": return 'average'
elif variable_name == "fs_usage": return 'get_percent_fs_usage'
else: return 'sum'
elif config_tag == 'rrd_regex':
if variable_name == "cpu_usage": return "cpu[0-9]+"
elif variable_name == "network_usage": return "vif_[0-9]+_[rt]x"
elif variable_name == "disk_usage": return "vbd_(xvd|hd)[a-z]+_(read|write)"
elif variable_name == "fs_usage": return "__DUMMY__" # match nothing
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute
elif config_tag == 'alarm_auto_inhibit_period':
if variable_name == "fs_usage": return '604800' # 1 week
else: return '3600' # 1 hour
elif config_tag == 'alarm_trigger_level':
if variable_name == "fs_usage": return '0.9' # trigger when 90% full
else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name
elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above*
elif config_tag == 'alarm_priority': return '5' # the minimum priority required for mail-alarm to send
else: raise XmlConfigException, "variable %s: no default available for tag %s" % (variable_name, config_tag)
class HOSTMonitor(ObjectMonitor):
"""Object that maintains state of one Host
Configured by writing an xml string into an other-config key, e.g.
xe host-param-set uuid=$hostuuid other-config:perfmon=\
'<config><variable><name value="cpu_usage"/><alarm_trigger_level value="0.5"/></variable></config>'
Notes:
- Multiple <variable> nodes allowed
- full list of child nodes is
* name: what to call the variable (no default)
* alarm_priority: the priority of the messages generated (default '5')
* alarm_trigger_level: level of value that triggers an alarm (no default)
* alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high')
* alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60')
* alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600')
* consolidation_fn: how to combine variables from rrd_updates into one value
(default is 'average' for 'cpu_usage' & 'sum' for everything else)
* rrd_regex matches the names of variables from (xe host-data-source-list uuid=$hostuuid) used to compute value
(only has defaults for "cpu_usage" and "network_usage")
"""
def __init__(self, *args):
self.monitortype = "HOST"
ObjectMonitor.__init__(self, *args)
print_debug("Created HOSTMonitor with uuid %s" % self.uuid)
def get_default_variable_config(self, variable_name, config_tag):
"This allows user to not specify full set of tags for each variable in xml config"
if config_tag == 'consolidation_fn':
if variable_name == "cpu_usage": return 'average'
else: return 'sum'
elif config_tag == 'rrd_regex':
if variable_name == "cpu_usage": return "cpu[0-9]+"
elif variable_name == "network_usage": return "pif_eth[0-9]+_[rt]x"
else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name
elif config_tag == 'alarm_trigger_period': return '60' # 1 minute
elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour
elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above*
elif config_tag == 'alarm_priority': return '5' # the minimum priority required for mail-alarm to send
else: raise XmlConfigException, "variable %s: no default available for tag %s" % (variable_name, config_tag)
all_xmlconfigs = {}
def update_all_xmlconfigs(session):
"""Update all_xmlconfigs, a global dictionary that maps any uuid
(host or VM) to the xml config string in other-config:perfmon keys"""
global all_xmlconfigs
all_host_recs = session.xenapi.host.get_all_records()
all_vm_recs = session.xenapi.VM.get_all_records()
# build dictionary mapping uuids to other_configs
all_otherconfigs = {}
all_otherconfigs.update([
(all_host_recs[ref]['uuid'], all_host_recs[ref]['other_config'])
for ref in all_host_recs.keys()
])
all_otherconfigs.update([
(all_vm_recs[ref]['uuid'], all_vm_recs[ref]['other_config'])
for ref in all_vm_recs.keys()
])
# rebuild dictionary mapping uuids to xmlconfigs
all_xmlconfigs.clear()
all_xmlconfigs.update([
(uuid, other_config['perfmon'])
for (uuid, other_config) in all_otherconfigs.items()
if other_config.has_key('perfmon')
])
# 5 minute default interval
interval = 300
interval_percent_dither = 5
rrd_step = 60
debug = False
# rate to call update_all_xmlconfigs()
config_update_period = 1800
cmdsockname = "\0perfmon" # an af_unix socket name (the "\0" stops socket.bind() creating a fs node)
cmdmaxlen = 256
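# Illustrative only: another local process can poke the daemon through this socket, e.g.
#   s = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
#   s.sendto("refresh", "\0perfmon")
# "refresh" forces a config re-read on the next loop; "debug_mem" dumps object counts (see main()).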
def main():
global interval
global interval_percent_dither
global rrd_step
global debug
global config_update_period
maxruns=None
try:
argv = sys.argv[1:]
opts, args = getopt.getopt(argv, "i:n:ds:c:D:",
["interval=", "numloops=","debug","rrdstep=","config_update_period=","interval_percent_dither="])
except getopt.GetoptError:
raise UsageException
configfname = None
for opt, arg in opts:
if opt == '-i' or opt == '--interval':
interval = int(arg)
elif opt == '-n' or opt == '--numloops':
maxruns = int(arg)
elif opt == '-d' or opt == '--debug':
debug = True
elif opt == '-s' or opt == '--rrdstep':
rrd_step = int(arg)
if rrd_step != 5 and rrd_step != 60:
raise UsageException
elif opt == '-c' or opt == '--config_update_period':
config_update_period = int(arg)
elif opt == '-D' or opt == '--interval_percent_dither':
interval_percent_dither = int(arg)
else:
raise UsageException
# open the cmd socket (over which we listen for commands such as "refresh")
cmdsock = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
cmdsock.bind(cmdsockname)
# The dither on each loop (prevents stampede on master)
rand = random.Random().uniform
dither = (interval * interval_percent_dither)/100.0
# Create a XAPI session on first run
restart_session = True
# Create a client for getting the rrd_updates over HTTP
rrd_updates = RRDUpdates()
# Work out when next to update all the xmlconfigs for all the
# hosts and all the VMs. This causes a lot of data to be retrieved
# from the master, so we only do it once every config_update_period
# and we cache the results
next_config_update = time.time()
# monitors for vms running on this host.
# This dictionary uses uuids to lookup each monitor object
vm_mon_lookup = {}
# The monitor for the host
host_mon = None
runs = 0
while True:
print_debug("Run: %d" % runs)
# Get new updates - and catch any http errors
try:
# if session has failed on last run we need to restart it
if restart_session:
session = XapiSession()
restart_session = False
rrd_updates.refresh(session)
# Should we update all_xmlconfigs
if time.time() >= next_config_update:
print_debug("Updating all_xmlconfigs")
# yes - update all the xml configs: this generates a few LARGE xapi messages from the master
update_all_xmlconfigs(session)
# Set time when to do this next
next_config_update = time.time() + config_update_period
# List of VMs present in rrd_updates
vm_uuid_list = rrd_updates.get_uuid_list_by_objtype('vm')
# Remove any monitors for VMs no longer listed in rrd_updates page
for uuid in vm_mon_lookup.keys():
if uuid not in vm_uuid_list:
vm_mon_lookup.pop(uuid)
# Create monitors for VMs that have just appeared in rrd_updates page
for uuid in vm_uuid_list:
if uuid not in vm_mon_lookup.keys():
vm_mon_lookup[uuid] = VMMonitor(uuid)
else:
# check if the config has changed, e.g. by XenCenter
vm_mon_lookup[uuid].refresh_config()
# Remove monitor for the host if it's no longer listed in rrd_updates page
# Create monitor for the host if it has just appeared in rrd_updates page
try:
host_uuid = rrd_updates.get_uuid_list_by_objtype('host')[0] # should only ever be one of these
except:
# list may be empty!
host_uuid = None
if not host_uuid:
host_mon = None
elif not host_mon:
host_mon = HOSTMonitor(host_uuid)
elif host_mon.uuid != host_uuid:
raise PerfMonException, "host uuid in rrd_updates changed (old: %s, new %s)" % \
(host_mon.uuid, host_uuid)
else:
# check if the config has changed, e.g. by XenCenter
host_mon.refresh_config()
# Go through each vm_mon and update it using the rrd_updates - this may generate alarms
for vm_mon in vm_mon_lookup.values():
vm_mon.process_rrd_updates(rrd_updates, session)
# Ditto for the host_mon
if host_mon:
host_mon.process_rrd_updates(rrd_updates, session)
except IOError, e:
if e.args[0] == 'http error' and e.args[1] in (401, 500):
# Error getting rrd_updates: 401=Unauthorised, 500=Internal - start new session
pass
elif e.args[0] == 'socket error':
# This happens if we send messages or read other-config:perfmon after xapi is restarted
pass
else:
# Don't know why we got this error - crash, die and look at logs later
raise
log_err("caught IOError: (%s) - restarting XAPI session" % " ".join([str(x) for x in e.args]))
restart_session = True
except socket.error, e:
if e.args[0] == 111:
# "Connection refused" - this happens when we try to restart session and *that* fails
pass
log_err("caught socket.error: (%s) - restarting XAPI session" % " ".join([str(x) for x in e.args]))
restart_session = True
runs += 1
if maxruns is not None and runs >= maxruns:
break
# Force collection of cyclically referenced objects cos we don't
# trust GC to do it on its own
gc.collect()
# Sleep for interval +/- dither, exiting early if we recv a cmd
timeout = rand(interval - dither, interval + dither)
cmdsock.settimeout(timeout)
try:
cmd = cmdsock.recv(cmdmaxlen)
except socket.timeout:
pass
else:
if cmd == "refresh":
# This forces a re-read of all the configs on the next loop
next_config_update = time.time()
elif cmd == "debug_mem":
debug_mem()
else:
log_err("received unhandled command %s" % cmd)
# continue to next run
return 0
def sigterm_handler(sig, stack_frame):
log_err("Caught signal %d - exiting" % sig)
sys.exit(1)
pidfile = "/var/run/perfmon.pid"
if __name__ == "__main__":
# setup signal handler to print out notice when killed
signal.signal(signal.SIGTERM, sigterm_handler)
if '--daemon' in sys.argv[1:]:
sys.argv.remove('--daemon')
if os.fork() != 0:
sys.exit(0)
os.setsid()
sys.stdout=open("/dev/null", 'w')
sys.stdin=open("/dev/null", 'r')
sys.stderr=sys.stdout
# Exit if perfmon already running
if os.path.exists(pidfile):
pid = open(pidfile).read()
if os.path.exists("/proc/%s" % pid):
log_err("perfmon already running - exiting")
sys.exit(3)
try:
# Write out pidfile
fd = open(pidfile,"w")
fd.write("%d" % os.getpid())
fd.close()
# run the main loop
rc = main()
except UsageException, e:
# Print the usage
log_err("usage: %s [-i <interval> -n <loops> -d -s <rrd_step> -c <config_update_period> -D <interval_percent_dither>] \\\n" \
"\t[--interval=<interval> --numloops=<loops> --debug \\\n" \
"\t --rrdstep=<rrd_step> --daemon]\n" \
"\t --config_update_period=<config_update_period>\n" \
"\t --interval_percent_dither=<interval_percent_dither>\n" \
" interval:\tseconds between reads of http://localhost/rrd_updates?...\n" \
" loops:\tnumber of times to run before exiting\n" \
" rrd_step:\tseconds between samples provided by rrd_updates. Valid values are 5 or 60\n" \
" config_update_period:\tseconds between getting updates of all VM/host records from master\n" \
" interval_percent_dither:\tmax percent dither in each loop - prevents stampede on master\n" \
% (sys.argv[0]))
rc = 1
except SystemExit:
# we caught a signal which we have already logged
pass
except Exception, e:
ex = sys.exc_info()
err = traceback.format_exception(*ex)
errmsg = "\n".join([ str(x) for x in e.args ])
# print the exception args nicely
log_err(errmsg)
# now log the traceback to syslog
for exline in err:
log_err(exline)
rc = 2
# remove pidfile and exit
os.unlink(pidfile)
sys.exit(rc)