Skip to content
Merged
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
de034d3
Initial sqlite3 support
kkellerlbl Jul 18, 2024
d0abcc8
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
24d5e24
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
bc18a6f
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
beae706
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
b64abc5
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
01aa584
assume healthy stack == healthy services
kkellerlbl Jul 18, 2024
1f9120d
Fix sqlite transaction handling
kkellerlbl Jul 18, 2024
b3e5670
Use timestamp
kkellerlbl Jul 18, 2024
9d26c91
Query services in stack
kkellerlbl Jul 18, 2024
300f419
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
9625229
override rancher health state
kkellerlbl Jul 18, 2024
4f2a49f
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
2fe40f9
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
ae19065
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
47db995
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
9312bbe
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
744492f
Add unhealthy services to output
kkellerlbl Jul 18, 2024
ec0bf58
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
bb62902
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
69e1829
Add primary key to sqlite file
kkellerlbl Jul 18, 2024
0240047
fix primary key syntax
kkellerlbl Jul 18, 2024
bf66bb6
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
147ac11
Add bad service to sqlite
kkellerlbl Jul 18, 2024
289f2b4
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
c38370f
Query sqlite for bad services that are too old
kkellerlbl Jul 18, 2024
8218f10
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
c6d955c
remove old file timestamp check
kkellerlbl Jul 18, 2024
7200dbf
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
689e1fa
added minor comments
kkellerlbl Jul 18, 2024
0d78377
minor output formatting change
kkellerlbl Jul 18, 2024
19d6c77
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
ce15a2f
Fudge critical threshold for unhealthy services
kkellerlbl Jul 18, 2024
ab2a6a5
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
76e560c
Update check_rancher_services.py
kkellerlbl Jul 18, 2024
4c370f0
Additional comments
kkellerlbl Jul 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 66 additions & 71 deletions check_rancher_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import json
import subprocess
import time
import sqlite3
# this requires python 3.4
import pathlib
from pprint import pprint
Expand Down Expand Up @@ -60,7 +61,7 @@ def process_section(conf, section):
hostsReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/hosts/', auth=(username,password))
hostData=hostsReq.json()['data']

# monitor an agent
# monitor rancher agents
for host in hostData:
state=3
stateText='UNKNOWN'
Expand Down Expand Up @@ -93,32 +94,13 @@ def process_section(conf, section):
sys.exit(0)
stackId = stackData[myStack]['id']

### this part needs a lot of work
### moving to separate check_rancher_containers.py till we can figure out how to
### get stats directly from rancher 1.x API
# memState = 0
# memStateTxt = 'OK'
# memCommentTxt = ''
## can only check stats on the local host
## to do: try to talk to the websocket to get stats from rancher API instead
# dockerStats = dict()

# only get stats if hostid specified (since some hosts' subprocess module is broken)
# if hostid is not None:
# dockerStatsProc = subprocess.run(["docker", "stats", "--no-stream", "--no-trunc", "-a", "--format", "'{{.ID}}:{{.MemUsage}}'"], stdout=subprocess.PIPE)
## print(dockerStatsProc)
# for line in dockerStatsProc.stdout.decode('utf-8').rstrip().split('\n'):
# mylist = line.strip("'").split(':')
# memUse = mylist[1].split(' ')
# dockerStats[mylist[0]] = memUse[0]
## print(dockerStats)

##### test health of listed services (if any)
# track if there's an old dummy service that wasn't deleted
oldDummyService = None

for serviceId in stackData[myStack]['serviceIds']:
# print (serviceId)
# in that stack, look through serviceIds for named services in /v2-beta/projects/envid/services/serviceId
# in the stack, look through serviceIds for named services in /v2-beta/projects/envid/services/serviceId
serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password))
svc=serviceReq.json()
if svc['name'] == 'checkmkDummy':
Expand All @@ -137,66 +119,83 @@ def process_section(conf, section):
print (str(serviceState) + ' ' + envname + '_' + stackname + '_' + svc['name'] + ' - ' + serviceStateTxt + ' running instances: ' + str(svc['currentScale']))
# print svc['healthState']


##### test overall stack health
if (conf.has_option(section,'test_stack_health') and conf.getboolean(section,'test_stack_health') is True):
stackState = 3
stackStateTxt = 'UNKNOWN'
stackExtraTxt = ''

if (conf.has_option(section,'stack_health_dir')):
stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth'
stackPath = pathlib.Path(stackHealthFile)
# make sure the file exists, in case stack has never been healthy
# (should also error immediately if a bad path is provided in the config file)
if (not stackPath.exists()):
stackPath.touch()

stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth.db'
stackPath = pathlib.Path(stackHealthFile)
# make sure the db file exists, in case stack has never been checked
# (should also error immediately if a bad path is provided in the config file)
if (not stackPath.exists()):
conn = sqlite3.connect(stackHealthFile)
conn.execute('CREATE TABLE badServices (serviceId TEXT PRIMARY KEY, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)')
conn.commit()
conn.close()

conn = sqlite3.connect(stackHealthFile)

if stackData[myStack]['healthState'] == 'healthy':
stackState = 0
stackStateTxt = 'OK'
if (conf.has_option(section,'stack_health_dir')):
stackPath.touch()
# if stackData[myStack]['healthState'] == 'degraded':
# this may be too broad, but let's see if it's a problem
# just assume all services are healthy if stack is, and delete all bad services from the db
conn.execute('DELETE FROM badServices')
conn.commit()

##### if stack reports any state other than healthy (e.g. degraded), look through services in stack to verify
# (rancher 1 doesn't really report stack health very well)
else:
stackState = 1
stackStateTxt = 'WARNING'
if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()):
# check age, if too old, make state critical
# if missing, don't do anything?
if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])):
stackState = 2
stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)'

print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'])

# if on a host running containers, check their resources
# assume only one instance per service
### this part needs lots of work
# if hostid is not None:
# instanceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/instances/' + svc['instanceIds'][0], auth=(username,password))
# rancherInstance=instanceReq.json()
# to do: give a hostname, and match it up to the rancher API hostId
# otherwise, if the hostId changes, such as if a host is removed and added back to Rancher,
# the container memory check will always be OK
# if rancherInstance['hostId'] == hostid:
## print (rancherInstance['name'] + ' ' + rancherInstance['externalId'])
# memUse = dockerStats[rancherInstance['externalId']]
## print (memUse)
## crude hack: docker stats outputs human readable. assume we only care about GB or more use
## future: better calculations
# if 'G' in memUse:
# memState = 1
# memStateTxt = 'WARNING'
# memCommentTxt += (svc['name'] + ': ' + str(memUse) + ' ;; ')

# if hostid is not None:
# print (str(memState) + ' ' + envname + '_' + stackname + '_containerMemory-' + hostid + ' - ' + memStateTxt + ' big mem containers on host ' + hostid + ' : ' + memCommentTxt)
# this queries every service in the stack a second time, but only when the stack is not healthy, so the extra API calls are acceptable
for serviceId in stackData[myStack]['serviceIds']:
healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password))
healthSvc=healthServiceReq.json()
# print (healthSvc['id'] + ' ' + healthSvc['healthState'])
if (healthSvc['healthState'] == 'healthy' or healthSvc['healthState'] == 'started-once'):
conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] )
conn.commit()
else:
conn.execute('INSERT OR IGNORE INTO badServices (serviceId, serviceName) VALUES (?,?)', [ healthSvc['id'], healthSvc['name']] )
conn.commit()


cursor = conn.cursor()
# this should return only services that have been unhealthy for a while (in theory persistently unhealthy)
query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + conf[section]['stack_health_age'] + " seconds' ))"
# print (query)
cursor.execute(query)

# fetchall isn't great in theory, but in practice we should have very few rows in these tables
badServices = cursor.fetchall()
if (len(badServices) == 0):
# all services now OK, so assume stack OK
stackState = 0
stackStateTxt = 'OK'
else:
stackState = 1
stackStateTxt = 'WARNING'
stackExtraTxt = ' ; bad services: ' + ' '.join([ t[0] for t in badServices])
query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + str(2 * int(conf[section]['stack_health_age'])) + " seconds' ))"
# print (query)
cursor.execute(query)
reallyBadServices = cursor.fetchall()
if (len(reallyBadServices) > 0):
stackState = 2
stackStateTxt = 'CRITICAL'

conn.close()
print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + stackExtraTxt)

if (not conf.has_option(section,'test_create_new')):
return None
if (conf.getboolean(section,'test_create_new') is False):
return None

### spin up a dummy new service
##### if requested in config, test spinning up a dummy new service
# initially copied from narrative-traefiker
containerConfig = {u'assignServiceIpAddress': False,
u'createIndex': None,
Expand Down Expand Up @@ -343,10 +342,7 @@ def process_section(conf, section):
print (str(dummyServiceState) + ' ' + envname + '_' + stackname + '_createNewService - ' + dummyServiceStateTxt)


# in each service find the last logs? may be hard, need websocket


# main loop
##### main loop
# if args provided, use them, otherwise use sections from config file
if args.sections:
sections = args.sections
Expand All @@ -356,4 +352,3 @@ def process_section(conf, section):
for section in sections:
# print (section)
process_section(conf, section)