From de034d31284891c67b0911818e91d0b25b462a03 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:33:13 -0700 Subject: [PATCH 01/36] Initial sqlite3 support Create sqlite3 file for storing list of bad services --- check_rancher_services.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 485297e..891242b 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -13,6 +13,7 @@ import json import subprocess import time +import sqlite3 # this requires python 3.4 import pathlib from pprint import pprint @@ -147,7 +148,9 @@ def process_section(conf, section): # make sure the file exists, in case stack has never been healthy # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): - stackPath.touch() + conn = sqlite3.connect(stackPath) + cursor = conn.cursor() + cursor.execute('CREATE TABLE badServices (serviceName text)') if stackData[myStack]['healthState'] == 'healthy': stackState = 0 From d0abcc886196f872867776078f486408d06fa7e6 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:34:11 -0700 Subject: [PATCH 02/36] Update check_rancher_services.py fix tabs --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 891242b..81c967f 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -148,7 +148,7 @@ def process_section(conf, section): # make sure the file exists, in case stack has never been healthy # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): - conn = sqlite3.connect(stackPath) + conn = sqlite3.connect(stackPath) cursor = conn.cursor() cursor.execute('CREATE TABLE badServices (serviceName text)') From 24d5e247f2d83e720ecb0d2708f6a3d8478287cd Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:35:05 -0700 Subject: [PATCH 03/36] Update check_rancher_services.py --- check_rancher_services.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 81c967f..eb84412 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -147,10 +147,10 @@ def process_section(conf, section): stackPath = pathlib.Path(stackHealthFile) # make sure the file exists, in case stack has never been healthy # (should also error immediately if a bad path is provided in the config file) - if (not stackPath.exists()): - conn = sqlite3.connect(stackPath) - cursor = conn.cursor() - cursor.execute('CREATE TABLE badServices (serviceName text)') + if (not stackPath.exists()): + conn = sqlite3.connect(stackPath) + cursor = conn.cursor() + cursor.execute('CREATE TABLE badServices (serviceName text)') if stackData[myStack]['healthState'] == 'healthy': stackState = 0 From bc18a6fc4eeac92bb1f516c93f7b43b888cc0243 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:35:47 -0700 Subject: [PATCH 04/36] Update check_rancher_services.py --- check_rancher_services.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index eb84412..9add8dc 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -143,10 +143,10 @@ def process_section(conf, section): stackStateTxt = 'UNKNOWN' if (conf.has_option(section,'stack_health_dir')): - stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth' - stackPath = pathlib.Path(stackHealthFile) - # make sure the file exists, in case stack has never been healthy - # (should also error immediately if a bad path is provided in the config file) + stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth' + stackPath = pathlib.Path(stackHealthFile) + # make sure the file exists, in case stack has never been healthy + # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackPath) cursor = conn.cursor() From beae7066fda66aa29e17bb4998f44d5db29070fd Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:36:32 -0700 Subject: [PATCH 05/36] Update check_rancher_services.py Fix path to sqlite file --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 9add8dc..6349a00 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -148,7 +148,7 @@ def process_section(conf, section): # make sure the file exists, in case stack has never been healthy # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): - conn = sqlite3.connect(stackPath) + conn = sqlite3.connect(stackHealthFile) cursor = conn.cursor() cursor.execute('CREATE TABLE badServices (serviceName text)') From b64abc5d7b15efb2b3b36f8498e7868f83ac0443 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:37:48 -0700 Subject: [PATCH 06/36] Update check_rancher_services.py use .db extension --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 6349a00..1ca50c1 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -143,7 +143,7 @@ def process_section(conf, section): stackStateTxt = 'UNKNOWN' if (conf.has_option(section,'stack_health_dir')): - stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth' + stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth.db' stackPath = pathlib.Path(stackHealthFile) # make sure the file exists, in case stack has never been healthy # (should also error immediately if a bad path is provided in the config file) From 01aa5841152b2353c333d7c212fdf8f8cb9af416 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:43:48 -0700 Subject: [PATCH 07/36] assume healthy stack == healthy services Assume that if Rancher reports the stack is healthy, all services are currently healthy, and delete the table rows. --- check_rancher_services.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 1ca50c1..2ebd4b0 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -151,12 +151,16 @@ def process_section(conf, section): conn = sqlite3.connect(stackHealthFile) cursor = conn.cursor() cursor.execute('CREATE TABLE badServices (serviceName text)') - + + conn = sqlite3.connect(stackHealthFile) + cursor = conn.cursor() + if stackData[myStack]['healthState'] == 'healthy': stackState = 0 stackStateTxt = 'OK' if (conf.has_option(section,'stack_health_dir')): - stackPath.touch() + cursor.execute('DELETE FROM badServices') + # if stackData[myStack]['healthState'] == 'degraded': # this may be too broad, but let's see if it's a problem else: From 1f9120db0f8a081bfcb56dddbb348cb89902ecac Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:49:37 -0700 Subject: [PATCH 08/36] Fix sqlite transaction handling --- check_rancher_services.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 2ebd4b0..c133e3a 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -149,17 +149,17 @@ def process_section(conf, section): # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) - cursor = conn.cursor() - cursor.execute('CREATE TABLE badServices (serviceName text)') + conn.execute('CREATE TABLE badServices (serviceName text)') + conn.close() conn = sqlite3.connect(stackHealthFile) - cursor = conn.cursor() if stackData[myStack]['healthState'] == 'healthy': stackState = 0 stackStateTxt = 'OK' if (conf.has_option(section,'stack_health_dir')): - cursor.execute('DELETE FROM badServices') + conn.execute('DELETE FROM badServices') + conn.commit() # if stackData[myStack]['healthState'] == 'degraded': # this may be too broad, but let's see if it's a problem @@ -173,6 +173,7 @@ def process_section(conf, section): stackState = 2 stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' + conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState']) # if on a host running containers, check their resources From b3e56705515387b8db529652df8a4d69507c4fdd Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 12:55:29 -0700 Subject: [PATCH 09/36] Use timestamp Add a timestamp to bad service table. --- check_rancher_services.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index c133e3a..591366f 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -149,7 +149,7 @@ def process_section(conf, section): # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) - conn.execute('CREATE TABLE badServices (serviceName text)') + conn.execute('CREATE TABLE badServices (serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') conn.close() conn = sqlite3.connect(stackHealthFile) @@ -158,11 +158,17 @@ def process_section(conf, section): stackState = 0 stackStateTxt = 'OK' if (conf.has_option(section,'stack_health_dir')): +# just assume all services are healthy if stack is, and delete them from the db conn.execute('DELETE FROM badServices') conn.commit() -# if stackData[myStack]['healthState'] == 'degraded': # this may be too broad, but let's see if it's a problem +# plan: look through services in stack +# if service healthy, delete from table +# if service unhealthy, look for it in table +# if not in table, insert it +# if in table, leave it +# after all services in stack are done, look through table, if any service timestamp is too old, throw alert else: stackState = 1 stackStateTxt = 'WARNING' From 9d26c91672e6e8e5d6511f22f47690628b9960d2 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:03:52 -0700 Subject: [PATCH 10/36] Query services in stack Also add serviceId to sqlite3 --- check_rancher_services.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 591366f..29922aa 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -145,11 +145,11 @@ def process_section(conf, section): if (conf.has_option(section,'stack_health_dir')): stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth.db' stackPath = pathlib.Path(stackHealthFile) - # make sure the file exists, in case stack has never been healthy + # make sure the db file exists, in case stack has never been checked # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) - conn.execute('CREATE TABLE badServices (serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') + conn.execute('CREATE TABLE badServices (serviceId TEXT, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') conn.close() conn = sqlite3.connect(stackHealthFile) @@ -161,8 +161,7 @@ def process_section(conf, section): # just assume all services are healthy if stack is, and delete them from the db conn.execute('DELETE FROM badServices') conn.commit() - - # this may be too broad, but let's see if it's a problem + # plan: look through services in stack # if service healthy, delete from table # if service unhealthy, look for it in table @@ -170,6 +169,12 @@ def process_section(conf, section): # if in table, leave it # after all services in stack are done, look through table, if any service timestamp is too old, throw alert else: + # we're trolling this again, meh + for service in stackData[myStack]['serviceIds']: + serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) + svc=serviceReq.json() + print svc['healthState'] + stackState = 1 stackStateTxt = 'WARNING' if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): From 300f41973a502a4e33967dcb780489e032fad3c1 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:04:26 -0700 Subject: [PATCH 11/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 29922aa..7e0a54d 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -173,7 +173,7 @@ def process_section(conf, section): for service in stackData[myStack]['serviceIds']: serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) svc=serviceReq.json() - print svc['healthState'] + print (svc['healthState']) stackState = 1 stackStateTxt = 'WARNING' From 9625229c0c997c8ffa58ead120954a794cdefb89 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:12:08 -0700 Subject: [PATCH 12/36] override rancher health state If all services in stack are healthy, assume stack is now healthy --- check_rancher_services.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 7e0a54d..f265f82 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -174,15 +174,24 @@ def process_section(conf, section): serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) svc=serviceReq.json() print (svc['healthState']) + conn.execute('DELETE FROM badServices WHERE serviceId = ?',svc['id']) + conn.commit() - stackState = 1 - stackStateTxt = 'WARNING' - if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): - # check age, if too old, make state critical - # if missing, don't do anything? - if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): - stackState = 2 - stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' + cursor = conn.cursor() + cursor.execute('SELECT * FROM badServices') + if (len(cursor.fetchall()) == 0): + # all services now OK, so assume stack OK + stackState = 0 + stackStateTxt = 'OK' + else: + stackState = 1 + stackStateTxt = 'WARNING' + if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): + # check age, if too old, make state critical + # if missing, don't do anything? + if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): + stackState = 2 + stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState']) From 4f2a49f217322ac7067e29c5453b1aa5f77cc341 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:13:34 -0700 Subject: [PATCH 13/36] Update check_rancher_services.py --- check_rancher_services.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index f265f82..7617040 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -187,11 +187,11 @@ def process_section(conf, section): stackState = 1 stackStateTxt = 'WARNING' if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): - # check age, if too old, make state critical - # if missing, don't do anything? - if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): - stackState = 2 - stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' + # check age, if too old, make state critical + # if missing, don't do anything? + if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): + stackState = 2 + stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState']) From 2fe40f99685cd52203bc0eb1008c0458274cb377 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:14:11 -0700 Subject: [PATCH 14/36] Update check_rancher_services.py --- check_rancher_services.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 7617040..45c7daf 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -189,9 +189,9 @@ def process_section(conf, section): if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical # if missing, don't do anything? - if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): - stackState = 2 - stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' + if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): + stackState = 2 + stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState']) From ae190652501ead6f0c94b2080fe29f6a9d5e79b0 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:21:21 -0700 Subject: [PATCH 15/36] Update check_rancher_services.py scan services if stack health bad --- check_rancher_services.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 45c7daf..a1121d3 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -171,10 +171,10 @@ def process_section(conf, section): else: # we're trolling this again, meh for service in stackData[myStack]['serviceIds']: - serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) - svc=serviceReq.json() - print (svc['healthState']) - conn.execute('DELETE FROM badServices WHERE serviceId = ?',svc['id']) + healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) + healthSvc=healthServiceReq.json() + print (healthSvc['id'] + ' ' + healthSvc['healthState']) + conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) conn.commit() cursor = conn.cursor() From 47db995ea01820f17a2c73f4cc4d6c0a479273ad Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:23:52 -0700 Subject: [PATCH 16/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index a1121d3..d466307 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -170,7 +170,7 @@ def process_section(conf, section): # after all services in stack are done, look through table, if any service timestamp is too old, throw alert else: # we're trolling this again, meh - for service in stackData[myStack]['serviceIds']: + for serviceId in stackData[myStack]['serviceIds']: healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) healthSvc=healthServiceReq.json() print (healthSvc['id'] + ' ' + healthSvc['healthState']) From 9312bbe01505cd74bdb29a41d89ffca6e68059fe Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:26:02 -0700 Subject: [PATCH 17/36] Update check_rancher_services.py --- check_rancher_services.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index d466307..4748ac1 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -174,8 +174,9 @@ def process_section(conf, section): healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) healthSvc=healthServiceReq.json() print (healthSvc['id'] + ' ' + healthSvc['healthState']) - conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) - conn.commit() + if (healthSvc['healthState'] == 'healthy' or healthSvc['healthState'] == 'started-once'): + conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) + conn.commit() cursor = conn.cursor() cursor.execute('SELECT * FROM badServices') From 744492f610b7b3bbd7f3f98b0d144e36e9f76407 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:36:55 -0700 Subject: [PATCH 18/36] Add unhealthy services to output --- check_rancher_services.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 4748ac1..7dbed6b 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -141,6 +141,7 @@ def process_section(conf, section): if (conf.has_option(section,'test_stack_health') and conf.getboolean(section,'test_stack_health') is True): stackState = 3 stackStateTxt = 'UNKNOWN' + stackExtraTxt = '' if (conf.has_option(section,'stack_health_dir')): stackHealthFile = conf[section]['stack_health_dir'] + '/' + envname + '_' + stackname + '_stackHealth.db' @@ -179,14 +180,17 @@ def process_section(conf, section): conn.commit() cursor = conn.cursor() - cursor.execute('SELECT * FROM badServices') - if (len(cursor.fetchall()) == 0): + cursor.execute('SELECT serviceName FROM badServices') + badServices = cursor.fetchall() + if (len(badServices) == 0): # all services now OK, so assume stack OK stackState = 0 stackStateTxt = 'OK' else: stackState = 1 stackStateTxt = 'WARNING' + stackExtraTxt = 'bad services: ' + ' '.join(badServices) + if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical # if missing, don't do anything? @@ -195,7 +199,7 @@ def process_section(conf, section): stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() - print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState']) + print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + stackExtraTxt) # if on a host running containers, check their resources # assume only one instance per service From ec0bf588c5c90b1c544aab2c66cd42207e54287d Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:43:23 -0700 Subject: [PATCH 19/36] Update check_rancher_services.py --- check_rancher_services.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 7dbed6b..2e3559d 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -189,7 +189,7 @@ def process_section(conf, section): else: stackState = 1 stackStateTxt = 'WARNING' - stackExtraTxt = 'bad services: ' + ' '.join(badServices) + stackExtraTxt = 'bad services: ' + ' '.join([ t[0] for t in badServices]) if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical @@ -199,7 +199,7 @@ def process_section(conf, section): stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() - print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + stackExtraTxt) + print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + ' ' + stackExtraTxt) # if on a host running containers, check their resources # assume only one instance per service From bb6290266d3745a28774df3b96a1af9e4b234202 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:44:43 -0700 Subject: [PATCH 20/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 2e3559d..ce4fc5f 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -199,7 +199,7 @@ def process_section(conf, section): stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() - print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + ' ' + stackExtraTxt) + print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + ' ; ' + stackExtraTxt) # if on a host running containers, check their resources # assume only one instance per service From 69e1829f4db66edd4e71cf2c46ae4fd5693eb9cc Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:48:24 -0700 Subject: [PATCH 21/36] Add primary key to sqlite file --- check_rancher_services.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index ce4fc5f..c05a697 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -150,7 +150,7 @@ def process_section(conf, section): # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) - conn.execute('CREATE TABLE badServices (serviceId TEXT, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') + conn.execute('CREATE TABLE badServices (serviceId TEXT, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP) PRIMARY KEY (serviceId)') conn.close() conn = sqlite3.connect(stackHealthFile) @@ -178,6 +178,10 @@ def process_section(conf, section): if (healthSvc['healthState'] == 'healthy' or healthSvc['healthState'] == 'started-once'): conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) conn.commit() + else: + conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) + conn.commit() + cursor = conn.cursor() cursor.execute('SELECT serviceName FROM badServices') From 02400477984c724e9860b6ab2bf4fa0f02ef082a Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:49:23 -0700 Subject: [PATCH 22/36] fix primary key syntax --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index c05a697..a469ec3 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -150,7 +150,7 @@ def process_section(conf, section): # (should also error immediately if a bad path is provided in the config file) if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) - conn.execute('CREATE TABLE badServices (serviceId TEXT, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP) PRIMARY KEY (serviceId)') + conn.execute('CREATE TABLE badServices (serviceId TEXT PRIMARY KEY, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') conn.close() conn = sqlite3.connect(stackHealthFile) From bf66bb61993b23bfffb1dde86af35e7296d066e8 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:50:17 -0700 Subject: [PATCH 23/36] Update check_rancher_services.py --- check_rancher_services.py | 1 + 1 file changed, 1 insertion(+) diff --git a/check_rancher_services.py b/check_rancher_services.py index a469ec3..c7c0e56 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -151,6 +151,7 @@ def process_section(conf, section): if (not stackPath.exists()): conn = sqlite3.connect(stackHealthFile) conn.execute('CREATE TABLE badServices (serviceId TEXT PRIMARY KEY, serviceName TEXT, lastUpdate DATETIME DEFAULT CURRENT_TIMESTAMP)') + conn.commit() conn.close() conn = sqlite3.connect(stackHealthFile) From 147ac11173afb295af7b07ac7e235c8fa1d4945c Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:52:42 -0700 Subject: [PATCH 24/36] Add bad service to sqlite --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index c7c0e56..7ddffea 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -180,7 +180,7 @@ def process_section(conf, section): conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) conn.commit() else: - conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) + conn.execute('INSERT OR IGNORE INTO badServices (serviceId, serviceName) VALUES ( ?,?)', [ healthSvc['id'] healthSvc['name']] ) conn.commit() From 289f2b4ccc2cd95ea8198722d6d4e0c585bde909 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 13:53:43 -0700 Subject: [PATCH 25/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 7ddffea..fa5745d 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -180,7 +180,7 @@ def process_section(conf, section): conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) conn.commit() else: - conn.execute('INSERT OR IGNORE INTO badServices (serviceId, serviceName) VALUES ( ?,?)', [ healthSvc['id'] healthSvc['name']] ) + conn.execute('INSERT OR IGNORE INTO badServices (serviceId, serviceName) VALUES (?,?)', [ healthSvc['id'], healthSvc['name']] ) conn.commit() From c38370fa338a0e43e16cf1e23bf3d2a322bd5797 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 14:25:57 -0700 Subject: [PATCH 26/36] Query sqlite for bad services that are too old --- check_rancher_services.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index fa5745d..c080037 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -185,7 +185,11 @@ def process_section(conf, section): cursor = conn.cursor() - cursor.execute('SELECT serviceName FROM badServices') + query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + conf[section]['stack_health_age'] + " seconds' ))" +# print (query) + cursor.execute(query) + + cursor.execute('SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now ') badServices = cursor.fetchall() if (len(badServices) == 0): # all services now OK, so assume stack OK From 8218f10fc318e2b525633c57989368223d114b39 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 14:26:35 -0700 Subject: [PATCH 27/36] Update check_rancher_services.py --- check_rancher_services.py | 1 - 1 file changed, 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index c080037..6607386 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -189,7 +189,6 @@ def process_section(conf, section): # print (query) cursor.execute(query) - cursor.execute('SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now ') badServices = cursor.fetchall() if (len(badServices) == 0): # all services now OK, so assume stack OK From c6d955cd7629806b53b0f67cf23e73f6def40402 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 14:29:02 -0700 Subject: [PATCH 28/36] remove old file timestamp check --- check_rancher_services.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 6607386..5b4982e 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -202,9 +202,10 @@ def process_section(conf, section): if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical # if missing, don't do anything? - if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): - stackState = 2 - stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' + # not using, soon to remove +# if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): +# stackState = 2 +# stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + ' ; ' + stackExtraTxt) From 7200dbfa225d40dd144d8b10a65bf1147f4c598e Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 14:29:32 -0700 Subject: [PATCH 29/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 5b4982e..ef94987 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -199,7 +199,7 @@ def process_section(conf, section): stackStateTxt = 'WARNING' stackExtraTxt = 'bad services: ' + ' '.join([ t[0] for t in badServices]) - if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): +# if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical # if missing, don't do anything? # not using, soon to remove From 689e1facec4fd387b033c8fd6d0dd74536fd4ed8 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 14:39:06 -0700 Subject: [PATCH 30/36] added minor comments --- check_rancher_services.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index ef94987..c44c652 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -160,15 +160,13 @@ def process_section(conf, section): stackState = 0 stackStateTxt = 'OK' if (conf.has_option(section,'stack_health_dir')): -# just assume all services are healthy if stack is, and delete them from the db +# just assume all services are healthy if stack is, and delete all bad services from the db conn.execute('DELETE FROM badServices') conn.commit() # plan: look through services in stack # if service healthy, delete from table -# if service unhealthy, look for it in table -# if not in table, insert it -# if in table, leave it +# if service unhealthy, insert or ignore (ignore preserves original timestamp) # after all services in stack are done, look through table, if any service timestamp is too old, throw alert else: # we're trolling this again, meh @@ -199,6 +197,8 @@ def process_section(conf, section): stackStateTxt = 'WARNING' stackExtraTxt = 'bad services: ' + ' '.join([ t[0] for t in badServices]) +# ideally, have something here to set state CRITICAL if age is even older (maybe 2x what's in the ini file?) + # if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): # check age, if too old, make state critical # if missing, don't do anything? From 0d78377ebc3b2f0dd6623f1e5a96f6ee1b9b7b5a Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 15:43:59 -0700 Subject: [PATCH 31/36] minor output formatting change --- check_rancher_services.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index c44c652..ab657b9 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -195,7 +195,7 @@ def process_section(conf, section): else: stackState = 1 stackStateTxt = 'WARNING' - stackExtraTxt = 'bad services: ' + ' '.join([ t[0] for t in badServices]) + stackExtraTxt = ' ; bad services: ' + ' '.join([ t[0] for t in badServices]) # ideally, have something here to set state CRITICAL if age is even older (maybe 2x what's in the ini file?) @@ -208,7 +208,7 @@ def process_section(conf, section): # stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' conn.close() - print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + ' ; ' + stackExtraTxt) + print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + stackExtraTxt) # if on a host running containers, check their resources # assume only one instance per service From 19d6c77e740ec9dde0711bc1828ba6abab5929e5 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 15:44:46 -0700 Subject: [PATCH 32/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index ab657b9..20db180 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -173,7 +173,7 @@ def process_section(conf, section): for serviceId in stackData[myStack]['serviceIds']: healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) healthSvc=healthServiceReq.json() - print (healthSvc['id'] + ' ' + healthSvc['healthState']) +# print (healthSvc['id'] + ' ' + healthSvc['healthState']) if (healthSvc['healthState'] == 'healthy' or healthSvc['healthState'] == 'started-once'): conn.execute('DELETE FROM badServices WHERE serviceId = ?', [ healthSvc['id'] ] ) conn.commit() From ce15a2f68a50e99aaa6b7be8e05d13a6f8663329 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 15:48:42 -0700 Subject: [PATCH 33/36] Fudge critical threshold for unhealthy services --- check_rancher_services.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/check_rancher_services.py b/check_rancher_services.py index 20db180..6f42052 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -196,6 +196,13 @@ def process_section(conf, section): stackState = 1 stackStateTxt = 'WARNING' stackExtraTxt = ' ; bad services: ' + ' '.join([ t[0] for t in badServices]) + query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + (2 * int(conf[section]['stack_health_age'])) + " seconds' ))" +# print (query) + cursor.execute(query) + reallyBadServices = cursor.fetchall() + if (len(reallyBadServices) == 0): + stackState = 2 + stackStateTxt = 'CRITICAL' # ideally, have something here to set state CRITICAL if age is even older (maybe 2x what's in the ini file?) From ab2a6a54207b49f1ba339ff82c783974190af525 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 15:49:11 -0700 Subject: [PATCH 34/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 6f42052..20822ec 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -196,7 +196,7 @@ def process_section(conf, section): stackState = 1 stackStateTxt = 'WARNING' stackExtraTxt = ' ; bad services: ' + ' '.join([ t[0] for t in badServices]) - query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + (2 * int(conf[section]['stack_health_age'])) + " seconds' ))" + query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + str(2 * int(conf[section]['stack_health_age'])) + " seconds' ))" # print (query) cursor.execute(query) reallyBadServices = cursor.fetchall() From 76e560c70c9965b436489820ec51965c4460cf81 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Thu, 18 Jul 2024 15:50:46 -0700 Subject: [PATCH 35/36] Update check_rancher_services.py --- check_rancher_services.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index 20822ec..b9810cd 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -200,7 +200,7 @@ def process_section(conf, section): # print (query) cursor.execute(query) reallyBadServices = cursor.fetchall() - if (len(reallyBadServices) == 0): + if (len(reallyBadServices) > 0): stackState = 2 stackStateTxt = 'CRITICAL' From 4c370f09b70d2862ec08832663baf5d0c097fe35 Mon Sep 17 00:00:00 2001 From: kkellerlbl Date: Fri, 19 Jul 2024 11:24:04 -0700 Subject: [PATCH 36/36] Additional comments Also removed unused commented out code --- check_rancher_services.py | 78 ++++++--------------------------------- 1 file changed, 12 insertions(+), 66 deletions(-) diff --git a/check_rancher_services.py b/check_rancher_services.py index b9810cd..8ef995b 100644 --- a/check_rancher_services.py +++ b/check_rancher_services.py @@ -61,7 +61,7 @@ def process_section(conf, section): hostsReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/hosts/', auth=(username,password)) hostData=hostsReq.json()['data'] -# monitor an agent +# monitor rancher agents for host in hostData: state=3 stateText='UNKNOWN' @@ -94,32 +94,13 @@ def process_section(conf, section): sys.exit(0) stackId = stackData[myStack]['id'] -### this part needs a lot of work -### moving to separate check_rancher_containers.py till we can figure out how to -### get stats directly from rancher 1.x API -# memState = 0 -# memStateTxt = 'OK' -# memCommentTxt = '' -## can only check stats on the local host -## to do: try to talk to the websocket to get stats from rancher API instead -# dockerStats = dict() - -# only get stats if hostid specified (since some hosts' subprocess module is broken) -# if hostid is not None: -# dockerStatsProc = subprocess.run(["docker", "stats", "--no-stream", "--no-trunc", "-a", "--format", "'{{.ID}}:{{.MemUsage}}'"], stdout=subprocess.PIPE) -## print(dockerStatsProc) -# for line in dockerStatsProc.stdout.decode('utf-8').rstrip().split('\n'): -# mylist = line.strip("'").split(':') -# memUse = mylist[1].split(' ') -# dockerStats[mylist[0]] = memUse[0] -## print(dockerStats) - +##### test health of listed services (if any) # track if there's an old dummy service that wasn't deleted oldDummyService = None for serviceId in stackData[myStack]['serviceIds']: # print (serviceId) -# in that stack, look through serviceIds for named services in /v2-beta/projects/envid/services/serviceId +# in the stack, look through serviceIds for named services in /v2-beta/projects/envid/services/serviceId serviceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) svc=serviceReq.json() if svc['name'] == 'checkmkDummy': @@ -138,6 +119,8 @@ def process_section(conf, section): print (str(serviceState) + ' ' + envname + '_' + stackname + '_' + svc['name'] + ' - ' + serviceStateTxt + ' running instances: ' + str(svc['currentScale'])) # print svc['healthState'] + +##### test overall stack health if (conf.has_option(section,'test_stack_health') and conf.getboolean(section,'test_stack_health') is True): stackState = 3 stackStateTxt = 'UNKNOWN' @@ -164,12 +147,10 @@ def process_section(conf, section): conn.execute('DELETE FROM badServices') conn.commit() -# plan: look through services in stack -# if service healthy, delete from table -# if service unhealthy, insert or ignore (ignore preserves original timestamp) -# after all services in stack are done, look through table, if any service timestamp is too old, throw alert +##### if stack reports degraded, look through services in stack to verify +# (rancher 1 doesn't really report this very well) else: - # we're trolling this again, meh + # we're trolling this again, meh. but only when stack is unhealthy, so don't worry about it for serviceId in stackData[myStack]['serviceIds']: healthServiceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/services/' + serviceId, auth=(username,password)) healthSvc=healthServiceReq.json() @@ -183,10 +164,12 @@ def process_section(conf, section): cursor = conn.cursor() + # this should return only services that have been unhealthy for a while (in theory persistently unhealthy) query = "SELECT serviceName FROM badServices WHERE (datetime(lastUpdate) < datetime('now','-" + conf[section]['stack_health_age'] + " seconds' ))" # print (query) cursor.execute(query) + # fetchall isn't great in theory, but in practice we should have very few rows in these tables badServices = cursor.fetchall() if (len(badServices) == 0): # all services now OK, so assume stack OK @@ -204,48 +187,15 @@ def process_section(conf, section): stackState = 2 stackStateTxt = 'CRITICAL' -# ideally, have something here to set state CRITICAL if age is even older (maybe 2x what's in the ini file?) - -# if (conf.has_option(section,'stack_health_dir') and conf.has_option(section,'stack_health_age') and stackPath.exists()): - # check age, if too old, make state critical - # if missing, don't do anything? - # not using, soon to remove -# if (time.time() - stackPath.stat().st_mtime > float(conf[section]['stack_health_age'])): -# stackState = 2 -# stackStateTxt = 'CRITICAL (state ' + str(int(time.time() - stackPath.stat().st_mtime)) + 'sec old)' - conn.close() print (str(stackState) + ' ' + envname + '_' + stackname + '_stackHealth - ' + stackStateTxt + ' stack health is ' + stackData[myStack]['healthState'] + stackExtraTxt) -# if on a host running containers, check their resources -# assume only one instance per service -### this part needs lots of work -# if hostid is not None: -# instanceReq=session.get(urlbase+'/v2-beta/projects/' + envid + '/instances/' + svc['instanceIds'][0], auth=(username,password)) -# rancherInstance=instanceReq.json() -# to do: give a hostname, and match it up to the rancher API hostId -# otherwise, if the hostId changes, such as if a host is removed and added back to Rancher, -# the container memory check will always be OK -# if rancherInstance['hostId'] == hostid: -## print (rancherInstance['name'] + ' ' + rancherInstance['externalId']) -# memUse = dockerStats[rancherInstance['externalId']] -## print (memUse) -## crude hack: docker stats outputs human readable. assume we only care about GB or more use -## future: better calculations -# if 'G' in memUse: -# memState = 1 -# memStateTxt = 'WARNING' -# memCommentTxt += (svc['name'] + ': ' + str(memUse) + ' ;; ') - -# if hostid is not None: -# print (str(memState) + ' ' + envname + '_' + stackname + '_containerMemory-' + hostid + ' - ' + memStateTxt + ' big mem containers on host ' + hostid + ' : ' + memCommentTxt) - if (not conf.has_option(section,'test_create_new')): return None if (conf.getboolean(section,'test_create_new') is False): return None -### spin up a dummy new service +##### if requested in config, test spinning up a dummy new service # initially copied from narrative-traefiker containerConfig = {u'assignServiceIpAddress': False, u'createIndex': None, @@ -392,10 +342,7 @@ def process_section(conf, section): print (str(dummyServiceState) + ' ' + envname + '_' + stackname + '_createNewService - ' + dummyServiceStateTxt) -# in each service find the last logs? may be hard, need websocket - - -# main loop +##### main loop # if args provided, use them, otherwise use sections from config file if args.sections: sections = args.sections @@ -405,4 +352,3 @@ def process_section(conf, section): for section in sections: # print (section) process_section(conf, section) -