Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitoring and other improvements #14

Merged
merged 30 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
a2af6e7
imp: add support for multivalueDns sharing between groups
johnalotoski Sep 19, 2023
f598131
imp: add sops yaml template example
johnalotoski Sep 19, 2023
725cfa1
imp: add bech32 to cardano-node shell env
johnalotoski Sep 19, 2023
87a5dbe
imp: add cold.vkey and leadership, kes aliases to bp role
johnalotoski Sep 19, 2023
b8d67cb
imp: update block producer secrets paths and sops template
johnalotoski Sep 20, 2023
3953a4b
imp: rewrite jobs for stake pool dir layout change
johnalotoski Sep 20, 2023
d71b3ec
imp: output cardano-parts consistent custom node cfg
johnalotoski Sep 21, 2023
22a3681
imp: update pool/group bulk creds on kes rotate
johnalotoski Sep 21, 2023
3e2a202
imp: adds decryption checks to job automation
johnalotoski Sep 21, 2023
edef707
imp: adds encryption checks to job automation
johnalotoski Sep 21, 2023
a8b35be
imp: allows splitting of stake pool and no-deploy dir for secrets
johnalotoski Sep 21, 2023
b9343a9
fix: for string splitting in cli eval
johnalotoski Sep 22, 2023
48ca8f0
imp: mv basic and common modules to profiles
johnalotoski Sep 22, 2023
8fd1d87
cleanup: role-block-producer paths
johnalotoski Sep 22, 2023
df26fa1
imp: grafana-agent profile for linux srv integration
johnalotoski Sep 22, 2023
20943e3
bump: iohk-nix for sanchonet updated ledger peer slot
johnalotoski Sep 26, 2023
ca54ac9
imp: add db tools to node machines
johnalotoski Sep 26, 2023
d623ae3
imp: take an optional kes period for pool creation
johnalotoski Sep 26, 2023
ed51d3a
imp: scrape cardano-node machines to grafana cloud
johnalotoski Sep 26, 2023
4f8d7da
imp: add a secrets check to the pre-push job
johnalotoski Sep 26, 2023
ed401d9
imp: add addnl cli aliases for block producers
johnalotoski Sep 26, 2023
52db321
Add tf mimir/loki providers, mimirtool bin, grafana stackName opt
johnalotoski Sep 28, 2023
27ed2ce
bump: switch to upstream tf provider now with mimir/loki
johnalotoski Sep 28, 2023
413f673
tmpl: anchor gitignore paths and add more exclusions
johnalotoski Sep 28, 2023
f97adc7
tmpl: add example files for secrets layout update
johnalotoski Sep 28, 2023
f6ade94
tmpl: implement tf workspace handling for cluster/grafana
johnalotoski Sep 28, 2023
8ac1961
tmpl: enable mkMerge on flake.terraform attrs
johnalotoski Sep 28, 2023
bf031b8
tmpl: update cluster and colmena machine defns
johnalotoski Sep 28, 2023
984633b
tmpl: add grafana cloud tf, example dash and alert
johnalotoski Sep 28, 2023
6a8336b
imp: include nix-import files in treefmt
johnalotoski Sep 28, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 17 additions & 11 deletions flake/nixosModules/module-cardano-node-group.nix
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
# * This is a cardano-node add-on to the upstream cardano-node nixos service module
# * This module assists with group deployments
# * The upstream cardano-node nixos service module should still be imported separately
flake: {
flake.nixosModules.module-cardano-node-group = {
config,
{moduleWithSystem, ...}: {
flake.nixosModules.module-cardano-node-group = moduleWithSystem ({config, ...}: nixos @ {
pkgs,
lib,
name,
Expand All @@ -25,14 +24,14 @@ flake: {
inherit (types) bool float int;
inherit (nodeResources) cpuCount memMiB;

inherit (config.cardano-parts.cluster.group.meta) environmentName;
inherit (config.cardano-parts.perNode.lib) cardanoLib;
inherit (config.cardano-parts.perNode.meta) cardanoNodePort cardanoNodePrometheusExporterPort hostAddr nodeId;
inherit (config.cardano-parts.perNode.pkgs) cardano-node-pkgs;
inherit (nixos.config.cardano-parts.cluster.group.meta) environmentName;
inherit (nixos.config.cardano-parts.perNode.lib) cardanoLib;
inherit (nixos.config.cardano-parts.perNode.meta) cardanoNodePort cardanoNodePrometheusExporterPort hostAddr nodeId;
inherit (nixos.config.cardano-parts.perNode.pkgs) cardano-node-pkgs;
inherit (cardanoLib.environments.${environmentName}.nodeConfig) ByronGenesisFile;
inherit ((fromJSON (readFile ByronGenesisFile)).protocolConsts) protocolMagic;

cfg = config.services.cardano-node;
cfg = nixos.config.services.cardano-node;
in {
# Leave the import of the upstream cardano-node service for
# cardano-parts consuming repos so that service import can be customized.
Expand All @@ -41,7 +40,7 @@ flake: {
# perNode nixos options as this leads to infinite recursion.
#
# imports = [
# config.cardano-parts.perNode.pkgs.cardano-node-service;
# nixos.config.cardano-parts.perNode.pkgs.cardano-node-service;
# ];

options = {
Expand Down Expand Up @@ -69,7 +68,14 @@ flake: {
};

config = {
environment.systemPackages = [cardano-node-pkgs.cardano-cli];
environment.systemPackages = [
config.cardano-parts.pkgs.bech32
cardano-node-pkgs.cardano-cli
config.cardano-parts.pkgs.db-analyser
config.cardano-parts.pkgs.db-truncater
config.cardano-parts.pkgs.db-synthesizer
];

environment.variables = {
CARDANO_NODE_NETWORK_ID = toString protocolMagic;
CARDANO_NODE_SOCKET_PATH = cfg.socketPath 0;
Expand Down Expand Up @@ -184,5 +190,5 @@ flake: {
}
];
};
};
});
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# nixosModule: module-basic
# nixosModule: profile-basic
#
# TODO: Move this to a docs generator
#
Expand All @@ -11,7 +11,7 @@
moduleWithSystem,
...
}: {
flake.nixosModules.module-basic = moduleWithSystem ({system}: {
flake.nixosModules.profile-basic = moduleWithSystem ({system}: {
name,
pkgs,
...
Expand Down Expand Up @@ -85,7 +85,11 @@
};

services = {
chrony.enable = true;
chrony = {
enable = true;
extraConfig = "rtcsync";
};

cron.enable = true;
fail2ban.enable = true;
openssh = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# nixosModule: module-common
# nixosModule: profile-common
#
# TODO: Move this to a docs generator
#
Expand All @@ -17,7 +17,7 @@
moduleWithSystem,
...
}: {
flake.nixosModules.module-common = moduleWithSystem ({system}: {
flake.nixosModules.profile-common = moduleWithSystem ({system}: {
config,
lib,
...
Expand Down
236 changes: 236 additions & 0 deletions flake/nixosModules/profile-grafana-agent.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
# nixosModule: profile-grafana-agent
#
# TODO: Move this to a docs generator
#
# Attributes available on nixos module import:
#
# Tips:
#
{
flake.nixosModules.profile-grafana-agent = {
config,
lib,
name,
...
}:
with builtins;
with lib; let
groupCfg = config.cardano-parts.cluster.group;

inherit (config.cardano-parts.perNode.meta) cardanoNodePrometheusExporterPort hostAddr;
inherit (groupCfg) groupFlake groupName;
inherit (groupCfg.meta) environmentName;

# Directory inside the group flake's source tree that holds the encrypted
# monitoring secrets.
monitoringSecretsDir = "${groupFlake.self.outPath}/secrets/monitoring/";

# Split on every /nix/store/<entry>/ prefix and keep the final piece, so that
# trace output shows a path relative to the store entry.
stripStorePath = p: last (split "/nix/store/[^/]+/" p);

# Partially applied traceVerbose: given a file path it yields a function that
# returns its argument unchanged while verbosely tracing which file is used.
traceSecretUse = file: traceVerbose ("${name}: using " + (stripStorePath file));

# Build a single-attribute set `{ <secretsFile> = { sopsFile = ...; }; }`
# pointing at `<monitoringSecretsDir><secretsFile>.enc`.
mkSopsSecret = secretsFile: let
  encryptedFile = monitoringSecretsDir + secretsFile + ".enc";
in {
  ${secretsFile} = traceSecretUse encryptedFile {sopsFile = encryptedFile;};
};
in {
# Order the grafana-agent unit after sops-secrets.service and pull that unit
# in via `wants`.
# NOTE(review): presumably so the decrypted credential files exist before the
# agent starts — confirm that a unit with this exact name is defined elsewhere.
systemd.services.grafana-agent = {
after = ["sops-secrets.service"];
wants = ["sops-secrets.service"];
};

# Declare one sops secret per grafana-agent remote-write credential by merging
# the single-attribute sets produced by mkSopsSecret.
sops.secrets = foldl' (acc: secretsFile: acc // mkSopsSecret secretsFile) {} [
  "grafana-agent-metrics-url"
  "grafana-agent-metrics-username"
  "grafana-agent-metrics-password"
];

services.grafana-agent = {
enable = true;

credentials = let
  # Runtime path of the decrypted sops secret with the given attribute name.
  decryptedPath = secretName: config.sops.secrets.${secretName}.path;
in {
  # Loaded as env vars
  METRICS_REMOTE_WRITE_URL = decryptedPath "grafana-agent-metrics-url";
  METRICS_REMOTE_WRITE_USERNAME = decryptedPath "grafana-agent-metrics-username";

  # Loaded as files
  metrics_remote_write_password = decryptedPath "grafana-agent-metrics-password";
};

# Extra command-line flags handed to the grafana-agent service.
# NOTE(review): `-disable-reporting` presumably turns off the agent's usage
# reporting — confirm against the deployed agent version's flag reference.
extraFlags = [
"-disable-reporting"
];

settings = let
# Remote-write endpoint definition, reused below for both the integrations
# and the metrics config. The `\${...}` escapes keep Nix from interpolating,
# so the literal `${VAR}` placeholders end up in the generated settings.
# NOTE(review): presumably the agent expands these placeholders from its
# environment at startup — confirm that env expansion is enabled for the service.
metrics-client = {
basic_auth = {
password_file = "\${CREDENTIALS_DIRECTORY}/metrics_remote_write_password";
username = "\${METRICS_REMOTE_WRITE_USERNAME}";
};
url = "\${METRICS_REMOTE_WRITE_URL}";
};

# Relabel rules shared by the integrations below: copy the `agent_hostname`
# label into `instance`, then drop the `agent_hostname` label.
relabelConfig-agent_hostname-instance = [
{
action = "replace";
source_labels = ["agent_hostname"];
target_label = "instance";
}
{
action = "labeldrop";
regex = "^agent_hostname$";
}
];
in {
integrations = {
# Settings for the `agent` integration.
agent = {
enabled = true;
# Keep only series whose metric name (`__name__`) matches one of the listed
# prometheus_target_*, prometheus_sd_discovered_targets, agent_build*,
# agent_wal_samples_appended_total or process_start_time_seconds patterns.
metric_relabel_configs = [
{
action = "keep";
regex = "^prometheus_target_.*|prometheus_sd_discovered_targets|agent_build.*|agent_wal_samples_appended_total|process_start_time_seconds$";
source_labels = ["__name__"];
}
];

# Apply the shared hostname -> instance relabelling, then set the `job`
# label to "integrations/agent-check".
relabel_configs =
relabelConfig-agent_hostname-instance
++ [
{
action = "replace";
replacement = "integrations/agent-check";
target_label = "job";
}
];
};

node_exporter = {
# Explicit list of node_exporter collectors for this integration.
# NOTE(review): presumably `set_collectors` replaces the default collector set
# rather than extending it — confirm against the Grafana Agent
# node_exporter integration reference.
set_collectors = [
"boottime"
"conntrack"
"cpu"
"diskstats"
"filefd"
"filesystem"
"loadavg"
"meminfo"
"netdev"
"netstat"
"os"
"sockstat"
"softnet"
"stat"
"time"
"timex"
"uname"
"vmstat"
];

relabel_configs = relabelConfig-agent_hostname-instance;

# Filters applied to node_exporter series before they are remote-written.
# Rules run in order:
#   1. keep only metric names on the allow-list below,
#   2. drop node_filesystem_readonly,
#   3. keep only selected mountpoints,
#   4. keep only selected cpu modes.
metric_relabel_configs = [
  {
    action = "keep";
    source_labels = ["__name__"];
    # Each entry is wrapped in its own group and the groups are joined with
    # "|" between an explicit "^" and "$".
    regex =
      "^"
      + concatMapStringsSep "|" (s: "(${s})") [
        "node_boot_time_seconds"
        "node_context_switches_total"
        "node_cpu_seconds_total"
        "node_disk_io_time_(seconds|weighted_seconds)_total"
        "node_disk_(read|reads|writes|written)_.*"
        "node_filefd_.*"
        "node_filesystem_.*"
        "node_intr_total"
        "node_load([[:digit:]]+)"
        "node_memory_(Active(|_file|_anon)|Inactive(|_file|_anon))_bytes"
        "node_memory_Anon(HugePages|Pages)_bytes"
        "node_memory_(Bounce|Committed_AS|CommitLimit|Dirty|Mapped)_bytes"
        "node_memory_DirectMap(1G|2M|4k)_bytes"
        "node_memory_HugePages_(Free|Rsvd|Surp|Total)"
        "node_memory_Hugepagesize_bytes"
        "node_memory_(Mem(Available|Free|Total)|Buffers|Cached|SwapTotal)_bytes"
        # Fixed: was "HugPages", which never matched the real
        # node_memory_ShmemHugePages_bytes series, so it was silently dropped.
        "node_memory_Shmem(|HugePages|PmdMapped)_bytes"
        "node_memory_S(Reclaimable|Unreclaim)_bytes"
        "node_memory_Vmalloc(Chunk|Total|Used)_bytes"
        "node_memory_Writeback(|Tmp)_bytes"
        "node_netstat_Icmp6_(InErrors|InMsgs|OutMsgs)"
        "node_netstat_Icmp_(InErrors|InMsgs|OutMsgs)"
        "node_netstat_IpExt_(InOctets|OutOctets)"
        "node_netstat_TcpExt_(ListenDrops|ListenOverflows|TCPSynRetrans)"
        "node_netstat_Tcp_(InErrs|InSegs|OutRsts|OutSegs|RetransSegs)"
        "node_netstat_Udp6_(InDatagrams|InErrors|NoPorts|OutDatagrams|RcvbufErrors|SndbufErrors)"
        "node_netstat_Udp_(InDatagrams|InErrors|NoPorts|OutDatagrams|RcvbufErrors|SndbufErrors)"
        "node_netstat_UdpLite_InErrors"
        "node_network_.*"
        "node_nf_conntrack_entries(|_limit)"
        "node_os_info"
        "node_sockstat_(FRAG|FRAG6|RAW|RAW6)_inuse"
        "node_sockstat_sockets_used"
        "node_sockstat_TCP6_inuse"
        "node_sockstat_TCP_(alloc|inuse|mem|orphan|tw)"
        "node_sockstat_(TCP|UDP)_mem_bytes"
        "node_sockstat_UDP_mem"
        "node_sockstat_(UDP|UDP6|UDPLITE|UDPLITE6)_inuse"
        "node_softnet_(dropped|processed|times_squeezed)_total"
        "node_timex_(estimated_error|maxerror|offset)_seconds"
        "node_timex_sync_status"
        "node_time_zone_offset_seconds"
        "node_uname_info"
        "node_vmstat_(pgmajfault|pgfault|pgpgin|pgpgout|pswpin|pswpout|oom_kill)"
      ]
      + "$";
  }
  {
    action = "drop";
    source_labels = ["__name__"];
    regex = "^node_filesystem_readonly$";
  }
  {
    # filesystem collector
    # The leading empty alternative lets series that carry no `mountpoint`
    # label pass through; otherwise only the listed mountpoints are kept.
    # NOTE(review): relies on Prometheus-style full anchoring of relabel
    # regexes — confirm for the agent version in use.
    action = "keep";
    source_labels = ["mountpoint"];
    regex = "^|/|/boot|/state|/home|/nix$";
  }
  {
    # cpu collector
    # Same pattern as above: an empty `mode` label (non-cpu series) or one
    # of the listed cpu modes is kept.
    action = "keep";
    source_labels = ["mode"];
    regex = "^|system|user|iowait|steal|idle$";
  }
];
};

prometheus_remote_write = [metrics-client];
};

# Metrics subsystem settings: one config named "integrations" that
# remote-writes through metrics-client.
metrics = {
configs = [
{
name = "integrations";
remote_write = [metrics-client];

scrape_configs = [
# Only emit the cardano-node scrape job when a `services.cardano-node`
# option exists on this machine and it is enabled. The target is this
# node's hostAddr and its prometheus exporter port, labelled with the
# machine name, environment and group.
(mkIf (config.services ? cardano-node && config.services.cardano-node.enable) {
job_name = "integrations/cardano-node";
static_configs = [
{
targets = ["${hostAddr}:${toString cardanoNodePrometheusExporterPort}"];
labels = {
instance = name;
environment = environmentName;
group = groupName;
};
}
];
})
];
}
];
global.scrape_interval = "1m";
# `\${STATE_DIRECTORY}` is escaped so the literal placeholder reaches the
# generated settings.
# NOTE(review): presumably resolved from the systemd state directory at
# runtime — confirm.
wal_directory = "\${STATE_DIRECTORY}/grafana-agent-wal";
};
};
};
};
}
Loading