workbench: set placement and resources for Nomad Cloud "perf" class
fmaste authored and Jimbo4350 committed May 31, 2023
1 parent c36ce0f commit e2c464c
Showing 3 changed files with 144 additions and 42 deletions.
18 changes: 13 additions & 5 deletions nix/workbench/backend/nomad-job.nix
@@ -164,6 +164,12 @@ let
# SRE: Only 3 Nomad datacenters actually exist.
datacenters = [ "eu-central-1" "us-east-2" "ap-southeast-2" ];

# Specifies user-defined constraints on the task. This can be provided
# multiple times to define additional constraints.
# Cloud runs set the distinct hosts constraint here but local runs can't
# because we are only starting one Nomad client.
constraint = null;

# The reschedule stanza specifies the group's rescheduling strategy. If
# specified at the job level, the configuration will apply to all groups
# within the job. If the reschedule stanza is present on both the job and
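
For reference, once the cloud backend fills in this placeholder (see cloud.sh below), the job-level constraint ends up as a distinct_hosts entry in the rendered JSON job. A minimal way to inspect it, assuming the run-directory layout and the "workbench-cluster-job" name used later in this commit:

  # Illustrative check only; "$dir" stands for the run directory.
  jq '.job["workbench-cluster-job"].constraint' "$dir"/nomad/nomad-job.json
  # Expected shape after patching:
  # [ { "operator": "distinct_hosts", "value": "true" } ]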
@@ -193,11 +199,11 @@ let
ONE_TRACER_PER_NODE = oneTracerPerNode;
};

# A group defines a series of tasks that should be co-located
# on the same client (host). All tasks within a group will be
# placed on the same host.
# A group defines a series of tasks that should be co-located on the same
# client (host). All tasks within a group will be placed on the same host.
# https://developer.hashicorp.com/nomad/docs/job-specification/group
group = let
# For each node-specs.json object
valueF = (taskName: serviceName: portName: portNum: nodeSpec: (groupDefaults // {

# Specifies the number of instances that should be running under for
@@ -255,8 +261,8 @@ let
constraint = {
attribute = "\${node.class}";
operator = "=";
# For testing best to avoid using "infra" node class as HA jobs runs
# there, for benchmarking dedicated static machines in the "perf"
# For testing we avoid using the "infra" node class as HA jobs run there.
# For benchmarking, dedicated static machines in the "perf"
# class are used and this value should be updated accordingly.
value = "qa";
};
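
The default above pins groups to the "qa" node class; benchmarking runs need it to point at "perf" instead. A minimal jq sketch of that override (cloud.sh below does the real patching with a full constraint array), assuming the rendered job file and job name used elsewhere in this commit:

  # Sketch: switch every group's node-class constraint to the "perf" class.
  jq '.job["workbench-cluster-job"].group
        |= with_entries(.value.constraint.value = "perf")' \
    "$dir"/nomad/nomad-job.json | sponge "$dir"/nomad/nomad-job.json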
@@ -327,6 +333,8 @@ let

# Sensible defaults to run cloud version of "default", "ci-test" and
# "ci-bench" in cardano-world qa class Nomad nodes.
# For benchmarking, dedicated static machines in the "perf" class are
# used and this value should be updated accordingly.
resources = {
# Task can only ask for 'cpu' or 'cores' resource, not both.
#cpu = 512;
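
Since a task may request either 'cpu' or 'cores' but not both, a quick validity check on the rendered job can be handy. A sketch, with the same illustrative job name and paths as above:

  # Sketch: confirm no task asks for both "cpu" and "cores" at the same time.
  jq '[ .job["workbench-cluster-job"].group[].task[].resources // {}
        | select(has("cpu") and has("cores")) ] | length == 0' \
    "$dir"/nomad/nomad-job.json   # prints "true" when the job is well formed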
9 changes: 0 additions & 9 deletions nix/workbench/backend/nomad.sh
@@ -238,15 +238,6 @@ backend_nomad() {
done
;;

allocate-run-nomad-job-patch-group-constraints )
local usage="USAGE: wb backend $op RUN-DIR CONSTRAINTS-JSON-ARRAY"
local dir=${1:?$usage}; shift
local constraints_array=${1:?$usage}; shift
local nomad_environment=$(envjqr 'nomad_environment')
local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
jq ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$constraints_array)" --argjson constraints_array "${constraints_array}" "${dir}"/nomad/nomad-job.json | sponge "${dir}"/nomad/nomad-job.json
;;

# Called by the sub-backends, don't use `fatal` and let them do the cleaning
deploy-genesis-wget )
local usage="USAGE: wb backend $op RUN-DIR"
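
The generic allocate-run-nomad-job-patch-group-constraints subcommand removed above applied a constraints array to every group via jq's with_entries; the same pattern now lives inline in cloud.sh (next file). A standalone sketch of the pattern, with illustrative file and job names:

  # Sketch: ".value" is one group object; its whole constraint array is
  # replaced in every group of the job.
  constraints='[ { "operator": "=", "attribute": "${node.class}", "value": "qa" } ]'
  jq --argjson constraints "$constraints" \
    '.job["workbench-cluster-job"].group |= with_entries(.value.constraint = $constraints)' \
    nomad-job.json | sponge nomad-job.json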
159 changes: 131 additions & 28 deletions nix/workbench/backend/nomad/cloud.sh
@@ -211,44 +211,147 @@ backend_nomadcloud() {
# The job file is "slightly" modified (jq) to suit the running environment.
backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "${NOMAD_NAMESPACE}"
backend_nomad allocate-run-nomad-job-patch-nix "${dir}"

# Set the placement info and resources accordingly
local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
if test -z "${WB_SHELL_PROFILE}"
then
fatal "Envar \"WB_SHELL_PROFILE\" is empty!"
else
if test "${WB_SHELL_PROFILE:0:6}" != 'cw-perf'
# Placement:
############
## "distinct_hosts": Instructs the scheduler to not co-locate any groups
## on the same machine. When specified as a job constraint, it applies
## to all groups in the job. When specified as a group constraint, the
## effect is constrained to that group. This constraint can not be
## specified at the task level. Note that the attribute parameter should
## be omitted when using this constraint.
## https://developer.hashicorp.com/nomad/docs/job-specification/constraint#distinct_hosts
local job_constraints_array
job_constraints_array='
[
{
"operator": "distinct_hosts"
, "value": "true"
}
]
'
jq \
--argjson job_constraints_array "${job_constraints_array}" \
".[\"job\"][\"${nomad_job_name}\"].constraint |= \$job_constraints_array" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
# Resources:
############
local group_constraints_array
# "perf" profiles run on the "perf" class
if test "${WB_SHELL_PROFILE:0:7}" = 'cw-perf'
then
# Right now only "live" is using "perf" class distinct nodes!
backend_nomad allocate-run-nomad-job-patch-group-constraints "${dir}" \
"[ \
{ \
\"operator\": \"=\" \
, \"attribute\": \"\${node.class}\" \
, \"value\": \"perf\" \
} \
, { \
\"operator\": \"distinct_property\" \
, \"attribute\": \"\${attr.unique.hostname}\" \
, \"value\": 1 \
} \
]"
group_constraints_array='
[
{
"operator": "="
, "attribute": "${node.class}"
, "value": "perf"
}
]
'
# Set the resources, only for perf!
# AWS:
## c5.2xlarge: 8 vCPU and 16 Memory (GiB)
## https://aws.amazon.com/ec2/instance-types/c5/
# Nomad:
## - cpu.reservablecores = 8
## - cpu.arch: = amd64
## - cpu.frequency: = 3400
## - cpu.modelname: = Intel(R) Xeon(R) Platinum 8275CL CPU @ 3.00GHz
## - cpu.numcores: = 8
## - cpu.reservablecores: = 8
## - cpu.totalcompute: = 27200
## - memory.totalbytes = 16300142592
## Pessimistic: 1,798 MiB / 15,545 MiB Total
## Optimistic: 1,396 MiB / 15,545 MiB Total
local resources='{
"cores": 8
, "memory": 12000
, "memory_max": 15000
}'
jq \
--argjson resources "${resources}" \
".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.task |= with_entries( .value.resources = \$resources ) )" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
# Fix for region mismatches
###########################
# There are USx16 and APx18 and we need USx17 and APx17
jq \
".[\"job\"][\"${nomad_job_name}\"][\"group\"][\"node-49\"][\"affinity\"][\"value\"] = \"ap-southeast-2\"" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
# We use "us-east-2" and they use "us-east-1"
jq \
".[\"job\"][\"${nomad_job_name}\"][\"datacenters\"] |= [\"eu-central-1\", \"us-east-1\", \"ap-southeast-2\"]" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
jq \
".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries( if (.value.affinity.value == \"us-east-2\") then (.value.affinity.value |= \"us-east-1\") else (.) end )" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
# Non "perf" profiles run on the "qa" class
else
# Right now only testing, using "qa" class distinct nodes!
backend_nomad allocate-run-nomad-job-patch-group-constraints "${dir}" \
"[ \
{ \
\"operator\": \"=\" \
, \"attribute\": \"\${node.class}\" \
, \"value\": \"qa\" \
} \
, { \
\"operator\": \"distinct_property\" \
, \"attribute\": \"\${attr.unique.hostname}\" \
, \"value\": 1 \
} \
]"
group_constraints_array='
[
{
"operator": "="
, "attribute": "${node.class}"
, "value": "qa"
}
]
'
fi
jq \
--argjson group_constraints_array "${group_constraints_array}" \
".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \
"${dir}"/nomad/nomad-job.json \
| \
sponge "${dir}"/nomad/nomad-job.json
fi

# Store a summary of the job.
jq \
'{
"namespace": ( .job["workbench-cluster-job"].namespace )
, "datacenters": ( .job["workbench-cluster-job"].datacenters )
, "constraint": ( .job["workbench-cluster-job"].constraint )
, "groups": (
.job["workbench-cluster-job"].group
| with_entries(
.value |= {
"constraint": .constraint
, "affinity": .affinity
, "tasks": (
.task | with_entries(
.value |= {
"resources": .resources
, "nix_installables": .config.nix_installables
, "templates": ( .template | map(.destination) )
}
)
)
}
)
)
}' \
"${dir}"/nomad/nomad-job.json \
> "${dir}"/nomad/nomad-job.summary.json

backend_nomad allocate-run "${dir}"
;;
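
A quick way to see how groups ended up distributed per region after the affinity and datacenter fixes above is a small jq report. A sketch, using the job name and run-directory layout from this file:

  # Sketch: count groups per affinity region in the rendered job.
  jq '.job["workbench-cluster-job"].group
      | [ .[].affinity.value // empty ] | group_by(.) | map({ (.[0]): length }) | add' \
    "$dir"/nomad/nomad-job.json
  # Prints an object like { "<region>": <group count>, ... }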

@@ -319,7 +422,7 @@ backend_nomadcloud() {
--region "${s3_region}" \
|| true
# Reminder to remove old files.
msg "Still avaiable files at $(yellow "\"s3://${s3_bucket_name}\""):"
msg "Still available files at $(yellow "\"s3://${s3_bucket_name}\""):"
aws s3 ls \
s3://"${s3_bucket_name}"/ \
--region "${s3_region}"
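
The nomad-job.summary.json written above acts as a compact placement report; one possible way to read it back, assuming the file name and layout produced by this commit:

  # Sketch: condensed view of placement and resources per group.
  jq '.groups | with_entries(.value |= { constraint, affinity,
        resources: [ .tasks[].resources ] })' \
    "$dir"/nomad/nomad-job.summary.json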
