Skip to content

Commit

Permalink
[LAB-631] ansible 1.1.0 (#706)
Browse files Browse the repository at this point in the history
  • Loading branch information
alabdao committed Oct 17, 2023
1 parent 624a15c commit 18014ad
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 23 deletions.
9 changes: 6 additions & 3 deletions infrastructure/ansible/files/compute.service
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@ ExecStart=bacalhau serve \
--labels owner={{ owner }} \
{% if ansible_ec2_instance_type is defined %}
--labels instance-type={{ ansible_ec2_instance_type }} \
{% endif %}
{% if ansible_ec2_instance_id is defined %}
--labels instance-id={{ ansible_ec2_instance_id }} \
{% endif %}
--peer {{ requester_peer }} \
--limit-job-memory 12gb \
{% if gpu %}
--limit-job-gpu 1 \
--limit-job-memory {{ (ansible_memtotal_mb | int * 0.80) | round | int }}Mb \
{% if num_of_gpus | int > 0 %}
--limit-job-gpu {{ num_of_gpus | int }} \
{% endif %}
--job-selection-accept-networked \
--job-selection-data-locality anywhere
Expand Down
67 changes: 67 additions & 0 deletions infrastructure/ansible/files/compute.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
---
Node:
ServerAPI:
Host: 0.0.0.0
Port: 1234
TLS:
AutoCert: ""
AutoCertCachePath: ""
ServerCertificate: ""
ServerTLSKey: ""
Libp2P:
SwarmPort: 1235
PeerConnect: none
IPFS:
Connect: ""
PrivateInternal: false
SwarmKeyPath: ""
ServePath: ""
Compute:
Capacity:
IgnorePhysicalResourceLimits: false
TotalResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
JobResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
DefaultJobResourceLimits:
CPU: 100m
Memory: 100Mi
Disk: ""
GPU: ""
QueueResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
MaxJobExecutionTimeout: 0s
ExecutionStore:
Type: BoltDB
Path: ""
JobTimeouts:
JobExecutionTimeoutClientIDBypassList: []
JobNegotiationTimeout: 3m0s
MinJobExecutionTimeout: 500ms
MaxJobExecutionTimeout: {{ bacalhau_compute_max_job_execution_timeout | default('24h') }}
DefaultJobExecutionTimeout: 10m0s
JobSelection:
Locality: anywhere
RejectStatelessJobs: false
AcceptNetworkedJobs: true
ProbeHTTP: ""
ProbeExec: ""
Queue:
ExecutorBufferBackoffDuration: 50ms
Logging:
LogRunningExecutionsInterval: 10s
User:
KeyPath: /home/ubuntu/.bacalhau/user_id.pem
Libp2PKeyPath: /home/ubuntu/.bacalhau/libp2p_private_key
Metrics:
Libp2PTracerPath: /dev/null
EventTracerPath: /dev/null
120 changes: 120 additions & 0 deletions infrastructure/ansible/files/requester.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
---
Node:
ClientAPI:
Host: bootstrap.production.bacalhau.org
Port: 1234
TLS:
AutoCert: ""
AutoCertCachePath: ""
ServerCertificate: ""
ServerTLSKey: ""
ServerAPI:
Host: 0.0.0.0
Port: 1234
TLS:
AutoCert: ""
AutoCertCachePath: ""
ServerCertificate: ""
ServerTLSKey: ""
Libp2P:
SwarmPort: 1235
PeerConnect: none
IPFS:
Connect: ""
PrivateInternal: true
SwarmAddresses:
SwarmKeyPath: ""
ServePath: ""
Compute:
Capacity:
IgnorePhysicalResourceLimits: false
TotalResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
JobResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
DefaultJobResourceLimits:
CPU: 100m
Memory: 100Mi
Disk: ""
GPU: ""
QueueResourceLimits:
CPU: ""
Memory: ""
Disk: ""
GPU: ""
MaxJobExecutionTimeout: 0s
ExecutionStore:
Type: BoltDB
Path: ""
JobTimeouts:
JobExecutionTimeoutClientIDBypassList: []
JobNegotiationTimeout: 3m0s
MinJobExecutionTimeout: 500ms
MaxJobExecutionTimeout: 2562047h47m16s
DefaultJobExecutionTimeout: 10m0s
JobSelection:
Locality: anywhere
RejectStatelessJobs: false
AcceptNetworkedJobs: false
ProbeHTTP: ""
ProbeExec: ""
Queue:
ExecutorBufferBackoffDuration: 50ms
Logging:
LogRunningExecutionsInterval: 10s
Requester:
JobDefaults:
ExecutionTimeout: 30m0s
ExternalVerifierHook: ""
JobSelectionPolicy:
Locality: anywhere
RejectStatelessJobs: false
AcceptNetworkedJobs: false
ProbeHTTP: ""
ProbeExec: ""
JobStore:
Type: BoltDB
Path: ""
HousekeepingBackgroundTaskInterval: 30s
NodeRankRandomnessRange: 5
OverAskForBidsFactor: 3
FailureInjectionConfig:
IsBadActor: false
EvaluationBroker:
EvalBrokerVisibilityTimeout: 1m0s
EvalBrokerInitialRetryDelay: 1s
EvalBrokerSubsequentRetryDelay: 30s
EvalBrokerMaxRetryCount: 10
Worker:
WorkerCount: 4
WorkerEvalDequeueTimeout: 5s
WorkerEvalDequeueBaseBackoff: 1s
WorkerEvalDequeueMaxBackoff: 30s
BootstrapAddresses:
DownloadURLRequestRetries: 3
DownloadURLRequestTimeout: 5m0s
VolumeSizeRequestTimeout: 2m0s
ExecutorPluginPath: /home/ubuntu/.bacalhau/plugins
ComputeStoragePath: /home/ubuntu/.bacalhau/executor_storages
LoggingMode: default
Type:
- requester
EstuaryAPIKey: ""
AllowListedLocalPaths: []
DisabledFeatures:
Engines: []
Publishers: []
Storages: []
Labels: {}
User:
KeyPath: /home/ubuntu/.bacalhau/user_id.pem
Libp2PKeyPath: /home/ubuntu/.bacalhau/libp2p_private_key
Metrics:
Libp2PTracerPath: /dev/null
EventTracerPath: /dev/null
24 changes: 19 additions & 5 deletions infrastructure/ansible/provision_compute_only.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
remote_user: ubuntu
# Allows overriding the target hosts, which is useful for running the playbook in local mode
hosts: "{{ target_hosts | default('tag_Type_compute_only') }}"
gather_facts: true
vars:
nvidia_distribution: ubuntu2004
ipfs_version: "0.18.0"
ipfs_path: "/opt/ipfs"
gpu: true
go_version: 1.20.3
bacalhau_version: "v1.1.0"
bacalhau_node_type: "compute"
ipfs_connect: /ip4/127.0.0.1/tcp/5001
owner: labdao
tasks:
# A limit flag (-l or --limit) must be provided to ensure the playbook runs against the correct environment
- fail:
- name: Checking if limit is provided
ansible.builtin.fail:
msg: "you must use -l or --limit"
when: ansible_limit is not defined
run_once: true
Expand All @@ -26,10 +27,23 @@
- name: Install Docker
ansible.builtin.include_tasks: tasks/install_docker_tasks.yaml

# Get GPU info from system
- name: Get lshw display info
become: true
ansible.builtin.command: lshw -c display -json
changed_when: true
register: lshw_output

- name: set number of gpus available
vars:
query: "[?vendor=='NVIDIA Corporation']"
ansible.builtin.set_fact:
num_of_gpus: "{{ lshw_output.stdout | from_json | json_query(query) | length }}"

# GPU
- name: Install tools and binaries for GPU support
ansible.builtin.include_tasks: tasks/install_gpu_tasks.yaml
when: gpu
when: num_of_gpus | int > 0

# Ensure any pending handlers have run before continuing
- name: flush handlers
Expand All @@ -45,8 +59,8 @@
ansible.builtin.include_tasks: tasks/install_bacalhau_tasks.yaml
tags: bacalhau

- name: Pull common containers
ansible.builtin.include_tasks: tasks/pull_common_containers.yaml
# - name: Pull common containers
# ansible.builtin.include_tasks: tasks/pull_common_containers.yaml

handlers:
- name: Restart docker
Expand Down
44 changes: 34 additions & 10 deletions infrastructure/ansible/tasks/install_bacalhau_tasks.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Try running Bacalhau first, to see what version it is.
- name: Check bacalhau version
ansible.builtin.command: /usr/local/bin/bacalhau version
ansible.builtin.command: /usr/local/bin/bacalhau version --client --no-style --hide-header
register: existing_bacalhau_version
ignore_errors: true
changed_when: false
Expand All @@ -9,7 +9,7 @@

- name: Set fact for currently installed version
ansible.builtin.set_fact:
bacalhau_installed_version: "{{ existing_bacalhau_version.stdout.split('Server Version: ')[1] }}"
bacalhau_installed_version: "{{ existing_bacalhau_version.stdout | trim }}"
when: existing_bacalhau_version.stdout != ''

- name: Print installed kubo version
Expand Down Expand Up @@ -41,17 +41,12 @@
msg: "Running on environment: {{ ansible_ec2_tags_instance_Env }}"
when: ansible_ec2_tags_instance_Env is defined

- name: Set fact when its prod node
ansible.builtin.set_fact:
requester_hostname: "requester.labdao.xyz"
ipfs_hostname: "ipfs.labdao.xyz"
when: ansible_ec2_tags_instance_Env is defined and ansible_ec2_tags_instance_Env | lower == "prod"

- name: Set fact when its non-prod node
- name: Set fact
ansible.builtin.set_fact:
requester_hostname: "requester.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"
ipfs_hostname: "ipfs.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"
when: ansible_ec2_tags_instance_Env is defined and ansible_ec2_tags_instance_Env | lower != "prod"
receptor_hostname: "receptor.{{ ansible_ec2_tags_instance_Env | lower }}.labdao.xyz"
when: ansible_ec2_tags_instance_Env is defined

# Bacalhau PeerID, example `curl -s bacalhau.staging.labdao.xyz:1234/node_info | jq -r '.PeerInfo.ID'`
- name: Determine requester bacalhau peer id
Expand All @@ -72,11 +67,26 @@
ipfs_connect: "/dns4/{{ ipfs_hostname }}/tcp/5001"
when: ipfs_hostname is defined

- name: Set receptor url
ansible.builtin.set_fact:
receptor_url: "http://{{ receptor_hostname }}:8080/judge"
when: receptor_hostname is defined

- name: Ensure path to bacalhau dir exists
become: true
ansible.builtin.file:
path: /home/ubuntu/.bacalhau/
state: directory
mode: "0755"
owner: ubuntu
group: ubuntu

- name: Install the Bacalhau systemd unit
become: true
ansible.builtin.template:
src: "files/{{ bacalhau_node_type }}.service"
dest: /etc/systemd/system
mode: "0644"
notify:
- Restart Bacalhau

Expand All @@ -86,3 +96,17 @@
name: "{{ bacalhau_node_type }}"
state: started
enabled: true

- name: Flush handler to ensure Bacalhau is running
ansible.builtin.meta: flush_handlers

- name: Deploy config file
become: true
ansible.builtin.template:
src: "files/{{ bacalhau_node_type }}.yaml"
dest: /home/ubuntu/.bacalhau/config.yaml
owner: ubuntu
group: ubuntu
mode: "0644"
notify:
- Restart Bacalhau
5 changes: 0 additions & 5 deletions infrastructure/ansible/vars/prod.yaml

This file was deleted.

0 comments on commit 18014ad

Please sign in to comment.