In [1]:
%load_ext autoreload
%autoreload 2

import plotly.express as px
import plotly.graph_objects as go
import re
import time

import utils
from slurm import SlurmClient

# Slurm Queue

In [2]:
!sinfo

PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
q1*          up 7-00:00:00      1   unk* h100-st-p548xlarge-4
q1*          up 7-00:00:00     26    mix h100-st-p548xlarge-[9-10,13-14,29,38,76,227-242,333,448-449]
q1*          up 7-00:00:00    303  alloc h100-st-p548xlarge-[0-3,5-6,11-12,15-28,30-37,39-47,77-208,264-305,327-332,360-443]
q1*          up 7-00:00:00    120   idle h100-st-p548xlarge-[7-8,48-75,209-226,243-263,306-326,334-359,444-447]


In [3]:
sl = SlurmClient()
sl.get_queue_summary()

Unnamed: 0,ACCOUNT,ST,NODES
0,all,R,17
1,ar-ai-hipri,R,257
2,ar-ai-midpri,R,1
3,ar-ai-research-interns,R,3
4,ar-ai-voice-hipri,PD,36
5,ar-ai-voice-hipri,R,53


In [4]:
sl.get_recent_queue('all')

Unnamed: 0,JOBID,ACCOUNT,NAME,USER,ST,START_TIME,TIME,NODES,TIME_SECS
19,8214,all,sft-eval,jiuhai,R,2024-08-09T07:29:59,7:23:09,1,26589
18,7765,all,sft-eval,jiuhai,R,2024-08-08T23:30:57,15:22:11,16,55331


In [5]:
sl.get_recent_queue('ar-ai-research-interns')

Unnamed: 0,JOBID,ACCOUNT,NAME,USER,ST,START_TIME,TIME,NODES,TIME_SECS
22,7489,ar-ai-research-interns,bash,yfzhuang,R,2024-08-08T18:05:03,20:48:07,1,74887
23,7488,ar-ai-research-interns,bash,yfzhuang,R,2024-08-08T18:03:36,20:49:34,1,74974
21,7337,ar-ai-research-interns,jupyter,imzyc,R,2024-08-08T00:14:40,1-14:38:30,1,139110


In [6]:
sl.get_recent_queue('ar-ai-voice-hipri')

Unnamed: 0,JOBID,ACCOUNT,NAME,USER,ST,START_TIME,TIME,NODES,TIME_SECS
36,7405,ar-ai-voice-hipri,14CC-MAIN-2022-40,arashe,R,2024-08-09T14:18:43,34:30,2,2070
37,7404,ar-ai-voice-hipri,13CC-MAIN-2022-40,arashe,R,2024-08-09T14:11:16,41:57,2,2517
38,7403,ar-ai-voice-hipri,12CC-MAIN-2022-40,arashe,R,2024-08-09T14:09:45,43:28,2,2608
39,7402,ar-ai-voice-hipri,11CC-MAIN-2022-40,arashe,R,2024-08-09T14:02:06,51:07,2,3067
40,7400,ar-ai-voice-hipri,9CC-MAIN-2022-40,arashe,R,2024-08-09T13:31:19,1:21:54,2,4914
41,7399,ar-ai-voice-hipri,8CC-MAIN-2022-40,arashe,R,2024-08-09T12:48:42,2:04:31,2,7471
42,7397,ar-ai-voice-hipri,6CC-MAIN-2022-40,arashe,R,2024-08-09T09:33:26,5:19:47,2,19187
43,7396,ar-ai-voice-hipri,5CC-MAIN-2022-40,arashe,R,2024-08-09T09:23:36,5:29:37,2,19777
44,7395,ar-ai-voice-hipri,4CC-MAIN-2022-40,arashe,R,2024-08-09T09:20:17,5:32:56,2,19976
45,7394,ar-ai-voice-hipri,3CC-MAIN-2022-40,arashe,R,2024-08-09T09:17:38,5:35:35,2,20135


In [54]:
q = sl.get_queue_summary()
running = {}
for _, row in q.iterrows():
    if row.ST == 'R':
        running[row.ACCOUNT] = row.NODES
print(running)

quota = {'ar-ai-research-interns': 8,
         'ar-ai-voice-hipri': 50, 'ar-ai-hipri': 400}

evict_ids = {}
for acc in running:
    if acc not in quota:  # quota = 0
        print(acc)

{'all': 17, 'ar-ai-hipri': 274, 'ar-ai-midpri': 2, 'ar-ai-research-interns': 3, 'ar-ai-voice-hipri': 53}
all
ar-ai-midpri


In [30]:
q_summary = q.groupby(['ACCOUNT', 'ST']).aggregate(
    {'NODES': 'sum'}).reset_index()
q_summary

Unnamed: 0,ACCOUNT,ST,NODES
0,all,R,17
1,ar-ai-hipri,R,280
2,ar-ai-midpri,R,2
3,ar-ai-research-interns,R,3
4,ar-ai-voice-hipri,PD,94
5,ar-ai-voice-hipri,R,53


# Metrics

# Launch

In [10]:
MM9_CONF_DIR = "/fsx_0/user/tranx/experiments/llm_mm_aligner/stage1_mm9"
sbatch_base = "/fsx_0/user/tranx/experiments/llm_mm_aligner/base_pretrain_MM9_70B_sbatch.sh"
sbatch_overwrite = {
    "job-name": "test",
    "nodes": 2,
    "ntasks": 2
}

envvar = {
    "JSON_CONFIG": f"{MM9_CONF_DIR}/fbl_pretrain_MM9_70B_Llama31_336px_2nodes.json",
    "CONDA_ENV": "aligner_v7",
}

sbatch_vars_string = []
for k, v in sbatch_overwrite.items():
    sbatch_vars_string.append(f"--{k}={v}")
sbatch_vars_string = ' '.join(sbatch_vars_string)

job_id = utils.get_bash_output(
    f'sbatch --parsable {sbatch_vars_string} {sbatch_base}', print_cmd=True, print_output=True)
job_id = int(job_id)

sbatch --parsable --job-name=test --nodes=2 --ntasks=2 /fsx_0/user/tranx/experiments/llm_mm_aligner/base_pretrain_MM9_70B_sbatch.sh
8253



In [9]:
!scancel 8253

In [47]:
!cat /fsx_0/user/tranx/output/slurm_logs/output_8253.txt

Using conda environment: base
Error: CONDA_DEFAULT_ENV is not set to 


In [15]:
!tail -f /fsx_0/user/tranx/output/slurm_logs/output_8253.txt

tail: cannot open '/fsx_0/user/tranx/output/slurm_logs/output_8253.txt' for reading: No such file or directory
tail: no files remaining


In [92]:
# https://docs.icer.msu.edu/Slurm_Environment_Variables/

In [91]:
s = utils.get_bash_output('ping google.com -c 3', print_output=True)

PING google.com (142.250.191.238) 56(84) bytes of data.
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=1 ttl=112 time=8.74 ms
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=2 ttl=112 time=8.30 ms
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=3 ttl=112 time=8.23 ms

--- google.com ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2002ms
rtt min/avg/max/mdev = 8.230/8.421/8.736/0.224 ms



In [87]:
print(s)

PING google.com (142.250.191.238) 56(84) bytes of data.
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=1 ttl=112 time=8.62 ms
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=2 ttl=112 time=8.24 ms
64 bytes from ord38s32-in-f14.1e100.net (142.250.191.238): icmp_seq=3 ttl=112 time=9.76 ms

--- google.com ping statistics ---
3 packets transmitted, 3 received, 0% packet loss, time 2003ms
rtt min/avg/max/mdev = 8.238/8.873/9.760/0.646 ms

