In [1]:
import os
import re
import yaml

import numpy as np
import torch
import torch.nn.functional as F
import logging
from exults.log_utils import Logger

from extractive_structures import ROOT
from pathlib import Path
import exults.run_manager as rm
import json

import subprocess

from exults.slurm_utils import JobsWatcher

import exults.plot_utils as pu

import pandas as pd
import numpy as np
import lets_plot as lp
lp.LetsPlot.setup_html()

from exults.tensorial import Long

In [2]:
expts_root = Path(ROOT) / 'paper_experiments'
output_root = Path(ROOT) / 'results'

In [9]:
def run_sbatch(config_path, num_devices, slurm_path, node, preassign=False, dry_run=False):
    if preassign:
        output_path = rm.get_run_dir(
            config_path=config_path,
            runs_root=output_root,
            experiments_root=expts_root,
        )
        print(f'Preassigning output dir to {output_path}')
        preassign_dict = {'RM_OUTPUT_DIR': str(output_path)}
    else:
        preassign_dict = {}
    
    flags = [ f'--gres=gpu:{num_devices}', f'-w {node}']
    slurm_cmd = ['sbatch', *flags , slurm_path]
    if dry_run:
        print(f'CONFIG_FILE={config_path} ' + ' '.join(slurm_cmd))
        return
    try:
        slurm_output = subprocess.run(
            slurm_cmd, 
            env={**os.environ, 'CONFIG_FILE': config_path, **preassign_dict}, 
            capture_output=True, 
            check=True
        )
    except subprocess.CalledProcessError as e:
        print(e.stdout)
        print(e.stderr)
        raise e
    print(' '.join(slurm_cmd), slurm_output.stdout, slurm_output.stderr)
    string = slurm_output.stdout
    if not isinstance(string, str):
        string = string.decode()
    job_id = re.search(r"job (?P<id>[0-9]+)", string).group("id")
    if preassign:
        return job_id, output_path
    else:
        return job_id

def get_last_output(cfg_path, _output_root=None, _expts_root=None):
    if _output_root is None:
        _output_root = output_root
    if _expts_root is None:
        _expts_root = expts_root
    parent_dir = Path(rm.get_run_dir_parent(cfg_path, _output_root, _expts_root))
    dirs = [d for d in os.listdir(parent_dir)  if os.path.isdir(parent_dir / d)]
    success_dir = [d for d in dirs if 'done.out' in os.listdir(parent_dir / d)]
    max_run = max(int(d) for d in dirs)
    max_success = max(int(d) for d in success_dir)
    if max_run != max_success:
        print(f'Warning: latest run {max_run} of {cfg_path} is not successful. Falling back to {max_success}')
    return parent_dir / str(max_success)
        
    

In [4]:
model_tags = ['gemma_27b', 'llama_1b']

In [5]:
import extractive_structures.scripts.eval_ocr as eval_ocr

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
lrs = [1e-6, 3e-6, 1e-5, 3e-5]
epochses = [4, 8, 12, 16]
cfgs = []
for lr in lrs:
    for epochs in epochses:
        for model_tag in model_tags:
            cfg = eval_ocr.Cfg(
                model_tag=model_tag,
                lr=lr,
                epochs=epochs,
                seeds=[0, 1, 2, 3, 4],
                half_precision=False
            )
            cfg_path = expts_root / 'sweep_all' / f'full_{model_tag}_{lr}_{epochs}.yaml'
            cfg.save(
                cfg_path,
                meta_kwargs=dict(_experiments_root=str(expts_root), _output_root=str(output_root))
            )
            cfgs.append({
                'lr': lr,
                'model_tag': model_tag, 
                'epochs': epochs,
                'cfg_path': cfg_path
            })

In [12]:
jobs = []
for cfg in cfgs:
    if cfg['model_tag'] == 'llama_1b':
        num_devices = 4
        node = 'balrog'
    else:
        continue
        num_devices = 8
        node = 'saruman'
    job_id = run_sbatch(
        cfg['cfg_path'],
        num_devices=num_devices, 
        node=node,
        slurm_path=str(ROOT/'slurm/eval_ocr.sh'),
        dry_run=False
    )
    jobs.append({** cfg, 'job_id': job_id})

b''
b'sbatch: error: invalid partition specified:  jsteinhardt\nsbatch: error: Batch job submission failed: Invalid partition name specified\n'


CalledProcessError: Command '['sbatch', '--gres=gpu:4', '-w balrog', '-p jsteinhardt', '/data/fjiahai/extractive_structures/slurm/eval_ocr.sh']' returned non-zero exit status 1.