In [9]:
import json
import os
import sys

import hydra
import torch
from hydra.utils import instantiate
from omegaconf import DictConfig, open_dict, OmegaConf

notebook_dir = os.getcwd()

sys.path.append(os.path.abspath(os.path.join(notebook_dir, '..', 'llm-negotiations')))
from envs.negotiation_env import NegotiationEnv
from trainer.GRPOEnvTrainer import GRPOEnvTrainer
from trainer.utils import get_default_grpo_config
from simulator.games import Game




In [10]:
import os
# Export HYDRA_FULL_ERROR=1 for this kernel process.
# NOTE(review): per the Hydra docs this makes Hydra print complete stack traces
# instead of its shortened error summary - confirm it is still wanted here.
os.environ["HYDRA_FULL_ERROR"] = "1"

In [11]:

from helpers.utils import unpack_nested_yaml, fill_defaults, get_inference_root_overrides


In [12]:
from hydra import compose, initialize
from hydra.core.global_hydra import GlobalHydra
from hydra.core.hydra_config import HydraConfig
from helpers.utils import unpack_nested_yaml, get_inference_root_overrides, fill_defaults


In [13]:
from envs.negotiation_env import NegotiationEnv


# When launched through ipykernel, the kernel's own command-line arguments are
# still in sys.argv; keep only the program name to avoid argument parsing issues.
if 'ipykernel_launcher' in sys.argv[0]:
    sys.argv = sys.argv[:1]

# Clear any global Hydra state that is already initialized, so this cell can
# call initialize() again.
global_hydra = GlobalHydra.instance()
if global_hydra.is_initialized():
    global_hydra.clear()

# Initialize Hydra by hand against the project's config directory; compose()
# can only be called after this.
initialize(version_base=None, config_path="../llm-negotiations/configs")

# Compose the root inference config, keeping the `hydra` node in the result.
cfg = compose(config_name="inference_root", return_hydra_config=True)

# Store the composed config on the HydraConfig singleton.
HydraConfig.instance().cfg = cfg

# Record the absolute path of the sibling llm-negotiations directory
# (resolved from the current working directory) as cfg.work_dir.
negotiations_root = os.path.join(os.getcwd(), "../llm-negotiations")
with open_dict(cfg):
    cfg.work_dir = os.path.abspath(negotiations_root)

# Uncomment to inspect the composed config:
# print(OmegaConf.to_yaml(cfg))




def get_config(cfg: DictConfig, root_config_file: "str | None" = None, defaults_file: "str | None" = None):
    """Expand, default-fill and return the ``experiment`` section of ``cfg``.

    The experiment node is processed in this order: nested YAML files are
    unpacked, missing keys are filled from the defaults file (using the
    run-time overrides derived from the root config), and the node is unpacked
    once more in case the defaults pulled in further YAML files. The helpers'
    return values are discarded, so they are relied on to update the node in
    place. The resolved experiment config is also printed as indented JSON.

    Args:
        cfg: Composed root config; must contain an ``experiment`` node.
        root_config_file: Path of the root YAML passed to
            ``get_inference_root_overrides``. When ``None`` it defaults to
            ``<cfg.work_dir>/configs/inference_root.yaml``.
        defaults_file: Path of the defaults YAML passed to ``fill_defaults``.
            When ``None`` it defaults to
            ``<cfg.work_dir>/configs/negotiation_defaults.yaml``.

    Returns:
        The ``cfg.experiment`` node.
    """
    if root_config_file is None or defaults_file is None:
        # Prefer the project checkout recorded on the config over a path that
        # only exists on one machine. The original cluster location is kept as
        # a fallback for configs that carry no work_dir.
        if "work_dir" in cfg and cfg["work_dir"]:
            configs_dir = os.path.join(str(cfg["work_dir"]), "configs")
        else:
            configs_dir = "/cluster/home/fraluca/negotio2/llm-negotiations/configs"
        if root_config_file is None:
            root_config_file = os.path.join(configs_dir, "inference_root.yaml")
        if defaults_file is None:
            defaults_file = os.path.join(configs_dir, "negotiation_defaults.yaml")

    # open_dict lets the helpers add keys to the experiment node.
    with open_dict(cfg['experiment']):
        # unpack nested yaml files
        _ = unpack_nested_yaml(cfg['experiment'])
        # check if any keys are missing and update default run-time overrides
        overrides = get_inference_root_overrides(cfg, root_config_file)
        _ = fill_defaults(cfg['experiment'], root_overrides=overrides, defaults_file=defaults_file)
        # unpack default yaml files (if any)
        _ = unpack_nested_yaml(cfg['experiment'])

        config = cfg.experiment

        # Resolve interpolations into plain containers so json can serialize them.
        config_dict = OmegaConf.to_container(config, resolve=True)
        print("Config:\n", json.dumps(config_dict, indent=4))

        return config



# Expand and default-fill the experiment section of the composed config.
# get_config also prints the resolved result as JSON.
config = get_config(cfg)

# Show the same experiment config rendered as YAML.
print(OmegaConf.to_yaml(config))

Config:
 {
    "num_negotiations": 17000,
    "checkpoint_frequency": 2500,
    "dataset_name": "LuckyLukke/negotio_REFUEL_8B_twosided",
    "onesided": false,
    "seed": 188,
    "game": {
        "_target_": "simulator.games.Game",
        "name": "generic-rental-agreement",
        "issues": [
            "gen-ra-rent.yaml"
        ],
        "issue_weights": [
            [
                1
            ],
            [
                1
            ]
        ],
        "scale": [
            100,
            100
        ],
        "description": "A landlord and a prospective tenant are negotiating a rental agreement.",
        "sides": [
            "You are an advisor representing the best interests of the landlord. Your main goal is to negotiate the best possible agreement for the landlord based on the information in the payoff tables. The numbers in the payoff tables show how valuable each outcome is to you. You can trust that the payoffs assigned to the different options in y

In [14]:

# Build the negotiation environment from the resolved experiment config.
# NOTE: the recorded run of this cell stopped on this line with a CUDA
# OutOfMemoryError while instantiating `experiment.agent_1.model` (GPU 0 was
# already almost fully allocated), so make sure the GPU is free before running.
negotiation_env = NegotiationEnv(config)

print("NegotiationEnv initialized")

# Datasets generated by the environment: 2000 training and 200 test samples.
train_dataset = negotiation_env.get_dataset(size=2000)
test_dataset = negotiation_env.get_dataset(size=200)

reward_functions = negotiation_env.get_reward_functions()

# notable defaults: lr = 1e-6, max_grad_norm = 0.01, constant lr 10 warmup steps, 1024 tokens in+out
run_name = "grpo_negotiation_test_1"
num_gpus = torch.cuda.device_count()
training_args = get_default_grpo_config(run_name=run_name, num_gpus=num_gpus)

# Model that should be trained (passed to the trainer by name).
agent_model = negotiation_env.agent_1.model
model = agent_model.model_name
print("Model to be trained:", model)

# `tokenizer` was never defined in this notebook, so the pad-token setup below
# raised NameError on a fresh kernel.
# NOTE(review): assumes the agent's model wrapper exposes its tokenizer as
# `.tokenizer` - confirm against models.huggingface_model.HuggingFaceModel.
tokenizer = getattr(agent_model, "tokenizer", None)
if tokenizer is None:
    raise RuntimeError(
        "No tokenizer found on negotiation_env.agent_1.model; load the tokenizer "
        f"for {model!r} explicitly before configuring the pad token."
    )

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

  

{'_target_': 'simulator.games.Game', 'name': 'generic-rental-agreement', 'issues': ['gen-ra-rent.yaml'], 'issue_weights': [[1], [1]], 'scale': [100, 100], 'description': 'A landlord and a prospective tenant are negotiating a rental agreement.', 'sides': ['You are an advisor representing the best interests of the landlord. Your main goal is to negotiate the best possible agreement for the landlord based on the information in the payoff tables. The numbers in the payoff tables show how valuable each outcome is to you. You can trust that the payoffs assigned to the different options in your table are accurate. Do not bring up any issues that are not specifically noted in your payoff table. It is possible that there is only 1 issue.', 'You are an advisor representing the best interests of the tenant. Your main goal is to negotiate the best possible agreement for the tenant based on the information in the payoff tables. The numbers in the payoff tables show how valuable each outcome is to y

InstantiationException: Error in call to target 'models.huggingface_model.HuggingFaceModel':
OutOfMemoryError('CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 10.57 GiB of which 11.06 MiB is free. Including non-PyTorch memory, this process has 10.55 GiB memory in use. Of the allocated memory 10.21 GiB is allocated by PyTorch, with 23.47 MiB allocated in private pools (e.g., CUDA Graphs), and 55.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)')
full_key: experiment.agent_1.model

In [None]:
# Assemble the GRPO trainer from the objects prepared in the previous cell.
# The original passed `train_dataset=dataset`, but no `dataset` is ever defined;
# the training split created earlier is `train_dataset`.
# NOTE(review): `test_dataset` is built earlier but not passed here - confirm
# whether GRPOEnvTrainer accepts an evaluation dataset.
trainer = GRPOEnvTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_functions,
    env=negotiation_env,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()