In [1]:
import glob

game_files = [
    name
    for name in glob.glob("./z-machine-games-master/jericho-game-suite/*.z5")
    if "zork1" not in name
]
main_game_file = "./z-machine-games-master/jericho-game-suite/zork1.z5"

train_model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" #"unsloth/llama-3-8b-bnb-4bit"
chat_template_name = "llama-3.2"

In [2]:
import jericho


def get_steps(filename: str):
    env = jericho.FrotzEnv(filename)
    
    initial_obs, info = env.reset()
    walkthrough = env.get_walkthrough()

    steps = []
    
    obs = initial_obs
    for step in walkthrough:
        steps.append((obs, step))
        #print(obs, step)
        obs, reward, done, info = env.step(step)
        #print(reward)
        if done:
            break

    env.close()

    return steps


steps = []
for game_file in game_files:
    steps.append(get_steps(game_file))

In [3]:
from datasets import Dataset
#from unsloth import standardize_sharegpt

def steps_to_dataset(steps: list[list[tuple[str, str]]], length: int, overlap: bool = True):
    convos = []

    for game in steps:
        convo = []
        n = 0
        
        for step in game:
            convo.append({"role": "user", "content": step[0]})
            convo.append({"role": "assistant", "content": step[1]})
            n += 1
            if overlap:
                if length > 0 and n > length:
                    n -= 1
                    convo.pop(0)
                    convo.pop(0)
                    
                convos.append(list(convo))
            else:
                if length > 0 and n >= length:
                    n = 0
                    convos.append(convo)
                    convo = []

        if len(convo) > 0:
            convos.append(convo)

    return Dataset.from_dict({"conversations": convos})

dataset = steps_to_dataset(steps, 5)
print(dataset[0])
#dataset = standardize_sharegpt(dataset)
#print(dataset[0])

{'conversations': [{'content': '\nWelcome to Adventure!\n\nMurdac\nAn adventure game by Jonathan R. Partington (Cambridge University, 1982)\n[This translation: version 1.111115 / Phoenix v1.04 / Inform v6.32\nPlease type "inform" for further details.]\n\nWelcome to the Land of Murdac. This is version 1.07.\n\nType HELP for basic information, and BLURB for the full story.\nAll comments to JRP1 please. New commands BRIEF/TERSE,\nNORMAL/STANDARD, VERBOSE and EXAMINE have now been added.\nYou are standing outside the door of a small flint hut.\nThere are paths off to the east, west and south.\nThe door is locked', 'role': 'user'}, {'content': 's', 'role': 'assistant'}]}


In [21]:
# Taken from this article:
# https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama
# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True
lora_rank = 64 # Larger rank = smarter, but slower

def load_model(model_name):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        fast_inference = True, # required for vLLM, https://github.com/huggingface/open-r1/issues/572
        max_lora_rank = lora_rank,
    )
    
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16, # any number > 0, suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0, # supports any, but 0 is optimized
        bias = "none", # supports any, but "none" is optimized
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False, # We support rank stabilized LoRA
        loftq_config = None # And LoftQ
    )
    
    tokenizer = get_chat_template(tokenizer, chat_template = chat_template_name)

    return model, tokenizer

model, tokenizer = load_model(train_model_name)

INFO 12-10 00:06:06 [vllm_utils.py:702] Unsloth: Patching vLLM v1 graph capture
INFO 12-10 00:06:06 [vllm_utils.py:732] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3. vLLM: 0.10.2.
   \\   /|    NVIDIA RTX 4000 Ada Generation. Num GPUs = 1. Max memory: 19.548 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU cannot handle sequence lengths of 256 due to limited GPU memory.
Unsloth: Your GPU can only handle approximately the maximum sequence length of 256.
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-bnb-4bit with actual GPU utilization = 3.47%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 19.55 GB.
Unsloth: Usi

RuntimeError: Duplicate layer name: model.layers.0.self_attn.attn

In [5]:
# https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt = False)
        for convo in convos
    ]
    return {'text': texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
#dataset[0]
#dataset[0]['text']

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/7742 [00:00<?, ? examples/s]

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

enable_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences
    formatting_func = formatting_prompts_func,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 120, #60,
        # num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not enable_bf16,
        bf16 = enable_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    )
)

# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(1B_and_3B)-Conversational.ipynb
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Unsloth: Tokenizing ["text"] (num_proc=36):   0%|          | 0/7742 [00:00<?, ? examples/s]

Map (num_proc=36):   0%|          | 0/7742 [00:00<?, ? examples/s]

In [7]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,742 | Num Epochs = 1 | Total steps = 120
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.2298
2,3.457
3,3.9044
4,4.183
5,3.5119
6,3.7399
7,2.3797
8,2.5242
9,2.4527
10,2.5528


In [8]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/chat_template.jinja',
 'lora_model/tokenizer.json')

In [9]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)


def make_message(role, content):
    return {"role": role, "content": content}


def get_input_ids(messages):
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")


def generate_response(messages):
    input_ids = get_input_ids(messages)

    output_ids = model.generate(input_ids,
        max_new_tokens = 128,
    )
    out = tokenizer.batch_decode(output_ids)

    #print(out)
    
    out_line = out[0]
    start_token = "<|end_header_id|>"
    end_token = tokenizer.eos_token
    
    start_index = out_line.rindex(start_token) + len(start_token)
    end_index = out_line.rindex(end_token)
    
    return out_line[start_index : end_index].strip()

In [10]:
msgs = [make_message("user", "You are in a room. You see an egg on a table and a chest of drawers.")]
generate_response(msgs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'open chest'

In [11]:
msgs = [make_message("user", "You are in a cave. In front of you lies a sword sticking out of a large boulder.")]
generate_response(msgs)

'get sword'

In [12]:
def run_game(game_filename: str, n_steps: int, with_history: bool):
    env = jericho.FrotzEnv(game_filename)

    messages = []
    
    obs, info = env.reset()
    print(obs)
    
    for i in range(n_steps):
        if not with_history:
            messages.clear()

        messages.append(make_message("user", obs))
        
        response = generate_response(messages)
        print(">", response)
        messages.append(make_message("assistant", response))
        
        obs, reward, done, info = env.step(response)
        print(obs)
        if done:
            break
    
    env.close()

run_game(main_game_file, 100, True)

Copyright (c) 1981, 1982, 1983 Infocom, Inc. All rights reserved.
ZORK is a registered trademark of Infocom, Inc.
Revision 88 / Serial number 840726

West of House
You are standing in an open field west of a white house, with a boarded front door.
There is a small mailbox here.


> GET MAILBOX
It is securely anchored.


> GET MAIL
You can't see any mail here!


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is s

In [18]:
def make_grpo_dataset(env: jericho.FrotzEnv):
    states = {}
    
    initial_obs, info = env.reset()
    walkthrough = env.get_walkthrough()

    prompts = []
    hashes = []
    
    obs = initial_obs
    for step in walkthrough:
        prompts.append([{"role": "user", "content": obs}])
        
        state_hash = env.get_world_state_hash()
        hashes.append(state_hash)
        if state_hash not in states:
            states[state_hash] = env.get_state()
        
        obs, reward, done, info = env.step(step)

        if done:
            break

    env.close()

    return Dataset.from_dict({"prompt": prompts, "state_hashes": hashes}), states


env = jericho.FrotzEnv(main_game_file)
grpo_dataset, states = make_grpo_dataset(env)


# Workaround for segfault in FrotzEnv.set_state
# Create + reuse a separate env within the GRPO reward function
reward_env = None

def shorten_response(response: str):
    return response.split("\n")[0]

def reward_func(prompts, completions, state_hashes, **kwargs): #(prompts, completions, **kwargs):
    global reward_env
    #env.set_state(states[state_hash])
    #env.step
    if reward_env is None:
        reward_env = jericho.FrotzEnv(main_game_file)
    #local_env = env.copy()
    scores = []
    for prompt, completion, state_hash in zip(prompts, completions, state_hashes):
        reward_env.set_state(states[state_hash])
        command = completion[0]["content"]

        cur_inv_size = len(reward_env.get_inventory())
        obs, reward, done, info = reward_env.step(command)
        new_inv_size = len(reward_env.get_inventory())

        if reward_env.get_world_state_hash() == state_hash:
            # Punish taking an invalid action
            reward -= 1.0
        #if reward_env.get_world_state_hash() != state_hash:
        #    # Reward taking a valid action
        #    reward += 1.0
        if new_inv_size > cur_inv_size:
            # Reward picking up items
            reward += new_inv_size - cur_inv_size

        short_desc = shorten_response(prompt[0]["content"])
        short_obs = shorten_response(obs)
        #print(f"'{short_desc}': '{command}' -> '{short_obs}' {reward}")
        #print(info)

        scores.append(reward)
    #local_env.close()

    #print(scores)
    return scores



In [19]:
# https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo
# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb
from trl import GRPOConfig, GRPOTrainer

max_seq_length = 2048
max_prompt_length = 287 + 1

grpo_training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    #adam_beta1 = 0.9,
    #adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    #bf16 = is_bfloat16_supported(),
    #fp16 = not is_bfloat16_supported(),
    gradient_accumulation_steps = 4, # Increase to 4 for smoother training, decrease if OOM
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length,
    max_steps = 500,
    save_steps = 250,
    max_grad_norm = 1.0,
    report_to = "none",
    output_dir = "outputs",
)

In [20]:
model, tokenizer = load_model("lora_model")

INFO 12-09 23:29:15 [vllm_utils.py:702] Unsloth: Patching vLLM v1 graph capture
INFO 12-09 23:29:15 [vllm_utils.py:732] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3. vLLM: 0.10.2.
   \\   /|    NVIDIA RTX 4000 Ada Generation. Num GPUs = 1. Max memory: 19.548 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU cannot handle sequence lengths of 256 due to limited GPU memory.
Unsloth: Your GPU can only handle approximately the maximum sequence length of 256.
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-bnb-4bit with actual GPU utilization = 10.56%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 19.55 GB.
Unsloth: Us

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-09 23:29:19 [gpu_model_runner.py:2392] Model loading took 2.3205 GiB and 0.668185 seconds
INFO 12-09 23:29:23 [backends.py:539] Using cache directory: /s/chopin/a/grad/elewark/.cache/vllm/torch_compile_cache/11a7d752eb/rank_0_0/backbone for vLLM's torch.compile
INFO 12-09 23:29:23 [backends.py:550] Dynamo bytecode transform time: 3.23 s
INFO 12-09 23:29:27 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.394 s
INFO 12-09 23:29:28 [monitor.py:34] torch.compile takes 3.23 s in total
INFO 12-09 23:29:29 [gpu_worker.py:298] Available KV cache memory: -0.47 GiB
Unsloth: Retrying vLLM to process 6 sequences and 2048 tokens in tandem.
Error:
No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
INFO 12-09 23:29:31 [utils.py:328] non-default args: {'load_format': 'bitsandbytes', 'dtype': torch.bfloat16, 'seed': 0, 'max_model_len': 256, 'enable_prefix_caching': True, 'disable_cascade_

RuntimeError: Duplicate layer name: model.layers.0.self_attn.attn

In [None]:
# https://csolab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb

grpo_trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        reward_func
    ],
    args = grpo_training_args,
    train_dataset = grpo_dataset,
)
grpo_trainer.train()

In [17]:
run_game(main_game_file, 100, True)

Copyright (c) 1981, 1982, 1983 Infocom, Inc. All rights reserved.
ZORK is a registered trademark of Infocom, Inc.
Revision 88 / Serial number 840726

West of House
You are standing in an open field west of a white house, with a boarded front door.
There is a small mailbox here.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is securely anchored.


> GET MAILBOX
It is se