In [1]:
import glob

game_files = glob.glob("./z-machine-games-master/jericho-game-suite/zork1.z5")
main_game_file = "./z-machine-games-master/jericho-game-suite/zork1.z5"

train_model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit" #"unsloth/llama-3-8b-bnb-4bit"
chat_template_name = "llama-3.2"

In [2]:
import jericho


def get_steps(filename: str):
    env = jericho.FrotzEnv(filename)
    
    initial_obs, info = env.reset()
    walkthrough = env.get_walkthrough()

    steps = []
    
    obs = initial_obs
    for step in walkthrough:
        steps.append((obs, step))
        #print(obs, step)
        obs, reward, done, info = env.step(step)
        #print(reward)
        if done:
            break

    env.close()

    return steps


steps = []
for game_file in game_files:
    steps.append(get_steps(game_file))

In [3]:
from datasets import Dataset
#from unsloth import standardize_sharegpt

def steps_to_dataset(steps: list[list[tuple[str, str]]], length: int):
    convos = []

    for game in steps:
        convo = []
        n = 0
        
        for step in game:
            convo.append({"role": "user", "content": step[0]})
            convo.append({"role": "assistant", "content": step[1]})
            n += 1
            if length > 0 and n >= length:
                n = 0
                convos.append(convo)
                convo = []

        if len(convo) > 0:
            convos.append(convo)

    return Dataset.from_dict({"conversations": convos})

dataset = steps_to_dataset(steps, 5)
print(dataset[0])
#dataset = standardize_sharegpt(dataset)
#print(dataset[0])

{'conversations': [{'content': 'Copyright (c) 1981, 1982, 1983 Infocom, Inc. All rights reserved.\nZORK is a registered trademark of Infocom, Inc.\nRevision 88 / Serial number 840726\n\nWest of House\nYou are standing in an open field west of a white house, with a boarded front door.\nThere is a small mailbox here.\n\n', 'role': 'user'}, {'content': 'N', 'role': 'assistant'}, {'content': 'North of House\nYou are facing the north side of a white house. There is no door here, and all the windows are boarded up. To the north a narrow path winds through the trees.\n\n', 'role': 'user'}, {'content': 'N', 'role': 'assistant'}, {'content': 'Forest Path\nThis is a path winding through a dimly lit forest. The path heads north-south here. One particularly large tree with some low branches stands at the edge of the path.\n\n', 'role': 'user'}, {'content': 'U', 'role': 'assistant'}, {'content': "Up a Tree\nYou are about 10 feet above the ground nestled among some large branches. The nearest branch

In [4]:
# Taken from this article:
# https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/tutorial-how-to-finetune-llama-3-and-use-in-ollama
# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True
lora_rank = 64 # Larger rank = smarter, but slower

def load_model(model_name):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        fast_inference = True, # required for vLLM, https://github.com/huggingface/open-r1/issues/572
        max_lora_rank = lora_rank,
    )
    
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                         "gate_proj", "up_proj", "down_proj"],
        lora_alpha = 16,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
        use_rslora = False,
        loftq_config = None
    )
    
    tokenizer = get_chat_template(tokenizer, chat_template = chat_template_name)

    return model, tokenizer

model, tokenizer = load_model(train_model_name)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 12-09 15:33:12 [__init__.py:216] Automatically detected platform cuda.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!


In [5]:
# https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/datasets-guide
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt = False)
        for convo in convos
    ]
    return {'text': texts}

dataset = dataset.map(formatting_prompts_func, batched=True)
#dataset[0]
#dataset[0]['text']

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

enable_bf16 = is_bfloat16_supported()

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,
    formatting_func = formatting_prompts_func,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        # num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not enable_bf16,
        bf16 = enable_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    )
)

# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(1B_and_3B)-Conversational.ipynb
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
trainer_stats = trainer.train()

In [None]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

In [11]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)


def make_message(role, content):
    return {"role": role, "content": content}


def get_input_ids(messages):
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")


def generate_response(messages):
    input_ids = get_input_ids(messages)

    output_ids = model.generate(input_ids,
        max_new_tokens = 128,
    )
    out = tokenizer.batch_decode(output_ids)
    
    out_line = out[0]
    start_token = "<|end_header_id|>"
    end_token = tokenizer.eos_token
    
    start_index = out_line.rindex(start_token) + len(start_token)
    end_index = out_line.rindex(end_token)
    
    return out_line[start_index : end_index].strip()

In [12]:
msgs = [make_message("user", "You are in a room. You see an egg on a table and a chest of drawers.")]
generate_response(msgs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'Get egg'

In [13]:
msgs = [make_message("user", "You are in a cave. In front of you lies a sword sticking out of a large boulder.")]
generate_response(msgs)

'Get sword'

In [15]:
def run_game(game_filename: str, n_steps: int, with_history: bool):
    env = jericho.FrotzEnv(game_filename)

    messages = []
    
    obs, info = env.reset()
    print(obs)
    
    for i in range(n_steps):
        if not with_history:
            messages.clear()

        messages.append(make_message("user", obs))
        
        response = generate_response(messages)
        print(">", response)
        messages.append(make_message("assistant", response))
        
        obs, reward, done, info = env.step(response)
        print(obs)
        if done:
            break
    
    env.close()

run_game(main_game_file, 100, True)

Copyright (c) 1981, 1982, 1983 Infocom, Inc. All rights reserved.
ZORK is a registered trademark of Infocom, Inc.
Revision 88 / Serial number 840726

West of House
You are standing in an open field west of a white house, with a boarded front door.
There is a small mailbox here.


> W
Forest
This is a forest, with trees in all directions. To the east, there appears to be sunlight.


> E
Forest Path
This is a path winding through a dimly lit forest. The path heads north-south here. One particularly large tree with some low branches stands at the edge of the path.


> N
Clearing
You are in a clearing, with a forest surrounding you on all sides. A path leads south.
On the ground is a pile of leaves.


> get leaves
In disturbing the pile of leaves, a grating is revealed.
Taken.


> S
Forest Path


> N
Clearing


> S
Forest Path


> N
Clearing


> S
Forest Path


> N
Clearing


> S
Forest Path


> N
Clearing


> S
Forest Path


> N
Clearing


> S
Forest Path
You hear in the distance the chir

In [7]:
def make_grpo_dataset(env: jericho.FrotzEnv):
    states = {}
    
    initial_obs, info = env.reset()
    walkthrough = env.get_walkthrough()

    prompts = []
    hashes = []
    
    obs = initial_obs
    for step in walkthrough:
        prompts.append([{"role": "user", "content": obs}])
        
        state_hash = env.get_world_state_hash()
        hashes.append(state_hash)
        if state_hash not in states:
            states[state_hash] = env.get_state()
        
        obs, reward, done, info = env.step(step)

        if done:
            break

    env.close()

    return Dataset.from_dict({"prompt": prompts, "state_hashes": hashes}), states


env = jericho.FrotzEnv(main_game_file)
grpo_dataset, states = make_grpo_dataset(env)


# Workaround for segfault in FrotzEnv.set_state
# Create + reuse a separate env within the GRPO reward function
reward_env = None

def shorten_response(response: str):
    return response.split("\n")[0]

def reward_func(prompts, completions, state_hashes, **kwargs): #(prompts, completions, **kwargs):
    global reward_env
    #env.set_state(states[state_hash])
    #env.step
    if reward_env is None:
        reward_env = jericho.FrotzEnv(main_game_file)
    #local_env = env.copy()
    scores = []
    for prompt, completion, state_hash in zip(prompts, completions, state_hashes):
        reward_env.set_state(states[state_hash])
        command = completion[0]["content"]

        cur_inv_size = len(reward_env.get_inventory())
        obs, reward, done, info = reward_env.step(command)
        new_inv_size = len(reward_env.get_inventory())

        if reward_env.get_world_state_hash() != state_hash:
            # Reward taking a valid action
            reward += 1.0
        if new_inv_size > cur_inv_size:
            # Reward picking up items
            reward += new_inv_size - cur_inv_size

        short_desc = shorten_response(prompt[0]["content"])
        short_obs = shorten_response(obs)
        #print(f"'{short_desc}': '{command}' -> '{short_obs}' {reward}")
        #print(info)

        scores.append(reward)
    #local_env.close()

    #print(scores)
    return scores



In [8]:
# https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/tutorial-train-your-own-reasoning-model-with-grpo
# https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb
from trl import GRPOConfig, GRPOTrainer

max_seq_length = 2048
max_prompt_length = 287 + 1

grpo_training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    #adam_beta1 = 0.9,
    #adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    #bf16 = is_bfloat16_supported(),
    #fp16 = not is_bfloat16_supported(),
    gradient_accumulation_steps = 4, # Increase to 4 for smoother training, decrease if OOM
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length,
    max_steps = 500,
    save_steps = 250,
    max_grad_norm = 1.0,
    report_to = "none",
    output_dir = "outputs",
)

In [9]:
model, tokenizer = load_model("lora_model")

INFO 12-09 15:33:36 [vllm_utils.py:702] Unsloth: Patching vLLM v1 graph capture
INFO 12-09 15:33:36 [vllm_utils.py:732] Unsloth: Patching vLLM v0 graph capture
==((====))==  Unsloth 2025.12.1: Fast Llama patching. Transformers: 4.57.3. vLLM: 0.10.2.
   \\   /|    NVIDIA RTX 4000 Ada Generation. Num GPUs = 1. Max memory: 19.548 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/llama-3.2-3b-instruct-bnb-4bit with actual GPU utilization = 49.44%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 19.55 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 32.
Unsloth: vLLM's KV Cache can use up to 7.11 GB. Also swap space = 6 GB.
Unsloth: Disabli

`torch_dtype` is deprecated! Use `dtype` instead!


INFO 12-09 15:33:44 [__init__.py:1815] Using max model len 2048
INFO 12-09 15:33:45 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=4096.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'llm_int8_skip_modules': ['lm_head', 'multi_modal_projector', 'merger', 'modality_projection'], 'llm_int8_threshold': 6.0}
INFO 12-09 15:33:46 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='unsloth/llama-3.2-3b-instruct-bnb-4bit', speculative_config=None, tokenizer='unsloth/llama-3.2-3b-instruct-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=bitsandb



INFO 12-09 15:33:46 [gpu_model_runner.py:2370] Loading model from scratch...
INFO 12-09 15:33:46 [cuda.py:362] Using Flash Attention backend on V1 engine.
INFO 12-09 15:33:47 [bitsandbytes_loader.py:758] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 12-09 15:33:47 [weight_utils.py:348] Using model weights format ['*.safetensors']
INFO 12-09 15:33:47 [weight_utils.py:406] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-09 15:33:47 [punica_selector.py:19] Using PunicaWrapperGPU.
INFO 12-09 15:33:48 [gpu_model_runner.py:2392] Model loading took 2.3519 GiB and 0.990941 seconds
INFO 12-09 15:33:52 [backends.py:539] Using cache directory: /s/chopin/a/grad/elewark/.cache/vllm/torch_compile_cache/dc2c8eddc4/rank_0_0/backbone for vLLM's torch.compile
INFO 12-09 15:33:52 [backends.py:550] Dynamo bytecode transform time: 3.37 s
INFO 12-09 15:33:56 [backends.py:161] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.560 s
INFO 12-09 15:33:58 [monitor.py:34] torch.compile takes 3.37 s in total
INFO 12-09 15:33:58 [gpu_worker.py:298] Available KV cache memory: 6.91 GiB
INFO 12-09 15:33:59 [kv_cache_utils.py:864] GPU KV cache size: 64,640 tokens
INFO 12-09 15:33:59 [kv_cache_utils.py:868] Maximum concurrency for 2,048 tokens per request: 31.56x
INFO 12-09 15:33:59 [vllm_utils.py:707] Unsloth: Running patched vLLM v1 `capture_model`.


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 11/11 [00:00<00:00, 12.33it/s]
Capturing CUDA graphs (decode, FULL): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 7/7 [00:00<00:00, 13.00it/s]

INFO 12-09 15:34:00 [gpu_model_runner.py:3118] Graph capturing finished in 1 secs, took 0.35 GiB
INFO 12-09 15:34:00 [vllm_utils.py:714] Unsloth: Patched vLLM v1 graph capture finished in 1 secs.





INFO 12-09 15:34:01 [gpu_worker.py:391] Free memory on device (19.31/19.55 GiB) on startup. Desired GPU memory utilization is (0.49444859013547565, 9.67 GiB). Actual usage is 2.35 GiB for weight, 0.39 GiB for peak activation, 0.02 GiB for non-torch memory, and 0.35 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=6881828352` to fit into requested memory, or `--kv-cache-memory=17234976256` to fully utilize gpu memory. Current kv cache memory in use is 7414504960 bytes.
INFO 12-09 15:34:01 [core.py:218] init engine (profile, create kv cache, warmup model) took 12.92 seconds
INFO 12-09 15:34:02 [llm.py:295] Supported_tasks: ('generate',)
INFO 12-09 15:34:02 [__init__.py:36] No IOProcessor plugins requested by the model
Unsloth: Just some info: will skip parsing ['layer_norm1', 'attention_norm', 'post_attention_layernorm', 'ffn_norm', 'norm', 'post_layernorm', 'pre_feedforward_layernorm', 'norm1', 'post_feedforward_layernorm', 'layer_norm2', 'norm2', 

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at unsloth/llama-3.2-3b-instruct-bnb-4bit and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Performing substitution for additional_keys=set()
Unsloth: Just some info: will skip parsing ['layer_norm1', 'attention_norm', 'post_attention_layernorm', 'ffn_norm', 'norm', 'post_layernorm', 'pre_feedforward_layernorm', 'norm1', 'post_feedforward_layernorm', 'layer_norm2', 'cross_attn_input_layernorm', 'cross_attn_post_attention_layernorm', 'norm2', 'k_norm', 'input_layernorm', 'q_norm']


Unsloth 2025.12.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [10]:
# https://csolab.research.google.com/github/unslothai/notebooks/blob/main/nb/Advanced_Llama3_2_(3B)_GRPO_LoRA.ipynb

grpo_trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        reward_func
    ],
    args = grpo_training_args,
    train_dataset = grpo_dataset,
)
grpo_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 396 | Num Epochs = 2 | Total steps = 500
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,sampling / sampling_logp_difference / mean,sampling / sampling_logp_difference / max,sampling / importance_sampling_ratio / min,sampling / importance_sampling_ratio / mean,sampling / importance_sampling_ratio / max,kl,rewards / reward_func / mean,rewards / reward_func / std
1,0.0073,0.0,0.0,3.5,2.0,4.0,0.0,3.5,2.0,4.0,0,0,0,0,0,7.347731,0.0,0.0
2,0.008,0.0,0.0,3.0,3.0,3.0,0.0,3.0,3.0,3.0,No Log,No Log,No Log,No Log,No Log,8.002329,0.0,0.0
3,0.0038,0.25,0.5,2.25,2.0,3.0,0.0,2.25,2.0,3.0,No Log,No Log,No Log,No Log,No Log,9.198926,0.25,0.5
4,0.0119,0.0,0.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,11.857876,0.0,0.0
5,0.0094,2.25,3.201562,2.75,2.0,4.0,0.0,2.75,2.0,4.0,No Log,No Log,No Log,No Log,No Log,9.149116,2.25,3.201562
6,0.0146,0.5,0.57735,2.25,2.0,3.0,0.0,2.25,2.0,3.0,No Log,No Log,No Log,No Log,No Log,10.018931,0.5,0.57735
7,0.0096,0.0,0.0,3.75,2.0,5.0,0.0,3.75,2.0,5.0,No Log,No Log,No Log,No Log,No Log,9.637768,0.0,0.0
8,0.0093,1.0,0.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,9.349692,1.0,0.0
9,-0.0023,1.0,1.154701,3.0,3.0,3.0,0.0,3.0,3.0,3.0,No Log,No Log,No Log,No Log,No Log,8.992105,1.0,1.154701
10,0.0104,1.0,0.0,2.0,2.0,2.0,0.0,2.0,2.0,2.0,No Log,No Log,No Log,No Log,No Log,10.392747,1.0,0.0


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=500, training_loss=0.010674796713050454, metrics={'train_runtime': 441.8259, 'train_samples_per_second': 4.527, 'train_steps_per_second': 1.132, 'total_flos': 0.0, 'train_loss': 0.010674796713050454})