<a href="https://colab.research.google.com/github/hieunguyen7337/LLM_RL/blob/main/Hangman_game_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithPadding
import gc
import random
import json
import torch
import os
from tqdm.auto import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader

In [5]:
!git clone https://github.com/hieunguyen7337/LLM_RL.git

Cloning into 'LLM_RL'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 76 (delta 33), reused 12 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (76/76), 457.83 KiB | 1.09 MiB/s, done.
Resolving deltas: 100% (33/33), done.


In [None]:
from huggingface_hub import notebook_login

notebook_login()

# Setting

In [3]:
# ---------------------------
# Settings (safe for T4 or CPU)
# ---------------------------
# MODEL_NAME = "Qwen/Qwen3-0.6B"
# MODEL_NAME = "Qwen/Qwen3-8B"
MODEL_NAME = "Qwen/Qwen2.5-0.5B"
# MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
# MODEL_NAME = "google/gemma-3-270m"
# MODEL_NAME       = "facebook/opt-350m"   # FP32 on T4: start small; try opt-1.3b if VRAM allows
DATA_PATH        = "/content/LLM_RL/hangman_dataset.json"
BATCH_SIZE       = 512                    # raise/lower based on VRAM; FP32 uses ~2x memory vs FP16
MAX_NEW_TOKENS   = 1
TRUNCATE_TO      = 2048                  # shorten if memory-bound
PAD_TO_MULTIPLE  = 8                     # helps kernel efficiency; fine in FP32
NUM_WORKERS      = 0                     # dataloader workers

In [4]:
# Accuracy-first: disable TF32 (not relevant on T4, but safe everywhere)
torch.backends.cuda.matmul.allow_tf32 = False
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("highest")  # accuracy-preferring

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE  = torch.float32  # force full precision

#Generate Random word list

In [None]:
!pip install faker

In [2]:
from faker import Faker

fake = Faker()

In [3]:
fake.word()

'town'

In [6]:
with open('/content/LLM_RL/testing_random_words_list.txt', 'r') as f:
  word_list = f.readlines()

# Remove newline characters
word_list = [word.strip() for word in word_list]

In [9]:
random_words = []
while len(random_words) < 500:
    if len(random_words) % 100 == 0:
        print(f"Generated {len(random_words)} words")
    w = fake.word()
    if w not in word_list and w not in random_words:
        random_words.append(w)

Generated 0 words
Generated 100 words
Generated 100 words
Generated 100 words
Generated 200 words
Generated 200 words
Generated 200 words
Generated 300 words
Generated 300 words
Generated 300 words
Generated 300 words
Generated 400 words


In [11]:
random_words[24:31]

['within', 'represent', 'method', 'know', 'guy', 'become', 'whatever']

In [12]:
file_path = "training_random_words_list.txt"

with open(file_path, "w") as f:
    for word in random_words:
        f.write(word + "\n")

#Preprocess Data

In [28]:
import random
import json

In [13]:
with open('/content/LLM_RL/testing_random_words_list.txt', 'r') as f:
  word_list = f.readlines()

# Remove newline characters
word_list = [word.strip() for word in word_list]

In [14]:
print(f"\nNumber of words: {len(word_list)}")


Number of words: 500


In [15]:
# Check for duplicate words
if len(word_list) == len(set(word_list)):
  print("No duplicate words found in the list.")
else:
  print("Duplicate words found in the list.")

No duplicate words found in the list.


In [16]:
# Remove words with hyphens
word_list = [word for word in word_list if '-' not in word and ' ' not in word]

# Check the number of words remaining
print(f"\nNumber of words remaining after removing hyphens: {len(word_list)}")


Number of words remaining after removing hyphens: 500


In [17]:
# # Get a specified number of random words (e.g., 5)
# # num_random_words = 200
# num_random_words = 2
# random_words = random.sample(word_list, num_random_words)
# # print(f"\n{num_random_words} random words from the list")

In [18]:
random_words = word_list

In [25]:
# Create a list to store the input texts
hangman_dataset = []
input_texts = []

for secret_word in random_words:
  for i in range(5):
    word_length = len(secret_word)
    current_state = ["_" for _ in range(word_length)]
    incorrect_guesses_remaining = 6
    guessed_letters = []

    # Simulate some random guesses to create a game in progress
    num_simulated_guesses = random.randint(0, len(secret_word)) # Simulate 0 to 3 guesses
    # num_simulated_guesses = 20
    available_letters = list("abcdefghijklmnopqrstuvwxyz")
    correct_letters = list(set(secret_word))

    for _ in range(num_simulated_guesses):
      if random.random() > 0.5:
        guess = random.choice(available_letters)
      else:
        guess = random.choice(correct_letters)

      correct_letters.remove(guess) if guess in correct_letters else None
      available_letters.remove(guess) if guess in available_letters else None

      guessed_letters.append(guess.upper())

      if guess in secret_word:
        for i in range(word_length):
          if secret_word[i] == guess:
            current_state[i] = guess

      else:
        incorrect_guesses_remaining -= 1
        if incorrect_guesses_remaining == 0:
          break # Game over during simulation

      if len(correct_letters) == 0:
        break

    # Create the input text for the prompt
    input_text = f"""You are playing a game of Hangman.

Your task is to guess a single character.

The word has a certain number of letters.
The current state of the word is shown with guessed letters filled in and blanks for the unknown letters.
The number of incorrect guesses remaining is listed.
All letters that have been guessed so far are listed.

You will format your response as a single uppercase letter at the end

The word has {word_length} letters.
The current state is: {' '.join(current_state)}
Incorrect guesses remaining: {incorrect_guesses_remaining}
Guessed letters: {guessed_letters}

Correct response:"""
    # if True:
    if incorrect_guesses_remaining > 0 and len(correct_letters) > 0 and input_text not in input_texts:
      hangman_dataset.append({"prompt": input_text, "word": secret_word})
      input_texts.append(input_text)

In [29]:
# Print the first generated input text to verify
print(len(hangman_dataset))
print()
i = 2
print(hangman_dataset[i]["prompt"])
print(hangman_dataset[i]["word"])

1825

You are playing a game of Hangman.

Your task is to guess a single character.

The word has a certain number of letters.
The current state of the word is shown with guessed letters filled in and blanks for the unknown letters.
The number of incorrect guesses remaining is listed.
All letters that have been guessed so far are listed.

You will format your response as a single uppercase letter at the end

The word has 5 letters.
The current state is: _ b _ _ _
Incorrect guesses remaining: 6
Guessed letters: ['B']

Correct response:
above


In [30]:
# Save both lists in one file
with open("training_hangman_dataset.json", "w", encoding="utf-8") as f:
    json.dump(hangman_dataset, f, indent=2, ensure_ascii=False)

#Load Preprocessed Data

In [5]:
# Load the JSON file
with open("/content/LLM_RL/hangman_dataset.json", "r", encoding="utf-8") as f:
    hangman_dataset = json.load(f)

In [6]:
len(hangman_dataset)

10429

In [7]:
i = 3
print(hangman_dataset[i]["prompt"])
print(hangman_dataset[i]["word"])

You are playing a game of Hangman.

Your task is to guess a single character.

The word has a certain number of letters.
The current state of the word is shown with guessed letters filled in and blanks for the unknown letters.
The number of incorrect guesses remaining is listed.
All letters that have been guessed so far are listed.

You will format your response as a single uppercase letter at the end

The word has 8 letters.
The current state is: i _ _ o _ _ _ _
Incorrect guesses remaining: 2
Guessed letters: ['S', 'W', 'I', 'Z', 'O', 'H']

Correct response:
imported


In [None]:
# ---------------------------
# Load dataset (JSON array or JSONL)
# ---------------------------
ds = load_dataset("json", data_files=DATA_PATH)["train"]
PROMPT_COL = "prompt"

#Load model

In [None]:
# ---------------------------
# Tokenizer & model (FP32)
# ---------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.padding_side = "left"
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=DTYPE,     # full precision
    device_map="auto" if DEVICE == "cuda" else None,
)
if DEVICE == "cpu":
    model.to(DEVICE)
model.eval()

In [11]:
# Optional compile (doesn't change numerics)
try:
    if DEVICE == "cuda":
        model = torch.compile(model, mode="max-autotune")
except Exception:
    pass

In [12]:
# ---------------------------
# Pre-tokenize (no padding here; collator handles it dynamically)
# ---------------------------
def tok(batch):
    return tokenizer(
        batch[PROMPT_COL],
        truncation=True,
        max_length=TRUNCATE_TO,
        padding=False,
        return_attention_mask=True,
    )

#Inference Model

In [None]:
# IMPORTANT: drop original string columns so the collator only sees tensors
tokenized = ds.map(tok, batched=True, remove_columns=ds.column_names)

In [14]:
collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    pad_to_multiple_of=PAD_TO_MULTIPLE if DEVICE == "cuda" else None,
    return_tensors="pt",
)

In [15]:
loader = DataLoader(
    tokenized,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=NUM_WORKERS,
    pin_memory=(DEVICE == "cuda"),
    persistent_workers=NUM_WORKERS > 0,
    collate_fn=collator,
)

In [16]:
# ---------------------------
# Generation (deterministic & fast)
# ---------------------------
gen_kwargs = dict(
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,  # greedy = deterministic & faster
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    use_cache=True,
)

In [None]:
all_generations = []
with torch.inference_mode():
    pbar = tqdm(total=len(loader.dataset), desc="Generating", unit="ex")
    for batch in loader:
        input_ids = batch["input_ids"].to(DEVICE, non_blocking=True)
        attention_mask = batch["attention_mask"].to(DEVICE, non_blocking=True)

        out = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict_in_generate=True,   # <-- add
            output_scores=False,            # <-- optional
            **gen_kwargs,
        )
        seqs = out.sequences                                   # [B, in_len + new_len]
        new_token_ids = seqs[:, input_ids.shape[1]:]           # <-- slice by true input length
        texts = tokenizer.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        all_generations.extend([t.strip() for t in texts])
        pbar.update(len(texts))

        # ---- memory cleanup per batch ----
        del batch, input_ids, attention_mask, out, new_token_ids, texts
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        gc.collect()

    pbar.close()

In [None]:
all_generations

In [19]:
# Attach to dataset and peek
ds = ds.add_column("model_output", all_generations)
print(ds.select(range(min(5, len(ds)))))

Dataset({
    features: ['prompt', 'word', 'model_output'],
    num_rows: 5
})


In [None]:
# (Optional) Save results
ds.to_json("data_with_outputs_" + MODEL_NAME.split("/")[1] + ".jsonl", orient="records", lines=True)

#Evaluation

In [2]:
import json

In [25]:
# Create an empty list to store the parsed JSON objects
out_Qwen2_5_0_5B_Instruct = []
out_Qwen2_5_0_5B = []

# Open the .jsonl file and read each line
with open("/content/data_with_outputs_1_Qwen2.5-0.5B-Instruct.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # For each line, parse the JSON object and append it to the list
        out_Qwen2_5_0_5B_Instruct.append(json.loads(line))

with open("/content/data_with_outputs_1_Qwen2.5-0.5B.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # For each line, parse the JSON object and append it to the list
        out_Qwen2_5_0_5B.append(json.loads(line))

In [30]:
len(out_Qwen2_5_0_5B)

10429

In [31]:
import re, ast
from typing import List, Dict, Any

In [118]:
# --- helpers to parse prompt fields ---
def parse_prompt(prompt: str):
    """
    Returns (guessed_letters_upper: List[str], state_tokens_upper: List[str])
    where state tokens are letters or '_' from the 'current state' line.
    """
    guessed_letters = []
    m = re.search(r"Guessed letters:\s*(\[[^\]]*\])", prompt)
    if m:
        try:
            guessed_letters = [s.upper() for s in ast.literal_eval(m.group(1)) if isinstance(s, str)]
        except Exception:
            guessed_letters = []
    guessed_letters = list(dict.fromkeys(guessed_letters))  # dedupe, keep order

    state_tokens = []
    m2 = re.search(r"The current state is:\s*([A-Za-z_ ]+)", prompt)
    if m2:
        state_line = m2.group(1)
        state_tokens = state_line.split()  # e.g. ["_", "_", "P", "O", "_", "_", "E", "_"]

    state_tokens_upper = [t.upper() for t in state_tokens]
    return guessed_letters, state_tokens_upper

def evaluate_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one record: expects keys 'prompt', 'word', 'model_output'.
    Returns a dict with is_correct, error_types, and extra diagnostics.
    """
    prompt = entry.get("prompt", "")
    word = str(entry.get("word", "")).upper()
    guess_raw = str(entry.get("model_output", "")).strip()

    errors = []
    valid = True

    # --- format checks ---
    if len(guess_raw) == 0:
        errors.append("Guess empty")
        valid = False
    elif len(guess_raw) != 1:
        errors.append("Guess is not a single character")
        valid = False

    guess = guess_raw if len(guess_raw) == 1 else None
    if guess is not None:
        if not guess.isalpha():
            errors.append("Non-alphabetic guess")
            valid = False

    guessed_letters, state_tokens = parse_prompt(prompt)

    # --- logical checks against game state ---
    # Determine if guess reveals at least one new position
    reveals_new = False
    in_word = False
    if guess is not None and guess.isalpha():
        in_word = guess.upper() in set(word)

        # Compare to current state; if any '_' position matches the guess in the target word -> reveals_new
        L = min(len(state_tokens), len(word))
        for i in range(L):
            if state_tokens[i] == "_" and word[i] == guess.upper():
                reveals_new = True
                break
        # If prompt state is shorter than word (edge case), consider remaining as hidden
        if not reveals_new and len(state_tokens) < len(word):
            for i in range(len(state_tokens), len(word)):
                if word[i] == guess.upper():
                    reveals_new = True
                    break

        # Already guessed?
        if guess.upper() in guessed_letters:
            errors.append("Already guessed")
        # Wrong letter?
        elif guess.upper() not in set(word):
            errors.append("Guess is not a character in word")

    is_correct = valid and (guess is not None) and in_word and (guess.upper() not in guessed_letters) and reveals_new

    return {
        "word": entry.get("word"),
        "guess_raw": guess_raw,
        "guess": guess if guess is not None else "",
        "guessed_letters_in_prompt": guessed_letters,
        "state_tokens_in_prompt": state_tokens,
        "is_correct": bool(is_correct),
        "error_types": errors if errors else ["Correct guess"],
    }

# --- evaluate a list of records (e.g., ds.to_list()) ---
def evaluate_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    return [evaluate_entry(r) for r in records]

In [119]:
results = evaluate_records(out_Qwen2_5_0_5B_Instruct)
# results = evaluate_records(out_Qwen2_5_0_5B)

In [80]:
# results[:3]

In [81]:
# out_Qwen2_5_0_5B[:3]

In [120]:
error = {}
for r in results:
  if r["error_types"][0] not in error and len(r["error_types"]) != 0:
    error[r["error_types"][0]] = 1
  else:
    error[r["error_types"][0]] += 1

In [121]:
error

{'Already guessed': 8858,
 'Non-alphabetic guess': 285,
 'Guess is not a single character': 1038,
 'Guess is not a character in word': 69,
 'Guess empty': 168,
 'Correct guess': 11}

In [126]:
for error_type in error:
  print("|", error_type, "|", error[error_type], "|", str(round(error[error_type]/104.29, 2))+"%", "|")

| Already guessed | 8858 | 84.94% |
| Non-alphabetic guess | 285 | 2.73% |
| Guess is not a single character | 1038 | 9.95% |
| Guess is not a character in word | 69 | 0.66% |
| Guess empty | 168 | 1.61% |
| Correct guess | 11 | 0.11% |


In [85]:
non_alpha_guess = {}
for r in results:
  if r["error_types"] == ['NON_ALPHA']:
    if r["guess_raw"] not in non_alpha_guess:
      non_alpha_guess[r["guess_raw"]] = 1
    else:
      non_alpha_guess[r["guess_raw"]] += 1

In [86]:
non_alpha_guess

{'_': 285}

#Test

In [None]:
input_text = """You are playing a game of Hangman.

Your task is to guess a single character.

The word has a certain number of letters.
The current state of the word is shown with guessed letters filled in and blanks for the unknown letters.
The number of incorrect guesses remaining is listed.
All letters that have been guessed so far are listed.

You will format your response as a single uppercase letter at the end

Here are a few examples to guide you:

Example 1
Prompt:
The word has 5 letters.
The current state is: A _ _ _ E
Incorrect guesses remaining: 4
Guessed letters: [A, E]
Correct response: R

Example 2
Prompt:
The word has 8 letters.
The current state is: B A _ _ _ _ A
Incorrect guesses remaining: 3
Guessed letters: [B, A]
Correct response: N

Example 3
Prompt:
The word has 7 letters.
The current state is: C H A _ _ _ _
Incorrect guesses remaining: 2
Guessed letters: [C, H, A]
Correct response: L

Example 4
Prompt:
The word has 6 letters.
The current state is: _ O O _ L _
Incorrect guesses remaining: 1
Guessed letters: [O, L]
Correct response: G

Example 5

The word has 10 letters.
The current state is: _ _ _ A _ _ _ I _ N
Incorrect guesses remaining: 4
Guessed letters: [A, I, N]
Correct response: S

Now, let's play the game.

The word has 6 letters.
The current state is: P _ _ _ O _
Incorrect guesses remaining: 5
Guessed letters: [E, P, O]

Correct response:"""
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(
    **input_ids,
    max_new_tokens=1)
print(tokenizer.decode(outputs[0])[-20:])


Correct response: R


In [None]:
# prepare the model input
prompt = """You are playing a game of Hangman.

The word has 6 letters.
The current state is: _ _ _ _ _ _
Incorrect guesses remaining: 6
Guessed letters: [ ]

Your task is to guess one letter. Output a single letter at the end."""
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)


thinking content: <think>
Okay, let's see. I need to figure out the correct letter to guess in this Hangman game. The word has 6 letters, and the current state is that there are four underscores, and the remaining guesses are 6. The guessed letters are empty. So the goal is to determine which letter to guess next.

Wait, but the problem says "output a single letter at the end". Hmm, maybe I need to pick a letter that's not guessed yet, not in the wrong position, and not part of the wrong letters. Let me think.

The initial state shows that there are four underscores. So the word has 6 letters, and the underscores are the letters that haven't been guessed yet. So the possible letters to guess could be any of the 6 letters that haven't been guessed yet, but also not in the wrong positions. Wait, but the problem says that the current state is "_ _ _ _ _ _", so maybe the underscores are the letters that need to be filled in. The remaining guesses are 6. So the user has 6 guesses left, but 