In [1]:
from pathlib import Path
from tqdm import tqdm
import jsonlines
import os
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)

from peft import LoraConfig, get_peft_model

# Config

In [2]:
MODEL_NAME = "Qwen/Qwen3-0.6B"
DATASET_PATH = Path("FIREBALL/filtered")

MODEL_SAVE_DIR = Path("../models/core")

In [3]:
MODEL_SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Data

## Load

In [4]:
if not DATASET_PATH.parent.exists():
    !git lfs install && git clone https://huggingface.co/datasets/lara-martin/FIREBALL

from [dataset info](https://huggingface.co/datasets/lara-martin/FIREBALL)
```
{
    "speaker_id": The anonymized user ID of the user who sent the commands in the triple. 
    "before_utterances": A list of strings corresponding to the "preceding" utterances in the triple.
    "combat_state_before": A list of normalized actor states (see below) for each actor in the combat instance at the instant before the command was run.
    "current_actor": (nullable) The normalized actor state of the actor whose turn it currently is.
    "commands_norm": A list of strings corresponding to the "commands" portion of the triple.
    "automation_results": A mechanically generated list of strings representing the results of running the action in the Avrae engine.
    "caster_after": The normalized actor state of the actor who ran the action(s), which may or may not be the current actor.
    "targets_after": A list of normalized actor states for each actor who was targeted by the action.
    "combat_state_after": A list of normalized actor states for each actor in the combat instance at the instant after the command was run.
    "after_utterances": A list of strings corresponding to the "following" utterances in the triple.
    "utterance_history": The last 5 messages in the chat history before the command was run.
    "before_idxs": A list of integers corresponding to the index of the "message" events containing the "preceding" utterances in the raw event file.
    "before_state_idx": The index of the "combat_state_update" event in the raw event file that was used to derive "combat_state_before".
    "command_idxs": The indexes of the "command" events corresponding to the "commands_norm" key.
    "after_state_idx": The index of the "combat_state_update" event corresponding to the "combat_state_after" key.
    "after_idxs": The indexes of the "message" events corresponding to the "after_utterances" key.
    "embed_idxs": (nullable, same length as "automation_results") The indexes of "message" events corresponding to rich results shown to players on Discord for each result in the "automation_results" key.
}
```

## Preprocess

In [102]:
def string_filtrate(s: str) -> str:
    return s.replace("\n", " ").replace("\t", " ").replace("*", "")

In [103]:
def preprocess(obj: dict | pd.Series) -> tuple[str, str]:
    before_utterances = obj.get("before_utterances", [])
    automation_results =  obj.get("automation_results", [])
    after_utterances = obj.get("after_utterances", [])
    return f'''
[CONTEXT]: {string_filtrate(' '.join(before_utterances))}
[COMMAND]: {string_filtrate(' '.join(automation_results))}
''', string_filtrate(' '.join(after_utterances))


In [108]:
samples = []

for file in tqdm(DATASET_PATH.iterdir()):
    with jsonlines.open(file) as reader:
        for obj in reader:
            prompt, response = preprocess(obj)
            if prompt and response:
                samples.append({"prompt": prompt, "response": response})

1471it [00:06, 226.21it/s]


In [124]:
dataset = Dataset.from_list(samples).train_test_split(test_size=0.1)

In [125]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response'],
        num_rows: 39031
    })
    test: Dataset({
        features: ['prompt', 'response'],
        num_rows: 4337
    })
})

# Model

## Tokenization

In [126]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right",
    pad_token="[EOT]" # end of text
)

tokenizer.add_special_tokens({
    "pad_token": "[EXTRA_0]",
    "eos_token": "[EOT]",
    "bos_token": "[EOT]"
})


1

In [127]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map={'': torch.cuda.current_device()},
    trust_remote_code=True
)

model.resize_token_embeddings(len(tokenizer))

Embedding(151671, 1024)

In [128]:
def format_and_tokenize(examples):
    texts = [
        f"[im_start]system\nТы Dungeon Master[im_end]\n"
        f"[im_start]user\n{str(prompt)}[im_end]\n"
        f"[im_start]assistant\n{str(response)}[im_end]"
        for prompt, response in zip(examples['prompt'], examples['response'])
    ]

    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
        add_special_tokens=False
    )

    # Убираем requires_grad и преобразуем в списки
    return {
        "input_ids": tokenized["input_ids"].tolist(),
        "attention_mask": tokenized["attention_mask"].tolist(),
        "labels": tokenized["input_ids"].tolist()
    }

In [129]:
dataset = dataset.map(
    format_and_tokenize,
    batched=True,
    remove_columns=["prompt", "response"],
    desc="Formatting and tokenizing"
)

Formatting and tokenizing:   0%|          | 0/39031 [00:00<?, ? examples/s]

Formatting and tokenizing:   0%|          | 0/4337 [00:00<?, ? examples/s]

##  LoRa config

In [130]:
lora_config = LoraConfig(
    r=8,  # Уменьшено с 16
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # Только ключевые модули
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,146,880 || all params: 596,925,440 || trainable%: 0.1921


## Train config

In [135]:
training_args = TrainingArguments(
    output_dir=MODEL_SAVE_DIR,
    per_device_train_batch_size=4,  # Увеличен размер батча (если позволяет память GPU)
    gradient_accumulation_steps=2,  # Увеличена аккумуляция градиентов
    learning_rate=2e-4,
    num_train_epochs=1,
    logging_steps=100,
    fp16=True,
    optim="adamw_bnb_8bit",  # Более эффективный оптимизатор
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    report_to="none",
    gradient_checkpointing=False,  # Отключено для ускорения
    load_best_model_at_end=True,
    remove_unused_columns=True,
    label_names=["labels"],
    max_grad_norm=0.3,
    dataloader_num_workers=2,  # Параллельная загрузка данных
    torch_compile=False  # Компиляция графа вычислений
)

In [136]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)

In [137]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
)

In [None]:
trainer.train()
model.save_pretrained(OUTPUT_DIR)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss
500,2.1148,2.177814
1000,2.1272,2.12191
1500,2.0738,2.085763
2000,2.028,2.059729
2500,1.9426,2.037523


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av