<a href="https://colab.research.google.com/github/linux-leo/llm-notebooks/blob/main/Copy_of_Chess_LLM_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Modified version of Maxime Labonne's Chess LLM Trainer; credit goes to him for the original.

# @title # ♟️ Chess LLM Trainer
# @markdown <center><h3>💻 <a href="https://gist.github.com/chessllm">GitHub</a>
# @markdown • ⚔️ <a href="https://huggingface.co/spaces/mlabonne/chessllm">Arena</a>
# @markdown • 🏆 <a href="https://gist.github.com/chessllm/696115fe2df47fb2350fcff2663678c9">Leaderboard</a>
# @markdown • ♟️ <a href="https://colab.research.google.com/drive/11UjbfajCzphe707_V7PD-2e5WIzyintf?usp=sharing">Dataset</a>
# @markdown </h3></center><br/>
!git clone -q https://github.com/mlabonne/chessllm.git
!pip install -qr chessllm/requirements.txt --progress-bar off
%cd chessllm
!pip install -e .
!pip install liger-kernel-nightly

import os

from datasets import load_dataset
from google.colab import userdata
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from liger_kernel.transformers import AutoLigerKernelForCausalLM
from chessllm.train import TokenizeMap, filter_transcript

# Weights & Biases setup: the API key is read from the Colab secret named "wandb".
try:
    os.environ["WANDB_API_KEY"] = userdata.get("wandb")
    os.environ["WANDB_PROJECT"] = "chess"
except Exception:
    # Best-effort: any failure to obtain the key must not abort the notebook.
    # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt/SystemExit
    # propagate. Turning wandb into a no-op makes the warning below true even
    # though the training arguments still request the "wandb" reporter.
    os.environ["WANDB_MODE"] = "disabled"
    print("Warning: 'wandb' key not found in Google Colab's secrets. Not logging to Weights & Biases.")

# @markdown ## Model
# @markdown Select a small language model and a dataset. You can create your own dataset using [this notebook](https://colab.research.google.com/drive/11UjbfajCzphe707_V7PD-2e5WIzyintf?usp=sharing) or choose pre-made datasets from this [repo](https://huggingface.co/datasets/mlabonne/chessllm/tree/main).
# model_id is loaded by both the tokenizer and the model below; dataset_id and
# data_file are passed to load_dataset; new_model is the training output directory.
model_id = "reflex-ai/AMD-Llama-350M-Upgraded"  # @param ["EleutherAI/pythia-70m-deduped", "openai-community/gpt2-medium"] {allow-input: true}
dataset_id = "mlabonne/chessllm" # @param {type:"string"}
data_file = "lichess_curriculum_100k_1500-3000.parquet"  # @param ["lichess_random_10k_1500-3000.parquet", "lichess_random_100k_1500-3000.parquet", "lichess_curriculum_10k_1500-3000.parquet", "lichess_curriculum_100k_1500-3000.parquet"] {allow-input: true}
new_model = "amdchess-v2"  # @param {type:"string"}

# @markdown ## Parameters
# learning_rate, batch_size (used for both train and eval batches) and num_epochs
# are forwarded to TrainingArguments; context_length is given to TokenizeMap and
# to the model config in the TRAIN_FROM_SCRATCH branch.
learning_rate = 2e-5  # @param {type:"number"}
batch_size = 8  # @param {type:"number"}
num_epochs = 0.25  # @param {type:"number"}
context_length = 512  # @param {type:"number"}
# TRAIN_FROM_SCRATCH selects the model-loading branch, PUSH_TO_HUB gates the final
# upload, and PRIVATE_REPO is passed as hub_private_repo.
TRAIN_FROM_SCRATCH = False  # @param {type:"boolean"}
PUSH_TO_HUB = True  # @param {type:"boolean"}
PRIVATE_REPO = False  # @param {type:"boolean"}

# Read the selected data file from the configured dataset repository and
# preview the first training record as a sanity check.
dataset = load_dataset(path=dataset_id, data_files=data_file)
print(dataset["train"][0])

# Tokenize dataset
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse an existing special token for padding. Some tokenizers define no unk
# token, which would leave pad_token as None; fall back to the EOS token then.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token
tmap = TokenizeMap(tokenizer, context_length)

# Replace the raw columns with the tokenizer output (batched for speed).
tokenized_dataset = dataset.map(
    tmap.tokenize, batched=True, remove_columns=dataset["train"].column_names
)

# Split dataset: hold out 1% for evaluation. The seed is fixed so the same
# examples land in the evaluation set on every run, keeping eval losses comparable.
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.01, seed=42)

# Batch collator for causal language modelling (mlm=False: no masked-LM objective)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Load model
if TRAIN_FROM_SCRATCH:
    # Take only the architecture/hyperparameters from `model_id`, overriding the
    # context length and the BOS/EOS ids with the values used in this notebook.
    config = AutoConfig.from_pretrained(
        model_id,
        n_ctx=context_length,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    # Build the model from the config alone so its weights are freshly
    # initialised. Calling from_pretrained here would load the checkpoint's
    # trained weights and make the "from scratch" flag a no-op.
    # NOTE(review): from_config is inherited from the transformers auto class;
    # confirm whether the Liger kernel patches are applied on this path.
    model = AutoLigerKernelForCausalLM.from_config(config)
else:
    # Continue training from the published checkpoint weights.
    model = AutoLigerKernelForCausalLM.from_pretrained(model_id)

# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Requirements are not pinned,
# so pick whichever name the installed TrainingArguments dataclass exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# Training hyperparameters; user-tunable values come from the form fields above.
args = TrainingArguments(
    output_dir=new_model,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    **{_eval_strategy_key: "steps"},
    eval_steps=0.01,  # a float below 1 is a fraction of the total training steps
    logging_steps=1,
    gradient_accumulation_steps=1,
    weight_decay=0.1,
    warmup_steps=0,
    lr_scheduler_type="cosine",
    report_to="wandb",
    remove_unused_columns=False,  # Fix for Pythia
    hub_private_repo=PRIVATE_REPO,
)

# Wire the model, the tokenized train/test splits, the collator and the
# training arguments into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run the training loop.
trainer.train()

# Upload the result to the Hub only when the form flag asks for it.
if PUSH_TO_HUB:
    trainer.push_to_hub()

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nlpguy/amdchess-v2/commit/8f057a7360e6f59ea6e2c1b99e59c81d258569a8', commit_message='End of training', commit_description='', oid='8f057a7360e6f59ea6e2c1b99e59c81d258569a8', pr_url=None, pr_revision=None, pr_num=None)