<a href="https://colab.research.google.com/github/hhubert14/chess-ai/blob/main/chess_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
# !pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install transformers==4.38.0
!pip install --upgrade transformers
!pip install --upgrade peft

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
!huggingface-cli login

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate.test_utils.testing import get_backend
from tqdm.auto import tqdm
from peft import LoftQConfig, LoraConfig, get_peft_model
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

import torch

# TODO add validation set

# Models tested
# "meta-llama/Meta-Llama-3-8B"
# "google/flan-t5-base"

# Adjustable variables
model_name = "openai-community/gpt2"
batch_size = 5
train_dataset_path = "/content/train_puzzles.csv"
test_dataset_path = "/content/test_puzzles.csv"
num_epochs = 5
learning_rate = 5e-5
model_dir = f"{model_name}_saved"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# model.gradient_checkpointing_enable()
# model.half()

peft_config = LoraConfig(inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, peft_type=TaskType.CAUSAL_LM)
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())


# Load datasets in streaming mode
train_dataset = load_dataset("csv", data_files={"full": train_dataset_path}, streaming=True)["full"]
eval_dataset = load_dataset("csv", data_files={"full": test_dataset_path}, streaming=True)["full"]

# Tokenize dynamically using a collate function
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token_id = tokenizer.pad_token_id  # Ensure tokenizer uses the same pad token for labels
tokenizer.label_pad_token_id = tokenizer.pad_token_id  # Set

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364
None


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
def tokenize_batch(batch):
    inputs_text = [example["inputs"] for example in batch]
    labels_text = [example["label"] for example in batch]

    inputs = tokenizer(
        inputs_text,
        truncation="only_first",
        padding="max_length",
        max_length=512,
        return_tensors="pt",
        add_special_tokens=True,
        )
    labels = tokenizer(
        labels_text,
        truncation="only_first",
        padding="max_length",
        max_length=512,
        return_tensors="pt",
        add_special_tokens=True,
        )

    inputs["labels"] = labels["input_ids"]

    # print(f"inputs: {inputs}")
    # print(f"labels: {labels}")
    # print(f"Inputs shape: {inputs['input_ids'].shape}")
    # print(f"Labels shape: {labels['input_ids'].shape}")
    return inputs

In [6]:
# DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=tokenize_batch)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, collate_fn=tokenize_batch)

In [7]:
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Scheduler (num_training_steps calculated dynamically)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=1  # Placeholder
)

# Device setup
device, _, _ = get_backend()
print(f"Using device: {device}")
model.to(device)

Using device: cuda


PeftModel(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50258, 768)
        (wpe): Embedding(1024, 768)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-11): 12 x GPT2Block(
            (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2SdpaAttention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=2304, nx=768)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2304, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (l

In [8]:
# Train
progress_bar = tqdm(total=None)  # Dynamic progress bar
model.train()
step_count = 0  # Manually count steps

for epoch in range(num_epochs):
    print(f"Epoch: {epoch + 1}/{num_epochs}")
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        print(f"Loss: {loss.item()}")
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        step_count += 1  # Increment step count

progress_bar.close()

# Update lr_scheduler with actual training steps
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=step_count
)

0it [00:00, ?it/s]

Epoch: 1/5
Epoch: 2/5
Epoch: 3/5
Epoch: 4/5
Epoch: 5/5


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([5, 256])
Labels shape: torch.Size([5, 256])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input shape: torch.Size([3, 256])
Labels shape: torch.Size([3, 256])
48
["Black to move. Position is 5r2/2pk2P1/pp1n4/2p5/4PB2/2P2K2/P1P1R3/8 b - - 0 34. What is the best move? Narrative: The first thing I want to say is that I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what the best move is. I'm not sure what 



Model and tokenizer saved to openai-community/gpt2_saved


In [None]:
# Evaluate
model.eval()
predictions_text = []
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        loss = outputs.loss
        print(f"Loss: {loss.item()}")

    generated_ids = model.generate(input_ids=batch["input_ids"], max_length=512)

    decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    predictions_text.extend(decoded_preds)

print(len(predictions_text))
print(predictions_text)


In [None]:
model.save_pretrained(model_dir)  # Save model
tokenizer.save_pretrained(model_dir)  # Save tokenizer

print(f"Model and tokenizer saved to {model_dir}")


In [None]:
torch.cuda.empty_cache()