In [17]:
import os
from itertools import islice

import torch
from datasets import load_dataset, Dataset
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tqdm import tqdm
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    PreTrainedTokenizerFast
)

In [18]:
# Choose the compute device once and report what was found; the GPU model
# name is only printed when CUDA is actually usable.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(torch.cuda.get_device_name(0))

Using device: cuda
CUDA available: True
NVIDIA GeForce RTX 4060 Laptop GPU


In [19]:
# Open the FineWeb-Edu train split in streaming mode, then pull a single
# record to see which columns each row carries.
dataset = load_dataset("HuggingFaceFW/fineweb-edu", split="train", streaming=True)

row_stream = iter(dataset)
first_row = next(row_stream)
print(first_row.keys())

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]



Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

dict_keys(['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'])


In [20]:
# Pull the first 200k streamed rows into an in-memory list so later cells
# can index and slice them.
subset_iter = dataset.take(200_000)
data_list = list(tqdm(subset_iter, desc="Materializing dataset"))

print(f"Total samples: {len(data_list)}")

Materializing dataset: 200000it [02:46, 1199.16it/s]


Total samples: 200000


In [21]:
# Train a BPE tokenizer on the first 50k documents, persist it to disk, and
# wrap it as a transformers-compatible fast tokenizer (`hf_tokenizer`).
tokenizer_path = "out/tokenizer/tokenizer.json"
# save() writes straight to this path, so make sure the directory exists on a
# fresh checkout (mirrors what the training cell does for its output dir).
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)

# Extracting the text column is effectively instantaneous, so no progress bar
# is wrapped around it.
tokenizer_texts = [row["text"] for row in data_list[:50_000]]

# Give the BPE model an explicit unknown token so characters never seen during
# training map to "<unk>" instead of being dropped from the encoding.
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Named `bpe_trainer` so it is not confused with the transformers `Trainer`
# instance that a later cell binds to the name `trainer`.
bpe_trainer = trainers.BpeTrainer(
    vocab_size=50_000,
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
)

tokenizer.train_from_iterator(
    tokenizer_texts,
    bpe_trainer,
    length=len(tokenizer_texts),
)
tokenizer.save(tokenizer_path)

# Register every reserved special token on the wrapper, not only the pad
# token, so `unk_token_id`, `bos_token_id` and `eos_token_id` are available.
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tokenizer_path,
    pad_token="<pad>",
    unk_token="<unk>",
    bos_token="<bos>",
    eos_token="<eos>",
)

Tokenizer training: 100%|██████████| 50000/50000 [00:00<00:00, 1249517.09it/s]


In [22]:
# Encode every document as a fixed 512-token window (truncated or padded) and
# collect the encodings into a HuggingFace Dataset for the Trainer.
tokenized_data = [
    hf_tokenizer(
        row["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        # GPT-2 has no segment embeddings: if token_type_ids are passed it
        # looks them up in the word-embedding table, so all-zero ids would add
        # the "<pad>" embedding to every position. Do not emit them.
        return_token_type_ids=False,
    )
    for row in tqdm(data_list, desc="Tokenizing")
]

# Convert to HuggingFace Dataset
train_dataset = Dataset.from_list(tokenized_data)
print(f"Train dataset length: {len(train_dataset)}")

Tokenizing: 100%|██████████| 200000/200000 [05:48<00:00, 573.87it/s]


Train dataset length: 200000


In [23]:
# Build a small GPT-2 (8 layers, 8 heads, 512-dim, 512-token context) whose
# vocabulary and special-token ids come from the tokenizer trained above.
config = GPT2Config(
    # Size the embedding table from the tokenizer rather than repeating the
    # literal used when training it.
    vocab_size=len(hf_tokenizer),
    n_positions=512,
    n_ctx=512,
    n_embd=512,
    n_layer=8,
    n_head=8,
    # GPT2Config defaults bos/eos to 50256 (the original GPT-2 vocabulary),
    # which is outside this model's embedding table. Point them at the ids of
    # the special tokens reserved during tokenizer training instead.
    bos_token_id=hf_tokenizer.convert_tokens_to_ids("<bos>"),
    eos_token_id=hf_tokenizer.convert_tokens_to_ids("<eos>"),
    pad_token_id=hf_tokenizer.pad_token_id,
)

model = GPT2LMHeadModel(config).to(device)

In [24]:
# Configure the HuggingFace Trainer for one epoch of causal-LM training.
output_dir = "out/models/mini_llm_gpu_fixed"
os.makedirs(output_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # effective batch of 16 sequences per device
    learning_rate=5e-5,
    num_train_epochs=1,
    # Mixed precision needs a GPU; follow the device chosen earlier so the
    # notebook's CPU fallback still constructs valid arguments.
    fp16=(device == "cuda"),
    logging_steps=50,
    save_steps=500,
    save_total_limit=3,
    report_to="none"
)

# The tokenized dataset has no "labels" column, and without labels the model
# returns no loss for the Trainer to optimise. With mlm=False this collator
# copies input_ids into labels and sets pad positions to -100 so padding is
# ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)