In [None]:
from datasets import load_dataset
import wandb
import os
import math
import torch
from huggingface_hub import HfApi, notebook_login

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFKC
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

from transformers import (
    PreTrainedTokenizerFast,
    GPT2Config, GPT2LMHeadModel,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    AutoModelForCausalLM, AutoTokenizer
)

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Preprocessing dataset

### Load the dataset

In [3]:
c4_dataset = load_dataset("datablations/c4-filter-small", split="train")
c4_dataset

README.md:   0%|          | 0.00/791 [00:00<?, ?B/s]

(…)-00000-of-00001-091e566583af27e4.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'timestamp', 'url', 'meta', 'text_length', 'domain', 'perplexity', 'dup_ratio', 'pairs', 'repetitions', 'cluster'],
    num_rows: 100000
})

In [4]:
text_dataset = c4_dataset.select_columns("text")
dataset = text_dataset.train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 20000
    })
})

###  Training a BPE Tokenizer

In [5]:
# Create tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.normalizer = NFKC()
tokenizer.decoder = ByteLevelDecoder()

# Define training configuration for the tokenizer
trainer = BpeTrainer(vocab_size=50257, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Train the tokenizer on the training text data
tokenizer.train_from_iterator(dataset["train"]["text"], trainer=trainer,)
tokenizer.save("gpt_tokenizer.json")

tokenizer






Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"<s>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"<pad>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"</s>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":3, "content":"<unk>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":4, "content":"<mask>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=NFKC(), pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=None, decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_

### Load the tokenizer in HF format

In [6]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file="gpt_tokenizer.json")

# Add special tokens required for language modeling
tokenizer.add_special_tokens({
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
})

tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

### Tokenize text data using the Trained Tokenizer

In [7]:
def tokenize(example):
    """Tokenize each example"""
    return tokenizer(example["text"])


# Apply the tokenizer to both training and test datasets
tokenized_ds = dataset.map(
    tokenize, remove_columns=["text"], batched=True, num_proc=20
)

tokenized_ds

Map (num_proc=20):   0%|          | 0/80000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20000
    })
})

### Group tokens into fixed-length chunks

In [8]:
block_size = 512 # sequence length

def group_texts(examples):
    """Concatenate and split token sequences into fixed-size blocks"""
    # Concatenate all tokens for each key
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}

    # Truncate to a multiple of block_size
    total_length = len(concatenated["input_ids"])
    total_length = (total_length // block_size) * block_size

    # Split into blocks of fixed length
    result = {
        k: [
            concatenated[k][i: i + block_size] for i in range(0, total_length, block_size)
        ] for k in concatenated.keys()
    }

    # Set input_ids as labels
    result["labels"] = result["input_ids"].copy()
    return result


lm_ds = tokenized_ds.map(group_texts, batched=True, num_proc=20)
lm_ds

Map (num_proc=20):   0%|          | 0/80000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 70746
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 17884
    })
})

## GPT Model

### Training

In [10]:
project_name = "gpt2-c4-imdb"
run_name = "pretrain-gpt2-small-c4-v1"
model_name = "gpt2-small-c4-pretrained"

os.environ["WANDB_API_KEY"] = ""
os.environ["WANDB_PROJECT"] = project_name

In [11]:
# Configure GPT2 model
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512,
    n_ctx=512,
    n_embd=512,
    n_layer=6,
    n_head=8,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    loss_type="ForCausalLMLoss"
)

model = GPT2LMHeadModel(config)

In [12]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=model_name,
    run_name=run_name,
    logging_dir="logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    fp16=True,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["test"],
    processing_class=tokenizer,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlinhlinh-le997[0m ([33mlinhlinh-le997-prime-labo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
500,3.4646,6.884113
1000,3.3183,6.602923
1500,3.2213,6.437155
2000,3.1711,6.307638
2500,3.0992,6.194398
3000,3.0515,6.1006
3500,3.0237,6.012979
4000,2.9854,5.944919
4500,2.9409,5.885623
5000,2.9246,5.834258


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=11055, training_loss=2.988742828714206, metrics={'train_runtime': 15979.2442, 'train_samples_per_second': 22.137, 'train_steps_per_second': 0.692, 'total_flos': 2.055450308640768e+16, 'train_loss': 2.988742828714206, 'epoch': 5.0})

In [13]:
tokenizer.save_pretrained(model_name)
model.save_pretrained(model_name)

In [14]:
hf_model = f"linhlinhle997/{model_name}"

model.push_to_hub(hf_model)
tokenizer.push_to_hub(hf_model)

model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/linhlinhle997/gpt2-small-c4-pretrained/commit/6e3c64ba03a0efe948012124dc4667427f64f399', commit_message='Upload tokenizer', commit_description='', oid='6e3c64ba03a0efe948012124dc4667427f64f399', pr_url=None, repo_url=RepoUrl('https://huggingface.co/linhlinhle997/gpt2-small-c4-pretrained', endpoint='https://huggingface.co', repo_type='model', repo_id='linhlinhle997/gpt2-small-c4-pretrained'), pr_revision=None, pr_num=None)

### Inference

In [15]:
# Load the trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained(hf_model)
tokenizer = AutoTokenizer.from_pretrained(hf_model)

config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

In [16]:
model.eval()

prompt = "I go to"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]

output = model.generate(
    input_ids,
    max_new_tokens=50,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


I go to check-to make more important products and get to meet a call for it to work. A unique space to find the best price of your way to have done for our company and a number people. I believe is the home with you need to work


In [17]:
# Shift for labels (causal LM setting: predict token t+1 from token t)
labels = output[:, 1:].clone()
inputs = output[:, :-1].clone()

with torch.no_grad():
    outputs = model(inputs)
    logits = outputs.logits

# Compute log softmax over vocabulary
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

# Gather log-probabilities corresponding to the labels
selected_log_probs = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)

# Sum negative log probs → total NLL
nll = -selected_log_probs.sum().item()
num_tokens = labels.numel()
perplexity = math.exp(nll / num_tokens)
perplexity

51.55217396172117

### Perplexity (PPL)

In [18]:
text = dataset["test"][0]["text"]
print(text)

inputs = tokenizer(text, return_tensors="pt").to(model.device)
input_ids = inputs["input_ids"]
labels = input_ids[:, 1:]

with torch.no_grad():
    outputs = model(input_ids[:, :-1])
    logits = outputs.logits

log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

selected_log_probs = log_probs.gather(2, labels.unsqueeze(-1)).squeeze(-1)

nll = -selected_log_probs.sum().item()
num_tokens = labels.numel()
perplexity = math.exp(nll / num_tokens)
print(f"Perplexity: {perplexity}")

Each week the most recent message (audio) will be posted on our teaching podcast. You can also download Avon messages directly to iTunes or get our Podcast App for your mobile device (smartphone, iPod, tablet, etc.) so you can listen at your leisure while you're driving in your car, working out...or where'er you be.
Perplexity: 315.5844686671047
