In [None]:
!pip install -U transformers datasets accelerate evaluate torch

In [2]:
import os
from dataclasses import dataclass
from typing import Dict


import torch
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
DataCollatorForLanguageModeling,
Trainer,
TrainingArguments,
pipeline
)

In [3]:
MODEL_NAME = "gpt2" # small model for learning; replace with a larger model if you have GPU/RAM
DATASET_NAME = "wikitext"
DATASET_CONFIG = "wikitext-2-raw-v1" # raw text version (no sentencepiece tokenization)
OUTPUT_DIR = "./gpt2-finetuned-wikitext"
BATCH_SIZE = 4 # per device; lower if you run out of memory
GRAD_ACCUMULATION_STEPS = 8 # to simulate larger batch sizes
EPOCHS = 2 # increase for real training
LEARNING_RATE = 5e-5
MAX_SEQ_LENGTH = 512 # collapse/trim text to this length for each example
SEED = 42

In [4]:
def set_seed(seed: int = SEED):
  import random


  random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)


set_seed()

In [5]:
print("Loading dataset...")
dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)
print(dataset)

Loading dataset...


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [7]:
dataset['train'].data

MemoryMappedTable
text: string
----
text: [[""," = Valkyria Chronicles III = 
",""," Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . (... 629 chars omitted)"," The game began development in 2010 , carrying over a large portion of the work done on Valkyria C (... 426 chars omitted)",...," = = Preparation = = 
",""," The officers at the Philadelphia Mint , including Chief Coiner Franklin Peale , were mostly the f (... 303 chars omitted)"," When Longacre began work on the two new coins in early 1849 , he had no one to assist him . Longa (... 546 chars omitted)"," The engraving was unusually minute and required very close and incessant labor for several weeks  (... 321 chars omitted)"],[""," = = Original design = = 
",""," The Type 1 gold dollar depicts a head of Liberty , facing left , with a coronet or tiara on her h (... 229 chars omitted)"," Contemporary reviews of the Type 1 design were generally favorable . The New York Weekly Tribune  (... 857 chars omit

In [10]:
dataset['train'].select(range(2000))

Dataset({
    features: ['text'],
    num_rows: 2000
})

In [11]:
small_train = dataset["train"].select(range(2000)) if len(dataset["train"]) > 2000 else dataset["train"]
small_valid = dataset["validation"].select(range(500)) if len(dataset["validation"]) > 500 else dataset["validation"]

In [12]:
print("Loading tokenizer and model...")
# GPT-2 tokenizer doesn't have a padding token by default; we will add one (required for batching).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({"pad_token": "<|pad|>"})


model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
# Resize token embeddings if tokenizer changed (because we added pad_token)
model.resize_token_embeddings(len(tokenizer))

Loading tokenizer and model...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [13]:
# 3) Preprocessing: tokenize and group texts into blocks
# -----------------------------
print("Tokenizing and grouping texts...")


# Tokenize function: we will tokenize and return input_ids


def tokenize_function(examples: Dict[str, list]):
  return tokenizer(examples["text"], return_special_tokens_mask=False)


train_tokenized = small_train.map(
  tokenize_function,
  batched=True,
  remove_columns=[c for c in small_train.column_names if c != "text"],
)
valid_tokenized = small_valid.map(
  tokenize_function,
  batched=True,
  remove_columns=[c for c in small_valid.column_names if c != "text"],
)

Tokenizing and grouping texts...


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [14]:
def group_texts(examples: Dict[str, list]):
  # Concatenate all text input_ids
  concatenated = sum(examples["input_ids"], [])
  total_length = len(concatenated)
  # Drop the small remainder to make lengths divisible by block_size
  block_size = MAX_SEQ_LENGTH
  if total_length >= block_size:
    total_length = (total_length // block_size) * block_size
  else:
    total_length = 0
  result = {"input_ids": [], "attention_mask": [], "labels": []}
  for i in range(0, total_length, block_size):
    chunk = concatenated[i : i + block_size]
    result["input_ids"].append(chunk)
    result["attention_mask"].append([1] * block_size)
    result["labels"].append(chunk.copy())
  return result


train_dataset = train_tokenized.map(
    group_texts,
    batched=True,
    batch_size=-1, # Process the entire dataset as one batch
    num_proc=1,
    remove_columns=train_tokenized.column_names, # Remove original columns after grouping
)
valid_dataset = valid_tokenized.map(
    group_texts,
    batched=True,
    batch_size=-1, # Process the entire dataset as one batch
    num_proc=1,
    remove_columns=valid_tokenized.column_names, # Remove original columns after grouping
)


print("Train dataset size (examples):", len(train_dataset))
print("Validation dataset size (examples):", len(valid_dataset))

Map (num_proc=1):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=1):   0%|          | 0/500 [00:00<?, ? examples/s]

Train dataset size (examples): 264
Validation dataset size (examples): 55


In [15]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
# 5) TrainingArguments & Trainer
# -----------------------------
training_args = TrainingArguments(
output_dir=OUTPUT_DIR,
overwrite_output_dir=True,
num_train_epochs=EPOCHS,
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
gradient_accumulation_steps=GRAD_ACCUMULATION_STEPS,
eval_strategy="epoch",
save_strategy="epoch",
learning_rate=LEARNING_RATE,
weight_decay=0.01,
fp16=torch.cuda.is_available(), # mixed precision if GPU available
push_to_hub=False,
logging_dir=OUTPUT_DIR + "/logs",
logging_steps=100,
report_to=[],
)

In [17]:
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=valid_dataset,
data_collator=data_collator,
)

In [18]:
# -----------------------------
print("Starting training...")
trainer.train()

Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.316941
2,No log,3.285034


TrainOutput(global_step=18, training_loss=3.574346754286024, metrics={'train_runtime': 78.0614, 'train_samples_per_second': 6.764, 'train_steps_per_second': 0.231, 'total_flos': 137962192896000.0, 'train_loss': 3.574346754286024, 'epoch': 2.0})

In [19]:
# -----------------------------
print("Saving model to", OUTPUT_DIR)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Saving model to ./gpt2-finetuned-wikitext


('./gpt2-finetuned-wikitext/tokenizer_config.json',
 './gpt2-finetuned-wikitext/special_tokens_map.json',
 './gpt2-finetuned-wikitext/vocab.json',
 './gpt2-finetuned-wikitext/merges.txt',
 './gpt2-finetuned-wikitext/added_tokens.json',
 './gpt2-finetuned-wikitext/tokenizer.json')

In [20]:
print("Running quick generation test...")
# load from saved dir to simulate real usage
from transformers import AutoModelForCausalLM, AutoTokenizer


model2 = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR)
tokenizer2 = AutoTokenizer.from_pretrained(OUTPUT_DIR)


generator = pipeline("text-generation", model=model2, tokenizer=tokenizer2, device=0 if torch.cuda.is_available() else -1)
prompt = "Artificial intelligence in the future will"
gen = generator(prompt, max_length=100, num_return_sequences=1)
print("\nGeneration result:\n", gen[0]["generated_text"])

Running quick generation test...


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generation result:
 Artificial intelligence in the future will help to solve problems in the world. With that said, what's important is that the future needs to be seen to be understood.

In addition, AI is not perfect. It requires a lot of effort to build, but it can help people easily with problems and problems with machines.

The AI problem is a complex one, and the problems are not easily solved in software. So it is important to understand that there are many different problems that can be addressed in AI in the future. With that said, I think that the AI problem must be understood in part because of the problem of how to solve it, and that the AI problems must also be understood in part because of the difficulties that it faces.

For example, one problem in AI is that it does not have the power to solve problems in the world. It is able to solve problems in an artificial way, and that is why it is called a "supercomputer". The problem of AI is not that it does not have the power