# Fine-tuning GPT-2 using transformers

## 1. Import all dependencies

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

## 2. Load the dataset

In [None]:
# Only the first 1000 rows of the train split are requested, which keeps this demo run short.
train_dataset = load_dataset(
    "wikitext",
    name="wikitext-103-raw-v1",
    split="train[:1000]",
)

Downloading builder script:   0%|          | 0.00/8.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

## 3. Tokenize the dataset

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2's tokenizer ships without a padding token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

MAX_SEQ_LENGTH = 512  # texts longer than this many tokens are truncated


def tokenize_function(examples):
    """Tokenize a batch of rows from the dataset.

    Args:
        examples: A batch mapping with a "text" entry holding the raw strings
            (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, truncated to MAX_SEQ_LENGTH tokens.
        No padding is applied here: the language-modeling data collator pads
        each batch to its longest member, so short rows are not inflated to
        MAX_SEQ_LENGTH tokens of padding.
    """
    return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)


def has_training_signal(example):
    """Return True when a tokenized row holds at least two tokens.

    A causal language model predicts token i+1 from tokens <= i, so a row
    with fewer than two tokens (e.g. a blank line) yields no
    (input, target) pair. A batch made only of such rows would have every
    label ignored and therefore an undefined loss.
    """
    return len(example["input_ids"]) >= 2


# Tokenize with the map function, then drop rows that cannot contribute to the loss.
tokenized_datasets = (
    train_dataset
    .map(tokenize_function, batched=True, remove_columns=["text"])
    .filter(has_training_signal)
)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Collator that assembles individual tokenized rows into training batches.
# mlm=False selects the causal (next-token) objective rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Display the collator's repr to double-check its configuration (tokenizer, mlm flag).
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

## 4. Training with TrainingArguments and Trainer

In [None]:
# Start from the pretrained GPT-2 checkpoint with its language-modeling head.
base_checkpoint = "gpt2"
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Training configuration, collected in one mapping so every knob is visible at a glance.
training_config = {
    "output_dir": "./gpt2_finetuned",  # where checkpoints are written
    "overwrite_output_dir": True,      # allow reusing the directory across runs
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "save_steps": 1000,                # checkpoint interval, in optimizer steps
    "save_total_limit": 2,             # cap on the number of checkpoints kept
}
training_args = TrainingArguments(**training_config)

In [None]:
# Bundle the model, hyperparameters, tokenized data and batch collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning; keep the returned summary in a name and show it as the cell output.
train_result = trainer.train()
train_result

Step,Training Loss
500,3.3535


TrainOutput(global_step=500, training_loss=3.3535302734375, metrics={'train_runtime': 154.7072, 'train_samples_per_second': 6.464, 'train_steps_per_second': 3.232, 'total_flos': 261292032000000.0, 'train_loss': 3.3535302734375, 'epoch': 1.0})

## 5. Generation

In [None]:
# 1) Prompt
input_text = "Do you think you are charming?"

# 2) Tokenizing and Tensor transformation
# Calling the tokenizer (rather than `encode`) also returns the attention mask,
# which `generate` needs because the pad token is the same as the EOS token.
# The tensors follow the model's current device instead of assuming a GPU exists.
device = model.device
inputs = tokenizer(input_text, return_tensors="pt").to(device)
input_ids = inputs["input_ids"]

# 3) Generate texts
max_length = 100
# Switch to inference mode so dropout is disabled while sampling
# (the model was last used for training).
model.eval()
sample_outputs = model.generate(
    input_ids,
    attention_mask=inputs["attention_mask"],
    do_sample=True,
    max_length=max_length,
    temperature=0.7,
    # Set explicitly so generate does not have to fall back to EOS with a warning.
    pad_token_id=tokenizer.eos_token_id,
)

# 4) Decoding texts
print(tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Do you think you are charming? 

 
 
  

   
 
 
I am a real talent at creating an original story, and in the process becoming one of the most powerful people I ever met. 

 
 

 

  
I have been an artist since childhood, and I have been a leading art teacher since the early days of my career. 

 

 

 
