# GPT-pretraining tutorial
 - ktlim@seoultech.ac.kr

In [2]:
!pip install transformers datasets accelerate -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

## Import all packages

 - Considering what we need to build GPT from scratch.
  - A dataset
  - A new tokenizer
  - A new configuration
  - A new model randomly initialized

In [3]:
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

## 02. Data preprocessing
 - For pretraining what kind of data do we need?
  --> **plaintext**

In [9]:
train_dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', split="train[:10000]")

In [10]:
#for saving the local repository
import os

# set a directory
if not os.path.exists('./text_data'):
    os.makedirs('./text_data')

# save as a text file
with open('./text_data/wikitext.txt', 'w', encoding='utf-8') as f:
    for data in train_dataset:
        f.write(data['text'] + '\n')

files = ['./text_data/wikitext.txt']

## 03. Build tokenizer
 - Definately, we need our own tokenizer!

In [11]:
tokenizer = ByteLevelBPETokenizer()

In [23]:
SPECIAL_TOKENS = []
START="<s>"
END="</s>"
PAD="<pad>"
UNK="<unk>"
MASK="<mask>"
SPECIAL_TOKENS.append(START)
SPECIAL_TOKENS.append(END)
SPECIAL_TOKENS.append(PAD)
SPECIAL_TOKENS.append(UNK)
SPECIAL_TOKENS.append(MASK)

In [14]:
tokenizer.train(files, vocab_size=4000, min_frequency=2, special_tokens=SPECIAL_TOKENS)

In [15]:
tokenizer

Tokenizer(vocabulary_size=4000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [None]:
vocab_size = tokenizer.get_vocab_size()
tokenizer.get_vocab()

In [20]:
# Set a directory for the model and tokenizer
if not os.path.exists('./first_gpt_model'):
    os.makedirs('./first_gpt_model')

In [21]:
tokenizer.save_model('./first_gpt_model')

['./first_gpt_model/vocab.json', './first_gpt_model/merges.txt']

## 04. Build Model
 - In order to build a new model, how to get it easily? just copy GPT2 from Transformers with your own Configuration!


### 04-1. Set your won configuration of GPT2

In [29]:
config = GPT2Config(
    vocab_size = vocab_size,
    bos_token_id = tokenizer.token_to_id(START),
    eos_token_id = tokenizer.token_to_id(END)
)

   - And then we need a model with our configuration! We don't use **from_pretrained** now

In [30]:
model = GPT2LMHeadModel(config)

### 04-2. Data preprocessing on Tokenizer

 - Now it's done but we need to care one more thing. Since our tokenizer is possibley a bit different from Transformer GPT2's tokenizer. So we need build new tokenizer with our dictionary!

In [33]:
# We got an error since our tokenizer does not have pad_token_id variable!
tokenizer.pad_token_id

AttributeError: ignored

In [37]:
tokenizer

Tokenizer(vocabulary_size=4000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [34]:
#build a new tokenizer by taking into account our dictionary trained by our dataset
new_tokenizer = GPT2Tokenizer.from_pretrained('first_gpt_model')

In [35]:
new_tokenizer

GPT2Tokenizer(name_or_path='first_gpt_model', vocab_size=4000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	4000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [38]:
new_tokenizer.pad_token_id = tokenizer.token_to_id("<pad>")
new_tokenizer.bos_token_id = tokenizer.token_to_id("<s>")
new_tokenizer.eos_token_id = tokenizer.token_to_id("</s>")

In [39]:
def tokenize_fuction(samples):
  return new_tokenizer(samples["text"])

In [40]:
tokenized_dataset = train_dataset.map(tokenize_fuction, batch_size=True, num_proc=4, remove_columns=["text"])

Map (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [42]:
tokenized_dataset[1]

{'input_ids': [305, 3542, 3965, 673, 2872, 305, 317],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

### 04-3. Converting our data to the CLM training data

In [43]:
collator = DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm=False)

## 05. Training

In [44]:
from transformers import Trainer, TrainingArguments

In [47]:
args = TrainingArguments(
    output_dir="./first_gpt_model/",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size = 12,
    save_steps = 1000,
    save_total_limit=2
)

In [48]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_dataset,
    data_collator=collator
)

In [49]:
trainer.train()

Step,Training Loss


OutOfMemoryError: ignored

## 06. Generation

In [62]:
# 1) Prompt
input_text = "What's up with you?"

In [64]:
# 2) Tokenizing and Tensor transformation
input_ids = new_tokenizer.encode(input_text, return_tensors="pt")
input_ids = input_ids.to('cuda')

# 3) Generate texts
max_length = 100
model = model.to("cuda")
sample_outputs = model.generate(input_ids, do_sample=True, max_length=max_length, temperature=0.7)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:1 for open-end generation.


In [65]:
# 4) Decoding texts
print(new_tokenizer.decode(sample_outputs[0], skip_special_tokens=True))

What's up with you?. as,, theing, and the of of, cap, B, and, ofg and.. and and of the ) anding,,, the of and ofing the the the the ).. to on the, and,. the, ),,, and, the of,, the the.. the the., the, in the. and,,,. of the and the a to, the,
