# L25: Name generation with GPT-2

Steps:
1. Install the dependencies
2. Load the imports
3. Define the model and tokenizer
4. Tokenize all the names
5. Prepare the data for training
6. Set up the training arguments
7. Train the model
8. Generate new names

In [2]:
# !pip install transformers datasets accelerate

In [None]:
# set up the imports
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, pipeline

In [6]:
# load data
dataset = load_dataset('text', data_files='names.txt')

# look at first item
dataset['train'][0]

{'text': 'emma'}

In [7]:
# choose model and tokenizer
model_name = 'distilgpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token=' ')

In [21]:
# define a function to tokenize a name
def tokenize_name(name):
    tokens = tokenizer(
        name['text'] + tokenizer.eos_token,
    )

    return tokens

tokenized_name = tokenize_name({'text' : 'garrett'})
token_id_list = tokenized_name['input_ids']
token_list = tokenizer.convert_ids_to_tokens(token_id_list)

token_list

['gar', 'rett', '<|endoftext|>']

In [23]:
dataset['train']

Dataset({
    features: ['text'],
    num_rows: 32033
})

In [22]:
tokenized_names = dataset['train'].map(
    tokenize_name,
    batched=False,
    remove_columns=['text']
)

tokenized_names

Map: 100%|██████████| 32033/32033 [00:03<00:00, 9705.74 examples/s] 


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 32033
})

In [26]:
# prepare dataset for training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# data_collator

In [31]:
# set up training arguments
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

In [33]:
training_args = TrainingArguments(
    output_dir='name-gpt',
    per_device_train_batch_size=8,
    max_steps=100,
    fp16=False,
    bf16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_names,
    data_collator=data_collator
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=100, training_loss=3.1282080078125, metrics={'train_runtime': 79.8535, 'train_samples_per_second': 10.018, 'train_steps_per_second': 1.252, 'total_flos': 926786912256.0, 'train_loss': 3.1282080078125, 'epoch': 0.024968789013732832})

In [34]:
# save the trained model
trainer.save_model('name-gpt')
tokenizer.save_pretrained('name-gpt')

('name-gpt/tokenizer_config.json',
 'name-gpt/special_tokens_map.json',
 'name-gpt/vocab.json',
 'name-gpt/merges.txt',
 'name-gpt/added_tokens.json',
 'name-gpt/tokenizer.json')

In [40]:
# generate new names
generator = pipeline(
    'text-generation',
    model='name-gpt',
    tokenizer=tokenizer,
)

generated_names = generator(
    'jim',
    max_length=20,
    truncation=True,
    do_sample=True,
    num_return_sequences=10,
    top_k=50,
    top_p=0.95
)

for name in generated_names:
    print(name['generated_text'])

Device set to use cpu


jimah
jimian
jimyn
jimley
jimiah
jimiah
jimie
jimah
jimay
jimiah
