# Fine-tune GPT-2 for the CommonGen Task

Import the required libraries, then load the GPT-2 tokenizer and model.

In [1]:
import os
from dataclasses import dataclass
from itertools import chain
from typing import Optional, Union

import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

In [34]:
# The tokenizer is loaded from the "gpt2" identifier, while the model weights are
# read from a local checkpoint directory.
# NOTE(review): the directory name says "gpt2-large" — presumably the output of an
# earlier fine-tuning run; confirm it matches the tokenizer loaded here.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("./gpt2-large-finetuned-commongen" )

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at C:\Users\Jingqian/.cache\huggingface\hub\models--gpt2\snapshots\e7da7f221d5bf496a48136c0cd264e630fe9fcc8\config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {

In [7]:
# Load the dataset registered under the "common_gen" identifier; its "train" and
# "validation" splits are the ones handed to the Trainer further down.
datasets = load_dataset('common_gen')


def process_data(examples):
    """Turn a batch of examples into "<concepts>=<target><eos>" strings and tokenize them.

    For every entry of ``examples["concepts"]`` the words are joined with single
    spaces, followed by "=", the entry of ``examples["target"]`` at the same
    position, and the tokenizer's end-of-sequence token. The resulting list of
    strings is passed to the module-level ``tokenizer`` and its output is
    returned unchanged.
    """
    concept_sets = examples["concepts"]
    sentences = examples["target"]

    training_texts = []
    for position, words in enumerate(concept_sets):
        # Target is looked up by position so the pairing follows the concepts column.
        training_texts.append(" ".join(words) + "=" + sentences[position] + tokenizer.eos_token)

    return tokenizer(training_texts)


# Tokenize the dataset batch by batch with process_data and drop the raw
# concept_set_idx / concepts / target columns from the result.
encoded_datasets = datasets.map(
    process_data,
    batched=True,
    remove_columns=["concept_set_idx", "concepts", "target"],
)


def group_texts(examples, block_size=256):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Adapted from the Hugging Face language-modeling tutorial:
    https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb

    Args:
        examples: Mapping from column name to a list of per-example sequences,
            as handed over by ``Dataset.map(..., batched=True)``. Must contain
            an "input_ids" column.
        block_size: Length of every emitted chunk. Defaults to 256, the value
            that used to be hard-coded, so existing ``map(group_texts, ...)``
            calls behave exactly as before.

    Returns:
        A dict with the same keys as ``examples``, each mapped to a list of
        chunks of exactly ``block_size`` items, plus a "labels" entry that is a
        shallow copy of the "input_ids" chunk list. Trailing items that do not
        fill a complete block are dropped.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")

    # Flatten every column in linear time. The previous sum(lists, []) re-copied
    # the growing accumulator on each addition, which is quadratic in batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding if the model supported
    # it instead of this drop. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are a copy of the inputs for causal language modeling.
    result["labels"] = result["input_ids"].copy()
    return result


# Regroup the tokenized examples into fixed-length blocks (with labels) via group_texts.
lm_datasets = encoded_datasets.map(group_texts, batched=True)


Found cached dataset common_gen (C:/Users/Jingqian/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\Jingqian\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-305e29b2e1b9cd14.arrow
Loading cached processed dataset at C:\Users\Jingqian\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-dd83b94cd04eeec3.arrow
Loading cached processed dataset at C:\Users\Jingqian\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-b6ebe56be27b9f15.arrow
Loading cached processed dataset at C:\Users\Jingqian\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cache-681944d40be4403b.arrow
Loading cached processed dataset at C:\Users\Jingqian\.cache\huggingface\datasets\common_gen\default\2020.5.30\1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23\cach

In [35]:
# Base name used to build the output directory ("gpt2-finetuned-commongen").
model_name = "gpt2"
training_args = TrainingArguments(
    f"{model_name}-finetuned-commongen",
    # Evaluate and checkpoint once per epoch; the two strategies are kept in step
    # because load_best_model_at_end is enabled (see the TrainingArguments docs).
    evaluation_strategy="epoch",
    learning_rate=2.5e-5,
    weight_decay=0.01,
    push_to_hub=False,
    load_best_model_at_end=True,
    save_strategy="epoch",
    # SECURITY: never commit an access token to a notebook. The token is read from
    # the HF_TOKEN environment variable (None when unset, which is fine while
    # push_to_hub=False). The token that used to be hard-coded here must be
    # treated as leaked and revoked in the Hugging Face account settings.
    hub_token=os.environ.get("HF_TOKEN"),
    num_train_epochs=5.0,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)
# Train on the "train" split and evaluate on the "validation" split of the
# block-grouped dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
# Evaluate the loaded model on the Trainer's eval_dataset (the "validation" split);
# as the cell's last expression, the returned metrics dict is displayed below.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 297
  Batch size = 8


{'eval_loss': 2.0657103061676025,
 'eval_runtime': 8.4512,
 'eval_samples_per_second': 35.143,
 'eval_steps_per_second': 4.496}