<a href="https://colab.research.google.com/github/liuxingkf/Leetcode/blob/master/transformer_gpt2_hugging_face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a experimental colab to fine tune a NLP model with hugging face interface. We explore OpenAI's GPT-2 api and some other GPT-3 like models hosted by third parties other than OpenAI.

Reference docs:
* [Hugging Face transformer fine tuning](https://huggingface.co/docs/transformers/training)
* [text generation parameters](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb#scrollTo=nxsksOGDpmA0)(How to get reasonable text generation outputs)
* [Hugging Face gpt2](https://huggingface.co/gpt2)
* [gpt2 fine tune example](https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface)
* [Parallel training huggingface Trainer Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed#deployment-in-notebooks)



In [None]:
# hugging face Transformers installation
# Build with binary.
# ! pip install transformers datasets
# Build with source, slower but can check source code.
!pip install deepspeed accelerate

! pip install git+https://github.com/huggingface/transformers.git
! pip install git+https://github.com/huggingface/datasets.git
! pip install git+https://github.com/huggingface/evaluate.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepspeed
  Downloading deepspeed-0.8.1.tar.gz (759 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.6/759.6 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.16.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hjson
  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja
  Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.0/146.0 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting py-cpuin

In [2]:
# Constant values.
MODEL = 'gpt2'
MAX_OUTPUT = 10
INPUT = 'google is a website that'

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained(MODEL)
model = GPT2LMHeadModel.from_pretrained(MODEL)
print(model.generation_config)
print(model.config)



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.27.0.dev0"
}

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.27.0.dev0",
  "use_c

In [4]:
from transformers import pipeline, set_seed

# Use the pipeline api to get the whole output sentense.
text =INPUT

generator = pipeline('text-generation', model=MODEL)

# Override default generation config by providing generation parameters.
set_seed(42)
generator(text,
          do_sample=True,
          max_length=MAX_OUTPUT, 
          num_return_sequences=5, 
          top_k=50, 
          top_p=0.95)

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'google is a website that keeps track of a user'},
 {'generated_text': 'google is a website that offers you, and its'},
 {'generated_text': 'google is a website that allows you to create your'},
 {'generated_text': 'google is a website that lets you embed your own'},
 {'generated_text': 'google is a website that allows you to embed content'}]

In [5]:
from datasets import load_dataset

# Load some dataset for fine tune.
dataset = load_dataset("yelp_review_full")

# Shrink the data size.
dataset["train"] = dataset["train"].shard(num_shards=10, index=0)
dataset["test"] = dataset["test"].shard(num_shards=10, index=0)

print(dataset["train"])
print(dataset["test"])


Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Dataset({
    features: ['label', 'text'],
    num_rows: 65000
})
Dataset({
    features: ['label', 'text'],
    num_rows: 5000
})


In [6]:
def tokenize_function(examples):
  """Tokenize the input sequence from strings to integer tensors."""
  return tokenizer(examples["text"], padding="max_length", truncation=True)


In [7]:
# Tokenize a dataset into training data and evaluation data. 
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


Map:   0%|          | 0/65000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [20]:
from transformers.deepspeed import HfDeepSpeedConfig, HfTrainerDeepSpeedConfig
from typing import Dict

ds_config :Dict = {
  "zero_optimization": {
     "stage": 2,
     "offload_optimizer": {
         "device": "cpu",
         "pin_memory": True
     },
     "allgather_partitions": True,
     "allgather_bucket_size": 2e8,
     "reduce_scatter": True,
     "reduce_bucket_size": 2e8,
     "overlap_comm": True,
     "contiguous_gradients": True,
      "train_batch_size": 12,
      "train_micro_batch_size_per_gpu": 4,
  }
}


In [22]:
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
import os

# Deepspeed notebook env virables. This must be in the same cell of TrainingArguments init.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"

print(ds_config)

training_args = TrainingArguments(
    output_dir="./gpt2-yelp", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=8,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps=800, # after # steps model is saved
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    deepspeed =ds_config,
    )

metric = evaluate.load("accuracy")



{'zero_optimization': {'stage': 2, 'offload_optimizer': {'device': 'cpu', 'pin_memory': True}, 'allgather_partitions': True, 'allgather_bucket_size': 200000000.0, 'reduce_scatter': True, 'reduce_bucket_size': 200000000.0, 'overlap_comm': True, 'contiguous_gradients': True, 'train_batch_size': 12, 'train_micro_batch_size_per_gpu': 4}}


In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [12]:
# Check GPU type.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-e5955fe6-c3a6-5bff-2a8b-e08d150eb95b)


In [23]:
# Train the fine tuning dataset.
# TODO(xingliu): OOM here, reduce the size of the dataset for demo.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    # compute_metrics=compute_metrics,
)
trainer.train()


ValidationError: ignored