In [None]:
# loosely based on https://www.linkedin.com/pulse/fine-tuning-gpt-2-large-language-model-unlocking-its-adamson-mbcs

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, GPT2Config, GPT2ForQuestionAnswering
from transformers import TrainingArguments, Trainer
from torch.utils.data import Dataset
import copy
import pandas as pd
import re
import transformers
import torch

transformers.logging.set_verbosity_error()

base_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
base_model = AutoModelForCausalLM.from_pretrained("gpt2-medium", pad_token_id = base_tokenizer.eos_token_id)

2024-02-19 22:42:19.104128: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-19 22:42:19.134350: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 22:42:19.134390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 22:42:19.135347: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-19 22:42:19.140836: I tensorflow/core/platform/cpu_feature_guar

In [2]:
print(f"Base model architecture:\n", base_model)

Base model architecture:
 GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)


In [3]:
# Load the SQuAD dataset, roughly 9:1:1, train:valid:test
train_squad = load_dataset("squad")["train"].train_test_split(test_size=0.12)
train_dataset = train_squad["train"]
valid_dataset = load_dataset("squad")["validation"]
test_dataset = train_squad["test"]

In [4]:
# sanity checking
len(train_dataset), len(valid_dataset), len(test_dataset)
print(train_dataset[0])

{'id': '56d3e4762ccc5a1400d82f21', 'title': 'To_Kill_a_Mockingbird', 'context': 'During the years immediately following the novel\'s publication, Harper Lee enjoyed the attention its popularity garnered her, granting interviews, visiting schools, and attending events honoring the book. In 1961, when To Kill a Mockingbird was in its 41st week on the bestseller list, it was awarded the Pulitzer Prize, stunning Lee. It also won the Brotherhood Award of the National Conference of Christians and Jews in the same year, and the Paperback of the Year award from Bestsellers magazine in 1962. Starting in 1964, Lee began to turn down interviews, complaining that the questions were monotonous, and grew concerned that attention she received bordered on the kind of publicity celebrities sought. Since the, she declined talking with reporters about the book. She also steadfastly refused to provide an introduction, writing in 1995: "Introductions inhibit pleasure, they kill the joy of anticipation, the

In [5]:
base_tokenizer.padding_side = "left"
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

def encode(examples):
    contexes = examples["context"]
    questions = examples["question"]
    answers = examples["answers"]
    samples = [f"{context}\n{question}\n{answer['text'][0]}" for context, question, answer in zip(contexes, questions, answers)]
    return base_tokenizer(samples, truncation=True, padding="max_length")

train_dataset = train_dataset.map(encode, batched=True)
valid_dataset = valid_dataset.map(encode, batched=True)

Map:   0%|          | 0/77087 [00:00<?, ? examples/s]

In [7]:
# # get smaller datasets to test with
small_train_dataset = train_dataset.shuffle(seed=42).select(range(100))
small_valid_dataset = valid_dataset.shuffle(seed=42).select(range(25))

In [6]:
def print_decodes(decodes):
    for i, d in enumerate(decodes):
        print(f"{i}: {d}\n")

def get_question(sample):
    return f'{sample["context"]}\n{sample["question"]}'

def get_prediction(prompt, model, tokenizer, max_tokens=50):
    model.eval()
    input_text = [prompt]
    prompts = [torch.tensor(tokenizer.encode(s)).unsqueeze(0) for s in input_text]
    out0 = [tokenizer.decode(
        model.generate(p, 
                            max_length=p.shape[-1]+max_tokens)[0,:]) for p in prompts]
    print_decodes(out0)
    return out0

def get_model_answer(index, dataset, model, tokenizer, max_tokens=50):
    prompt = get_question(dataset[index])
    prediction = get_prediction(prompt, model, tokenizer, max_tokens)
    print("\nAnswer key: ", dataset[index]["answers"]["text"][0])
    return prediction

In [10]:
# Test base model prediction
INDEX = 1
answer = get_model_answer(INDEX, small_train_dataset, base_model, base_tokenizer)

0: Federal law originates with the Constitution, which gives Congress the power to enact statutes for certain limited purposes like regulating interstate commerce. The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations generally also carry the force of law under the Chevron doctrine. Many lawsuits turn on the meaning of a federal statute or regulation, and judicial interpretations of such meaning carry legal force under the principle of stare decisis.
Who do the statutes give the power of creating regulations?

The statutes give the power to create regulations to the executive branch agencies, which are the federal agencies that are responsible for the administration of the federal government. The statutes also give the power to create regulatio

In [12]:
def make_pruned(model: GPT2Model, layers: list):
    if model.config.n_layer < len(layers):
        print(f"List of layers too long")
        return
    if any([l for l in layers if l >= model.config.n_layer or l < 0]):
        print(f"All layers specified must be indexes _less_ than number of layers available")
        return
    
    layers.sort()
    print(f"Pruning {len(layers)} layer(s)...")
    pruned_config = copy.deepcopy(model.config)
    pruned_config.n_layer -= len(layers)
    pruned_model = GPT2LMHeadModel(pruned_config)

    pruned_states = []
    for layer in layers:
        pruned_states += list(filter(
            lambda s: re.search(f'transformer.h\.{layer}\.',s) is not None,
            model.state_dict().keys()))
    print(f"Dropping these states: {pruned_states}")

    base = dict(model.named_parameters())
    pruned = dict(pruned_model.named_parameters())

    prev_base_idx = -1
    pruned_idx = 0
    prev_skipped = False
    copied_states = []
    
    for k, v in model.named_parameters():
        base_idx = re.search(r".h.([0-9]+).", k)
        if base_idx:
            base_idx = int(base_idx.group(1))
            if base_idx in layers:
                # the next base layer to copy should go into the current pruned layer
                if prev_base_idx != base_idx and not prev_skipped and pruned_idx > 0:
                    pruned_idx += 1
                prev_skipped = True
                continue                
            if prev_base_idx != base_idx and not prev_skipped and base_idx > 0:
                pruned_idx += 1
            prev_skipped = False
            copied_states.append(k)
            k = re.sub(f".h.{base_idx}.", f".h.{pruned_idx}.", k)
            pruned[k].data = copy.deepcopy(v.data)
            prev_base_idx = base_idx
        else:
            copied_states.append(k)
            pruned[k].data = copy.deepcopy(v.data)
            
    print(f"Copied these states into the pruned model: {copied_states}")
    print(f"Pruned model architecture: {pruned_model}")
    return pruned_model

In [13]:
pruned_model = make_pruned(base_model, [23])

Pruning 1 layer(s)...
Dropping these states: ['transformer.h.23.ln_1.weight', 'transformer.h.23.ln_1.bias', 'transformer.h.23.attn.c_attn.weight', 'transformer.h.23.attn.c_attn.bias', 'transformer.h.23.attn.c_proj.weight', 'transformer.h.23.attn.c_proj.bias', 'transformer.h.23.ln_2.weight', 'transformer.h.23.ln_2.bias', 'transformer.h.23.mlp.c_fc.weight', 'transformer.h.23.mlp.c_fc.bias', 'transformer.h.23.mlp.c_proj.weight', 'transformer.h.23.mlp.c_proj.bias']
Copied these states into the pruned model: ['transformer.wte.weight', 'transformer.wpe.weight', 'transformer.h.0.ln_1.weight', 'transformer.h.0.ln_1.bias', 'transformer.h.0.attn.c_attn.weight', 'transformer.h.0.attn.c_attn.bias', 'transformer.h.0.attn.c_proj.weight', 'transformer.h.0.attn.c_proj.bias', 'transformer.h.0.ln_2.weight', 'transformer.h.0.ln_2.bias', 'transformer.h.0.mlp.c_fc.weight', 'transformer.h.0.mlp.c_fc.bias', 'transformer.h.0.mlp.c_proj.weight', 'transformer.h.0.mlp.c_proj.bias', 'transformer.h.1.ln_1.weight',

In [14]:
# Test pruned model prediction
answer_pruned = get_model_answer(INDEX, small_train_dataset, pruned_model, base_tokenizer)

0: Federal law originates with the Constitution, which gives Congress the power to enact statutes for certain limited purposes like regulating interstate commerce. The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations generally also carry the force of law under the Chevron doctrine. Many lawsuits turn on the meaning of a federal statute or regulation, and judicial interpretations of such meaning carry legal force under the principle of stare decisis.
Who do the statutes give the power of creating regulations?
The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register an

In [19]:
# for setting up wandb:
# import wandb
# wandb.login()

# wandb.init(
#     project="gpt2-pruning",
#     config={
#         "batch_size": BATCH_SIZE,
#         "learning_rate": LEARNING_RATE,
#         "dataset": "SQuAD",
#     },
# )

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/hannahyl/.netrc


True

In [21]:
BATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 1e-5
LOGGING_STEPS = 2
SAVE_STEPS = 100

In [18]:
def fine_tune_gpt2(model, 
                   tokenizer, 
                   train_dataset, 
                   valid_dataset, 
                   train_output_dir,
                   save_model_dir):
    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=train_output_dir, 
        evaluation_strategy = "steps", 
        logging_steps = LOGGING_STEPS,
        logging_strategy = "steps",
        save_steps = SAVE_STEPS,
        num_train_epochs = EPOCHS,
        per_device_train_batch_size = BATCH_SIZE,
        per_device_eval_batch_size = BATCH_SIZE,
        learning_rate = LEARNING_RATE,
        # optim="paged_adamw_32bit",
        report_to="wandb",
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    trainer.train()

    log_history = pd.DataFrame(trainer.state.log_history)
    print(log_history)

    # Save the fine-tuned model
    model.save_pretrained(save_model_dir)
    tokenizer.save_pretrained(save_model_dir)

In [19]:
fine_tune_gpt2(base_model, 
               base_tokenizer, 
               small_train_dataset['input_ids'], 
               small_valid_dataset['input_ids'],
               "train_log",
               "trained_model")

[34m[1mwandb[0m: Currently logged in as: [33mhylin[0m ([33muw-hannahyl[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'train_runtime': 472.1118, 'train_samples_per_second': 0.212, 'train_steps_per_second': 0.015, 'train_loss': 6.195960453578404, 'epoch': 1.0}
   train_runtime  train_samples_per_second  train_steps_per_second  \
0       472.1118                     0.212                   0.015   

     total_flos  train_loss  epoch  step  
0  1.857401e+14     6.19596    1.0     7  


In [20]:
answer_finetuned = get_model_answer(INDEX, small_train_dataset, base_model, base_tokenizer)

0: Federal law originates with the Constitution, which gives Congress the power to enact statutes for certain limited purposes like regulating interstate commerce. The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations generally also carry the force of law under the Chevron doctrine. Many lawsuits turn on the meaning of a federal statute or regulation, and judicial interpretations of such meaning carry legal force under the principle of stare decisis.
Who do the statutes give the power of creating regulations?

The statutes give the power to create regulations to the executive branch agencies, which are the federal agencies that are responsible for the administration of the federal government. The statutes also give the power to create regulatio

In [22]:
fine_tune_gpt2(pruned_model, 
               base_tokenizer, 
               small_train_dataset['input_ids'], 
               small_valid_dataset['input_ids'],
               "train_log",
               "trained_model_pruned")

{'loss': 7.4325, 'learning_rate': 7.1428571428571436e-06, 'epoch': 0.29}
{'eval_loss': 6.0187788009643555, 'eval_runtime': 35.4375, 'eval_samples_per_second': 0.705, 'eval_steps_per_second': 0.056, 'epoch': 0.29}
{'loss': 6.1822, 'learning_rate': 4.2857142857142855e-06, 'epoch': 0.57}
{'eval_loss': 5.372560024261475, 'eval_runtime': 34.8354, 'eval_samples_per_second': 0.718, 'eval_steps_per_second': 0.057, 'epoch': 0.57}
{'loss': 5.8106, 'learning_rate': 1.4285714285714286e-06, 'epoch': 0.86}
{'eval_loss': 5.044312953948975, 'eval_runtime': 34.615, 'eval_samples_per_second': 0.722, 'eval_steps_per_second': 0.058, 'epoch': 0.86}
{'train_runtime': 467.221, 'train_samples_per_second': 0.214, 'train_steps_per_second': 0.015, 'train_loss': 6.378403731754848, 'epoch': 1.0}
     loss  learning_rate  epoch  step  eval_loss  eval_runtime  \
0  7.4325       0.000007   0.29     2        NaN           NaN   
1     NaN            NaN   0.29     2   6.018779       35.4375   
2  6.1822       0.000004

In [23]:
answer_finetuned_pruned = get_model_answer(INDEX, small_train_dataset, pruned_model, base_tokenizer)

0: Federal law originates with the Constitution, which gives Congress the power to enact statutes for certain limited purposes like regulating interstate commerce. The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations generally also carry the force of law under the Chevron doctrine. Many lawsuits turn on the meaning of a federal statute or regulation, and judicial interpretations of such meaning carry legal force under the principle of stare decisis.
Who do the statutes give the power of creating regulations?
The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register an

In [25]:
print("The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations")


The United States Code is the official compilation and codification of the general and permanent federal statutes. Many statutes give executive branch agencies the power to create regulations, which are published in the Federal Register and codified into the Code of Federal Regulations. Regulations
