<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math_hpo_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hyper-Parameter optimization: Fine tune gpt2-medium for basic math tasks

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install transformers datasets
!pip install torch
!pip install optuna


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

Use the dataset math_dataset.json that includes 20000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [5]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)  # Ensure reproducibility with a fixed seed

# Split into training (15000 examples) and the rest
train_dataset = dataset.select(range(15000))
remaining_dataset = dataset.select(range(15000, len(dataset)))

# Split the remaining dataset into validation (2500 examples) and test (2500 examples)
eval_dataset = remaining_dataset.select(range(2500))
test_dataset = remaining_dataset.select(range(2500, 5000))

# Print dataset sizes
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")
print(f"Test examples: {len(test_dataset)}")


Training examples: 15000
Validation examples: 2500
Test examples: 2500


In [None]:
#print(train_dataset[0])

### Use the Hugging Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Start by finding right max_length for padding


In [8]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [9]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [10]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [11]:
# tokenize dataset

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [12]:
# mask eos tokens in labels to avoid their prediction in final outcome

def mask_eos_in_labels(example):
    example["labels"] = [token if token != tokenizer.eos_token_id else -100 for token in example["labels"]]
    return example

train_dataset = train_dataset.map(mask_eos_in_labels)
eval_dataset = eval_dataset.map(mask_eos_in_labels)
test_dataset = test_dataset.map(mask_eos_in_labels)


print(train_dataset[0])
print(eval_dataset[0])
print(test_dataset[0])

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

{'math_problem': '77 times 26 equals 2002', 'input_ids': [3324, 1661, 2608, 21767, 6244, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3324, 1661, 2608, 21767, 6244, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
{'math_problem': '55 plus 61 equals 116', 'input_ids': [2816, 5556, 8454, 21767, 18693, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

Train with hyperparameter optimization using optuna

In [13]:
import optuna
from transformers import TrainingArguments, Trainer

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_int("num_epochs", 1, 5)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./gpt2_finetuned_optuna",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=50,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train the model and evaluate on validation set
    trainer.train()
    eval_results = trainer.evaluate()

    # Use validation loss as the optimization metric
    return eval_results["eval_loss"]

# Run the hyperparameter search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=True)
print("Tested learning rates:", [trial.params["learning_rate"] for trial in study.trials])

# Best hyperparameters
print("Best hyperparameters according to HPO:", study.best_params)



[I 2024-12-07 12:17:16,903] A new study created in memory with name: no-name-84ce0d6f-baff-4663-9c3c-838e0ab5bcf5


  0%|          | 0/10 [00:00<?, ?it/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.0704,1.990416
2,1.9733,1.91499
3,1.9537,1.878519


[I 2024-12-07 12:28:23,670] Trial 0 finished with value: 1.8785187005996704 and parameters: {'learning_rate': 8.592574905514928e-06, 'batch_size': 8, 'num_epochs': 3}. Best is trial 0 with value: 1.8785187005996704.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.9133,1.864237
2,1.8845,1.847694
3,1.9531,1.84306
4,1.9332,1.840677


[I 2024-12-07 12:57:27,533] Trial 1 finished with value: 1.8406773805618286 and parameters: {'learning_rate': 1.243022616319868e-06, 'batch_size': 4, 'num_epochs': 4}. Best is trial 1 with value: 1.8406773805618286.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8453,1.824041


[I 2024-12-07 13:05:00,119] Trial 2 finished with value: 1.8240410089492798 and parameters: {'learning_rate': 3.382620648962999e-06, 'batch_size': 4, 'num_epochs': 1}. Best is trial 2 with value: 1.8240410089492798.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8884,1.824986
2,1.8422,1.817851
3,1.8602,1.809869
4,1.8692,1.805611
5,1.8392,1.803984


[I 2024-12-07 13:18:06,172] Trial 3 finished with value: 1.8039839267730713 and parameters: {'learning_rate': 1.667715758892236e-06, 'batch_size': 16, 'num_epochs': 5}. Best is trial 3 with value: 1.8039839267730713.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8322,1.786212


[I 2024-12-07 13:20:26,637] Trial 4 finished with value: 1.7862119674682617 and parameters: {'learning_rate': 7.229215907039781e-05, 'batch_size': 16, 'num_epochs': 1}. Best is trial 4 with value: 1.7862119674682617.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.8437,1.821896
2,1.6528,1.708068
3,1.5263,1.646891


[I 2024-12-07 13:34:00,591] Trial 5 finished with value: 1.6468908786773682 and parameters: {'learning_rate': 7.924909961513882e-05, 'batch_size': 8, 'num_epochs': 3}. Best is trial 5 with value: 1.6468908786773682.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.4724,1.650057
2,1.4157,1.646243
3,1.5078,1.649739


[I 2024-12-07 13:58:53,722] Trial 6 finished with value: 1.6497392654418945 and parameters: {'learning_rate': 6.239042993444806e-06, 'batch_size': 4, 'num_epochs': 3}. Best is trial 5 with value: 1.6468908786773682.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3497,1.666914
2,1.3179,1.672982
3,1.418,1.668067
4,1.4016,1.662256
5,1.3932,1.66112


[I 2024-12-07 14:20:37,327] Trial 7 finished with value: 1.6611196994781494 and parameters: {'learning_rate': 2.6313410210733257e-06, 'batch_size': 8, 'num_epochs': 5}. Best is trial 5 with value: 1.6468908786773682.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3371,1.680477
2,1.3293,1.672905
3,1.4507,1.672016


[I 2024-12-07 14:46:31,173] Trial 8 finished with value: 1.6720163822174072 and parameters: {'learning_rate': 6.186851911360703e-06, 'batch_size': 4, 'num_epochs': 3}. Best is trial 5 with value: 1.6468908786773682.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2185,1.701612


[I 2024-12-07 14:54:55,869] Trial 9 finished with value: 1.701612114906311 and parameters: {'learning_rate': 1.5545504961876603e-06, 'batch_size': 4, 'num_epochs': 1}. Best is trial 5 with value: 1.6468908786773682.
Tested learning rates: [8.592574905514928e-06, 1.243022616319868e-06, 3.382620648962999e-06, 1.667715758892236e-06, 7.229215907039781e-05, 7.924909961513882e-05, 6.239042993444806e-06, 2.6313410210733257e-06, 6.186851911360703e-06, 1.5545504961876603e-06]
Best hyperparameters according to HPO: {'learning_rate': 7.924909961513882e-05, 'batch_size': 8, 'num_epochs': 3}


In [14]:
stop_execution()

NameError: name 'stop_execution' is not defined

Check the HPO outcomes and choose the best hyperparameters: the code above does not take into account overfitting so make sure to choose well!

In [15]:
best_params = {'learning_rate': 7.924909961513882e-05, 'batch_size': 8, 'num_epochs': 3}

In [16]:
# fine tune using best HPs

best_hyperparams = best_params

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_hpo_final",
    evaluation_strategy="epoch",
    learning_rate=best_hyperparams["learning_rate"],
    per_device_train_batch_size=best_hyperparams["batch_size"],
    num_train_epochs=best_hyperparams["num_epochs"],
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.5602,1.790612
2,1.4065,1.740288
3,1.3337,1.724125


TrainOutput(global_step=5625, training_loss=1.445875030517578, metrics={'train_runtime': 977.3048, 'train_samples_per_second': 46.045, 'train_steps_per_second': 5.756, 'total_flos': 2611970703360000.0, 'train_loss': 1.445875030517578, 'epoch': 3.0})

In [17]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")


Pad token ID: 50256
EOS token ID: 50256


In [18]:
# Try out the model

# Function to predict the next token
def predict_next_token(input_text):
    # Tokenize and move input to GPU
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    # Generate only the next token
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,  # Generate only one token
        pad_token_id=tokenizer.pad_token_id,  # Ensure padding is handled
        eos_token_id=tokenizer.eos_token_id  # Set EOS token explicitly
    )
    # Decode and return the output
    return tokenizer.decode(outputs[0])

# Test cases
input_texts = [
    "76 plus 24 equals",
    "80 minus 24 equals",
    "4 times 10 equals",
    "40 divided by 5 equals"
]

# Generate and print the next token for each input
for input_text in input_texts:
    next_token = predict_next_token(input_text)
    print(f"Input: {input_text}\nNext Token: {next_token}\n")


Input: 76 plus 24 equals
Next Token: 76 plus 24 equals 102

Input: 80 minus 24 equals
Next Token: 80 minus 24 equals 54

Input: 4 times 10 equals
Next Token: 4 times 10 equals 40

Input: 40 divided by 5 equals
Next Token: 40 divided by 5 equals 8



In [31]:
# get performance estiamtion in terms of accuracy

from sklearn.metrics import accuracy_score

# Load tokenized test dataset (example format; replace with your actual data)
# Each entry is a dictionary with 'input_ids' and 'labels', and padding tokens are set to -100 in 'labels'.
test_data = test_dataset
print(test_data[0])

# Prepare data for evaluation
prompts = []
correct_token = []

for item in test_data:
    #print(item)
    # Find the last non-padding token (label not equal to -100)
    last_valid_index = item['labels'].index(-100) - 1
    prompt = item['input_ids'][:last_valid_index] # Input up to but not including the last token
    #print(f"prompt tokens: {prompt}")
    prompts.append(prompt)
    target = item['labels'][:(last_valid_index+1)]  # The complete target of prediction
    #print(f"target token: {target} and decoded taget token: {tokenizer.decode(target)}")
    correct_token.append(tokenizer.decode(target))

# Evaluate accuracy for predicting the last valid token
predicted_tokens = []
for prompt in prompts:
    next_token = predict_next_token(tokenizer.decode(prompt)) # feed prompt into model as decoded string
    predicted_tokens.append(next_token)

# check everything is stored correctly -- should be two mathematical equations (ideally the same)
print(f"target token: {correct_token[0]}")
print(f"predicted token: {predicted_tokens[0]}")

# Calculate accuracy
accuracy = accuracy_score(correct_token, predicted_tokens)
accuracy_percentage = accuracy * 100

# Display results
print(f"Accuracy for predicting the last valid token: {accuracy_percentage:.2f}%")



{'math_problem': '4 plus 27 equals 31', 'input_ids': [19, 5556, 2681, 21767, 3261, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [19, 5556, 2681, 21767, 3261, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
target token: 4 plus 27 equals 31
predicted token: 4 plus 27 equals 31
Accuracy for predicting the last valid token: 52.00%


Save the fine-tuned model for future use

In [20]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_finetuned_hpo_final")
tokenizer.save_pretrained("./gpt2_finetuned_hpo_final")


('./gpt2_finetuned_hpo_final/tokenizer_config.json',
 './gpt2_finetuned_hpo_final/special_tokens_map.json',
 './gpt2_finetuned_hpo_final/vocab.json',
 './gpt2_finetuned_hpo_final/merges.txt',
 './gpt2_finetuned_hpo_final/added_tokens.json')

In [21]:
!zip -r gpt2_finetuned_hpo_final.zip ./gpt2_finetuned_hpo_final


  adding: gpt2_finetuned_hpo_final/ (stored 0%)
  adding: gpt2_finetuned_hpo_final/merges.txt (deflated 53%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/ (stored 0%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/scheduler.pt (deflated 56%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/merges.txt (deflated 53%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/model.safetensors (deflated 7%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/generation_config.json (deflated 24%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/rng_state.pth (deflated 25%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/vocab.json (deflated 68%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/special_tokens_map.json (deflated 74%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/config.json (deflated 52%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/optimizer.pt (deflated 8%)
  adding: gpt2_finetuned_hpo_final/checkpoint-5625/tokenizer_config.json (deflated 55%)
  adding

In [22]:
!mv gpt2_finetuned_hpo_final.zip /content/drive/MyDrive/
