<a href="https://colab.research.google.com/github/khnhenriette/ProjectADL/blob/math-medium/notebooks/medium_fine_tune_math_hpo_small.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hyper-Parameter optimization: Fine tune gpt2-medium for basic math tasks

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers datasets
!pip install torch
!pip install optuna




Use the dataset math_dataset.json that includes 20000 simple math examples of the form "89 minus 84 equals 5" using addition, subtraction, multiplication and division -- ensure dataset is uploaded to Google Colab before running

In [None]:
import pandas as pd
from datasets import Dataset

# Load the CSV dataset
df = pd.read_csv('math_dataset.csv')  # Update this path if using Google Drive

# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Split dataset into training and validation sets
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")


Training examples: 18000
Validation examples: 2000


In [None]:
print(train_dataset[4])

{'math_problem': '97 plus 29 equals 126'}


### Use the Hugging Face Trainer to fine-tune the available gpt2-medium to perform better on the simple math tasks

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load GPT-2 Medium tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

Start by finding right max_length for padding


In [None]:
# Check the longest sequence in your dataset
max_tokenized_length = max(len(tokenizer(example)['input_ids']) for example in dataset['math_problem'])
print(f"Maximum tokenized length: {max_tokenized_length}")


Maximum tokenized length: 8


In [None]:

# Set the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# choose max_length slightly higher than longest sequence in dataset
max_length = 32

# Tokenize the dataset
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["math_problem"],
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # Labels are the same as input_ids for causal language modeling
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs


In [None]:
# ensure padding and masking are done correctly

testing = tokenizer("3 plus 65 equals 68", padding="max_length", max_length=10)
print(testing)


{'input_ids': [18, 5556, 6135, 21767, 8257, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]}


In [None]:
# tokenize dataset

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# mask eos tokens in labels to avoid their prediction in final outcome

def mask_eos_in_labels(example):
    example["labels"] = [token if token != tokenizer.eos_token_id else -100 for token in example["labels"]]
    return example

train_dataset = train_dataset.map(mask_eos_in_labels)
eval_dataset = eval_dataset.map(mask_eos_in_labels)


print(train_dataset[0])
print(eval_dataset[0])


Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

{'math_problem': '45 plus 47 equals 92', 'input_ids': [2231, 5556, 6298, 21767, 10190, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [2231, 5556, 6298, 21767, 10190, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]}
{'math_problem': '77 times 26 equals 2002', 'input_ids': [3324, 1661, 2608, 21767, 6244, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'label

Train with hyperparameter optimization using optuna

In [None]:
import optuna
from transformers import TrainingArguments, Trainer

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_epochs = trial.suggest_int("num_epochs", 1, 5)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./gpt2_finetuned_optuna",
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_steps=500,
        save_total_limit=2,
        logging_dir="./logs",
        logging_steps=50,
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train the model and evaluate on validation set
    trainer.train()
    eval_results = trainer.evaluate()

    # Use validation loss as the optimization metric
    return eval_results["eval_loss"]

# Run the hyperparameter search
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10, show_progress_bar=True)
print("Tested learning rates:", [trial.params["learning_rate"] for trial in study.trials])

# Best hyperparameters
print("Best hyperparameters:", study.best_params)



[I 2024-11-29 12:33:52,863] A new study created in memory with name: no-name-38df1899-9b7b-43c2-8018-4f84dfd88175


  0%|          | 0/10 [00:00<?, ?it/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.9691,1.919592
2,1.843,1.811422
3,1.7134,1.715102
4,1.6562,1.669262
5,1.6035,1.642913


[I 2024-11-29 12:48:00,318] Trial 0 finished with value: 1.6429134607315063 and parameters: {'learning_rate': 3.1451779331894924e-05, 'batch_size': 16, 'num_epochs': 5}. Best is trial 0 with value: 1.6429134607315063.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.5131,1.643974
2,1.4846,1.642166
3,1.4678,1.642921
4,1.4952,1.643495
5,1.5839,1.641744


[I 2024-11-29 13:01:53,160] Trial 1 finished with value: 1.641743540763855 and parameters: {'learning_rate': 1.3011956127684716e-06, 'batch_size': 16, 'num_epochs': 5}. Best is trial 1 with value: 1.641743540763855.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6081,1.638092


[I 2024-11-29 13:10:40,149] Trial 2 finished with value: 1.6380916833877563 and parameters: {'learning_rate': 1.0813150432375988e-05, 'batch_size': 4, 'num_epochs': 1}. Best is trial 2 with value: 1.6380916833877563.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6211,1.721321
2,1.5326,1.696574
3,1.4675,1.663331
4,1.4566,1.652322
5,1.4434,1.634947


[I 2024-11-29 13:25:25,580] Trial 3 finished with value: 1.6349472999572754 and parameters: {'learning_rate': 3.3079548312072015e-05, 'batch_size': 16, 'num_epochs': 5}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.3232,1.6682
2,1.2845,1.66311


[I 2024-11-29 13:31:30,974] Trial 4 finished with value: 1.6631098985671997 and parameters: {'learning_rate': 8.642603674751315e-06, 'batch_size': 16, 'num_epochs': 2}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.6066,1.708052
2,1.4723,1.638396


[I 2024-11-29 13:58:11,350] Trial 5 finished with value: 1.6383955478668213 and parameters: {'learning_rate': 3.508166867022383e-05, 'batch_size': 4, 'num_epochs': 2}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2852,1.692338
2,1.4624,1.683957


[I 2024-11-29 14:22:22,016] Trial 6 finished with value: 1.6839567422866821 and parameters: {'learning_rate': 5.420736008669886e-06, 'batch_size': 4, 'num_epochs': 2}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.1753,1.730232
2,1.4677,1.695376
3,1.3505,1.677818
4,1.3469,1.674814


[I 2024-11-29 14:52:20,584] Trial 7 finished with value: 1.674813985824585 and parameters: {'learning_rate': 1.1859421945618443e-06, 'batch_size': 4, 'num_epochs': 4}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.1723,1.753888
2,1.4281,1.719554


[I 2024-11-29 15:07:32,724] Trial 8 finished with value: 1.71955406665802 and parameters: {'learning_rate': 8.386671396424004e-06, 'batch_size': 4, 'num_epochs': 2}. Best is trial 3 with value: 1.6349472999572754.


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2805,1.701427
2,1.2782,1.686461
3,1.2429,1.686165
4,1.2464,1.691559


[I 2024-11-29 15:18:04,363] Trial 9 finished with value: 1.6915593147277832 and parameters: {'learning_rate': 1.4180346515805114e-05, 'batch_size': 16, 'num_epochs': 4}. Best is trial 3 with value: 1.6349472999572754.
Tested learning rates: [3.1451779331894924e-05, 1.3011956127684716e-06, 1.0813150432375988e-05, 3.3079548312072015e-05, 8.642603674751315e-06, 3.508166867022383e-05, 5.420736008669886e-06, 1.1859421945618443e-06, 8.386671396424004e-06, 1.4180346515805114e-05]
Best hyperparameters: {'learning_rate': 3.3079548312072015e-05, 'batch_size': 16, 'num_epochs': 5}


In [None]:
# fine tune using best HPs

best_hyperparams = study.best_params

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned_hpo",
    evaluation_strategy="epoch",
    learning_rate=best_hyperparams["learning_rate"],
    per_device_train_batch_size=best_hyperparams["batch_size"],
    num_train_epochs=best_hyperparams["num_epochs"],
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.2497,1.807393
2,1.242,1.760213
3,1.2315,1.743834
4,1.2529,1.725759
5,1.2612,1.722523


TrainOutput(global_step=5625, training_loss=1.2144504153781468, metrics={'train_runtime': 780.772, 'train_samples_per_second': 115.271, 'train_steps_per_second': 7.204, 'total_flos': 5223941406720000.0, 'train_loss': 1.2144504153781468, 'epoch': 5.0})

Save the fine-tuned model for future use

In [None]:
# Save the model and tokenizer
model.save_pretrained("./gpt2_finetuned_hpo")
tokenizer.save_pretrained("./gpt2_finetuned_hpo")


('./gpt2_finetuned_hpo/tokenizer_config.json',
 './gpt2_finetuned_hpo/special_tokens_map.json',
 './gpt2_finetuned_hpo/vocab.json',
 './gpt2_finetuned_hpo/merges.txt',
 './gpt2_finetuned_hpo/added_tokens.json')

In [None]:
!zip -r gpt2_finetuned_hpo.zip ./gpt2_finetuned_hpo


  adding: gpt2_finetuned_hpo/ (stored 0%)
  adding: gpt2_finetuned_hpo/tokenizer_config.json (deflated 55%)
  adding: gpt2_finetuned_hpo/special_tokens_map.json (deflated 74%)
  adding: gpt2_finetuned_hpo/merges.txt (deflated 53%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/ (stored 0%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/rng_state.pth (deflated 25%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/tokenizer_config.json (deflated 55%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/special_tokens_map.json (deflated 74%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/trainer_state.json (deflated 79%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/merges.txt (deflated 53%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/vocab.json (deflated 68%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/model.safetensors (deflated 7%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/training_args.bin (deflated 51%)
  adding: gpt2_finetuned_hpo/checkpoint-5500/scheduler.pt (deflated 55%)
  adding: gpt2_finetun

In [None]:
!mv gpt2_finetuned_hpo.zip /content/drive/MyDrive/


In [None]:
# Try out the model

# Function to predict the next token
def predict_next_token(input_text):
    # Tokenize and move input to GPU
    inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
    # Generate only the next token
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,  # Generate only one token
        pad_token_id=tokenizer.pad_token_id,  # Ensure padding is handled
        eos_token_id=tokenizer.eos_token_id  # Set EOS token explicitly
    )
    # Decode and return the output
    return tokenizer.decode(outputs[0])

# Test cases
input_texts = [
    "76 plus 24 equals",
    "80 minus 24 equals",
    "4 times 10 equals",
    "40 divided by 5 equals"
]

# Generate and print the next token for each input
for input_text in input_texts:
    next_token = predict_next_token(input_text)
    print(f"Input: {input_text}\nNext Token: {next_token}\n")


Input: 76 plus 24 equals
Next Token: 76 plus 24 equals 100

Input: 80 minus 24 equals
Next Token: 80 minus 24 equals 56

Input: 4 times 10 equals
Next Token: 4 times 10 equals 40

Input: 40 divided by 5 equals
Next Token: 40 divided by 5 equals 8



In [None]:
# verify tokens are set consistently

print(f"Pad token ID: {tokenizer.pad_token_id}")
print(f"EOS token ID: {tokenizer.eos_token_id}")


Pad token ID: 50256
EOS token ID: 50256
