Import necessary things

In [1]:
#THIS IS FOR USING JUST ONE GPU
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [2]:
from datasets import load_dataset, DatasetDict
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch, re, json
from sklearn.model_selection import train_test_split
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


Verify if we have GPU

In [None]:
#HERE WE CONFIRM THAT WE ARE JUST USING 1 GPU
print("CUDA available?", torch.cuda.is_available())

if torch.cuda.is_available():
    n_gpus = torch.cuda.device_count()
    print(f"Number of CUDA devices: {n_gpus}")


CUDA available? True
Number of CUDA devices: 1


Load dataset and split it

In [4]:
# Load and mix dataset
gsm8k = load_dataset("gsm8k", "main")
all_data = gsm8k["train"].shuffle(seed=42)

# Select quantity
max_samples = min(8000, len(all_data))
all_data = all_data.select(range(max_samples))

# Divide trainig, val and test `train_test_split` de Hugging Face
split = all_data.train_test_split(test_size=200, seed=42)
train_val = split["train"]
test_data = split["test"]

split2 = train_val.train_test_split(test_size=800, seed=42)
train_data = split2["train"]
val_data = split2["test"]

print(f"Splits - Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")


Splits - Train: 6473, Val: 800, Test: 200


Load tokenizer and model GPT-2

In [5]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

5. Preprocessing Function
This function prepares the data for training:

- It creates input prompts in the format "Q: <question>\nA:".

- It uses the tokenizer to convert both the input and the answer (target) into token IDs.

- It adds those target token IDs as "labels" — which is what the model will try to predict.

In [6]:
def preprocess_fn(examples):
    # Format: Q: <question>\nA: <answer>
    texts = ["Q: " + q + "\nA: " + a for q, a in zip(examples["question"], examples["answer"])]
    # Tokenize full sequence (input + target)
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=512
    )
    # We use the same tokens for both input and labels (causal LM)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


6. Tokenizing the dataset

Applies the preprocess_fn to the training and validation datasets to convert them into tokenized versions that are ready for training.

In [7]:
tokenized_train = train_data.map(preprocess_fn, batched=True, remove_columns=train_data.column_names)
tokenized_val   = val_data.map(preprocess_fn, batched=True, remove_columns=val_data.column_names)


Map: 100%|██████████| 6473/6473 [00:05<00:00, 1236.42 examples/s]
Map: 100%|██████████| 800/800 [00:00<00:00, 1165.63 examples/s]


7. Training Configuration

Sets up the training parameters:

How many epochs to train
Batch sizes
Learning rate
Whether to evaluate and save every epoch
Logging frequency

In [9]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./gpt2-gsm8k-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    push_to_hub=False
)


8. Training the model

- Wraps the model and data in a Trainer from Hugging Face.

- Begins training (fine-tuning) GPT-2 on the GSM8K dataset.

- Saves the fine-tuned model when finished.



In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer
)

print("Starting fine-tuning…")
trainer.train()
trainer.save_model("./gpt2-gsm8k-finetuned")


  trainer = Trainer(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Starting fine-tuning…


Epoch,Training Loss,Validation Loss
1,0.4841,0.460805


 9. Load the fine-tuned model for testing

 Loads the model that was just fine-tuned and puts it into evaluation mode (so it doesn't keep training or updating weights)

In [11]:
model = GPT2LMHeadModel.from_pretrained("./gpt2-gsm8k-finetuned")
model.to(device)
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

10. Generate predictions on test set

For each test question:

- Generates an answer using the fine-tuned model.

- Extracts the numeric answer from both the prediction and the ground truth.

- Compares them and stores the result if correct.

In [13]:
def extract_number(text):
    m = re.search(r"####\s*(-?\d+)", text)
    return int(m.group(1)) if m else None

predictions, correct_preds = [], []
print("Generating answers on test set…")
for item in tqdm(test_data):
    q, gold = item["question"], item["answer"]
    prompt = f"Q: {q}\nA:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            temperature=0.7,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    pred_text = decoded.split("A:")[-1].strip()
    predictions.append({"question": q, "predicted_answer": pred_text, "gold_answer": gold})
    if (pn := extract_number(pred_text)) is not None and pn == extract_number(gold):
        correct_preds.append(predictions[-1])


Generating answers on test set…


100%|██████████| 200/200 [01:46<00:00,  1.87it/s]


11. Calculate accuracy and save results

- Prints how many answers were correct.

- Calculates and displays the accuracy.

- Saves all predictions and correct predictions to .json files.

In [14]:
print(f"\nTest size: {len(predictions)}")
print(f"Correct:   {len(correct_preds)}")
print(f"Accuracy:  {len(correct_preds)/len(predictions):.2%}")

with open("finetuned_predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

with open("finetuned_correct.json", "w") as f:
    json.dump(correct_preds, f, indent=2)

print("Done.")



Test size: 200
Correct:   2
Accuracy:  1.00%
Done.
