In [1]:
!pip install gradio
!pip install --upgrade datasets



In [2]:
from datasets import load_dataset
import pandas as pd
from torch.utils.data import Dataset
from transformers import T5Tokenizer
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration
from torch.optim import AdamW
import gradio as gr


In [None]:
dataset = load_dataset("gsm8k", "main")

# Convert to pandas DataFrames
df_train = pd.DataFrame(dataset['train'])
df_val = pd.DataFrame(dataset['test'])

# Rename columns and add a prompt prefix
df_train = df_train.rename(columns={'question': 'source_text', 'answer': 'target_text'})
df_val = df_val.rename(columns={'question': 'source_text', 'answer': 'target_text'})
df_train['source_text'] = 'solve: ' + df_train['source_text']
df_val['source_text'] = 'solve: ' + df_val['source_text']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [4]:
class MathDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, target_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.target_len = target_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        source_text = self.data.iloc[index]['source_text']
        target_text = self.data.iloc[index]['target_text']

        source = self.tokenizer.encode_plus(
            source_text,
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        target = self.tokenizer.encode_plus(
            target_text,
            max_length=self.target_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': target['input_ids'].squeeze()
        }


In [5]:
# Parameters
SOURCE_LEN = 256
TARGET_LEN = 256
BATCH_SIZE = 8
EPOCHS = 20
LEARNING_RATE = 5e-5

# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Prepare datasets and dataloaders
train_dataset = MathDataset(df_train, tokenizer, SOURCE_LEN, TARGET_LEN)
val_dataset = MathDataset(df_val, tokenizer, SOURCE_LEN, TARGET_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained('./t5_gsm8k_model')
tokenizer.save_pretrained('./t5_gsm8k_model')



tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/20, Loss: 1.4317
Epoch 2/20, Loss: 0.8418
Epoch 3/20, Loss: 0.7575
Epoch 4/20, Loss: 0.7075
Epoch 5/20, Loss: 0.6733
Epoch 6/20, Loss: 0.6457
Epoch 7/20, Loss: 0.6239
Epoch 8/20, Loss: 0.6046
Epoch 9/20, Loss: 0.5874
Epoch 10/20, Loss: 0.5734
Epoch 11/20, Loss: 0.5597
Epoch 12/20, Loss: 0.5472
Epoch 13/20, Loss: 0.5368
Epoch 14/20, Loss: 0.5275
Epoch 15/20, Loss: 0.5168
Epoch 16/20, Loss: 0.5086
Epoch 17/20, Loss: 0.4998
Epoch 18/20, Loss: 0.4920
Epoch 19/20, Loss: 0.4852
Epoch 20/20, Loss: 0.4784


('./t5_gsm8k_model/tokenizer_config.json',
 './t5_gsm8k_model/special_tokens_map.json',
 './t5_gsm8k_model/spiece.model',
 './t5_gsm8k_model/added_tokens.json')

In [6]:
!zip -r t5_gsm8k_model.zip t5_gsm8k_model

  adding: t5_gsm8k_model/ (stored 0%)
  adding: t5_gsm8k_model/generation_config.json (deflated 29%)
  adding: t5_gsm8k_model/spiece.model (deflated 48%)
  adding: t5_gsm8k_model/special_tokens_map.json (deflated 85%)
  adding: t5_gsm8k_model/tokenizer_config.json (deflated 94%)
  adding: t5_gsm8k_model/model.safetensors (deflated 8%)
  adding: t5_gsm8k_model/config.json (deflated 63%)
  adding: t5_gsm8k_model/added_tokens.json (deflated 83%)
