In [None]:
!pip install transformers datasets torch
!pip install peft==0.3.0


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# Load the T5-small model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load your dataset (replace with your own dataset or use a Hugging Face dataset)
# `scientific_papers` ships a custom loading script. Without
# trust_remote_code=True, `load_dataset` stops at an interactive [y/N] prompt,
# which blocks "Restart & Run All". Only enable this for dataset repositories
# whose loading script you have inspected.
dataset = load_dataset(
    "scientific_papers",
    "arxiv",
    split="train[:1%]",  # Use a subset for quick testing
    trust_remote_code=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


README.md:   0%|          | 0.00/8.27k [00:00<?, ?B/s]

scientific_papers.py:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

The repository for scientific_papers contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/scientific_papers.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6436 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6440 [00:00<?, ? examples/s]

In [None]:
def preprocess_data(example, max_input_length=1024, max_target_length=256):
    """Tokenize one example into encoder inputs and labels for summarization.

    Uses the notebook-level `tokenizer`.

    Args:
        example: Mapping with an 'article' string (source text) and an
            'abstract' string (target summary).
        max_input_length: Length the prefixed article is truncated/padded to.
        max_target_length: Length the abstract is truncated/padded to.

    Returns:
        The tokenizer output for the prefixed article with a 'labels' entry
        added: the abstract's token ids, with every padding position set
        to -100.
    """
    # Concatenate 'summarize: ' prefix to the input text
    input_text = "summarize: " + example['article']
    target_text = example['abstract']  # target summary text

    # Tokenize inputs and labels
    inputs = tokenizer(input_text, max_length=max_input_length, truncation=True, padding="max_length")
    targets = tokenizer(target_text, max_length=max_target_length, truncation=True, padding="max_length")

    # -100 is the index the cross-entropy loss ignores. Without this mapping the
    # model is also trained to predict the padding that fills the abstract up
    # to max_target_length, which dominates the loss for short abstracts.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        -100 if token_id == pad_id else token_id
        for token_id in targets['input_ids']
    ]
    return inputs

# Run preprocess_data over every example and drop the raw text columns it
# consumed, keeping the tokenized fields it returns.
tokenized_dataset = dataset.map(
    preprocess_data,
    remove_columns=["article", "abstract"],
)

Map:   0%|          | 0/2030 [00:00<?, ? examples/s]

In [None]:
# peft is installed (pinned) by the first cell of this notebook, so it is not
# reinstalled here.

from transformers import Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType  # Import PEFT library for LoRA
import os

# Disable wandb if you don’t need logging
os.environ["WANDB_MODE"] = "disabled"

# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # T5 is an encoder-decoder (seq2seq) model
    r=8,  # rank
    lora_alpha=32,  # scaling factor
    lora_dropout=0.1,  # dropout for LoRA layers
    bias="none"  # use "none" for no additional bias parameters
)

# Apply LoRA to your model.
# This rebinds `model`: re-running this cell would wrap the already-wrapped
# model again, so re-run the model-loading cell first.
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",           # Evaluate only at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # Increased batch size
    num_train_epochs=2,              # Reduced number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,                # Reduced logging frequency
    fp16=True,                       # Mixed precision for faster training
    # Name the label column explicitly. Trainer normally infers it from the
    # model's forward signature, which the PEFT wrapper can hide; when no label
    # names are found, evaluation reports no loss ("No log").
    label_names=["labels"],
)
# NOTE(review): T5 checkpoints are reported to be numerically fragile under
# fp16; if the loss becomes NaN, try bf16 or full precision - confirm on your GPU.

# Use the preprocessed dataset (tokenized_dataset) instead of the original dataset.
# Split exactly once with a fixed seed: two separate unseeded calls produce two
# different random partitions, so "test" examples could also sit in "train".
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
eval_dataset = split['test']

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset  # Pass the eval_dataset
)

# Start training
trainer.train()




  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,6.2748,No log
2,4.8543,No log


TrainOutput(global_step=458, training_loss=6.0769510977132875, metrics={'train_runtime': 225.3316, 'train_samples_per_second': 16.216, 'train_steps_per_second': 2.033, 'total_flos': 995698711461888.0, 'train_loss': 6.0769510977132875, 'epoch': 2.0})

In [None]:
from google.colab import drive
drive.mount('/content/drive')

model.save_pretrained("./fine_tuned_t5_summarizer")
tokenizer.save_pretrained("./fine_tuned_t5_tockenizer")
!cp -r ./fine_tuned_t5_summarizer /content/drive/MyDrive/
!cp -r ./fine_tuned_t5_tockenizer /content/drive/MyDrive/


Mounted at /content/drive


In [None]:
import os

# Relative paths such as "./results" are resolved against this directory.
working_dir = os.getcwd()
working_dir

In [None]:
# Destination under the Drive mount point; requires drive.mount('/content/drive')
# to have run in this session.
# NOTE(review): an earlier cell copies to '/content/drive/MyDrive/' (no space) -
# confirm both spellings point at the same Drive folder.
save_path = '/content/drive/My Drive/fine_tuned_t5_summarizer'

# Save the fine-tuned model and its tokenizer side by side in save_path.
model.save_pretrained(save_path)
# Kept as the last expression, so its return value is what the cell displays.
tokenizer.save_pretrained(save_path)

In [None]:
# List the contents of the current working directory (debugging aid).
!ls

sample_data


In [None]:
# NOTE(review): Drive is already mounted by the earlier save/copy cell, so this
# repeat is only needed after a runtime restart. The cells that write to
# /content/drive must run after a mount, not before it.
from google.colab import drive
drive.mount('/content/drive')