In [None]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example     of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""

result = question_answerer(question="What is a good example of a question answering dataset?",     context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")




Device set to use cpu


Answer: 'SQuAD dataset', score: 0.5152, start: 151, end: 164


In [2]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install peft
!pip install accelerate
!pip install bitsandbytes

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [15]:
from datasets import load_dataset

ds = load_dataset("bitext/Bitext-retail-banking-llm-chatbot-training-dataset")

In [16]:
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

In [74]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = ds["train"].to_pandas()

train_df, test_df = train_test_split(df, test_size=0.5, stratify=df["intent"])

from datasets import Dataset, DatasetDict
ds_dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df[:1000]),
    "test": Dataset.from_pandas(test_df[:1000])
})


print(ds_dataset)

DatasetDict({
    train: Dataset({
        features: ['tags', 'instruction', 'category', 'intent', 'response', '__index_level_0__'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['tags', 'instruction', 'category', 'intent', 'response', '__index_level_0__'],
        num_rows: 1000
    })
})


In [75]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5 small model and tokenizer
model_name = "t5-small"  # You can choose other sizes like t5-base, t5-large, etc.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Resize the tokenizer if necessary (in case you added special tokens)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

Embedding(32100, 512)

In [76]:
from peft import get_peft_model, LoraConfig
from transformers import T5ForConditionalGeneration

# Define LoRA configuration for T5, targeting all layers
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1)

# Convert T5 to a PEFT model with LoRA
lora_model = get_peft_model(model, lora_config)

In [77]:
# Sample a smaller subset of the dataset (for example, 10% of the original dataset)

# Tokenize function
def tokenize_function(examples):
    inputs = [instruction + " </s> " + response for instruction, response in zip(examples["instruction"], examples["response"])]
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)

    # Create labels
    labels = model_inputs["input_ids"].copy()
    labels = [[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels]
    model_inputs["labels"] = labels

    return model_inputs

# Apply the tokenization to both 'train' and 'test' datasets
ds_dataset = ds_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [78]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=lora_model)

In [79]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="./t5-lora-finetuned",  # Directory to save model checkpoints
    num_train_epochs=3,               # Number of epochs to train
    per_device_train_batch_size=2,    # Batch size for training
    per_device_eval_batch_size=2,     # Batch size for evaluation
    weight_decay=0.01,                # Apply weight decay for regularization
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    logging_dir="./logs",  # Where logs are stored
    logging_strategy="epoch",  # Log after every epoch
    save_strategy="epoch",  # Save checkpoints after every epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=lora_model,                   # Use the LoRA-adapted model
    args=training_args,                 # Training arguments
    train_dataset=ds_dataset['train'],        # Training dataset
    tokenizer=tokenizer,                # Tokenizer for the model
    eval_dataset=ds_dataset['test'],
    data_collator=data_collator,
)

  trainer = Trainer(


In [None]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


In [73]:
# Evaluate the model after training
evaluation_results = trainer.evaluate()
print(evaluation_results)

{'eval_runtime': 19.5662, 'eval_samples_per_second': 0.511, 'eval_steps_per_second': 0.256, 'epoch': 3.0}
