In [4]:
from transformers import pipeline
question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example     of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""

result = question_answerer(question="What is a good example of a question answering dataset?",     context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")




Device set to use cpu


Answer: 'SQuAD dataset', score: 0.5152, start: 151, end: 164


In [5]:
from datasets import load_dataset

ds = load_dataset("bitext/Bitext-retail-banking-llm-chatbot-training-dataset")

README.md:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)ing-llm-chatbot-training-dataset.parquet:   0%|          | 0.00/7.87M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25545 [00:00<?, ? examples/s]

In [20]:
ds['train']

Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response'],
    num_rows: 25545
})

In [78]:
from sklearn.model_selection import train_test_split
import pandas as pd

df = ds["train"].to_pandas()

train_df, test_df = train_test_split(df, test_size=0.1, stratify=df["intent"])

from datasets import Dataset
ds_dataset = {
    "train": Dataset.from_pandas(train_df),
    "test": Dataset.from_pandas(test_df)
}

print(ds_dataset)

{'train': Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response', '__index_level_0__'],
    num_rows: 22990
}), 'test': Dataset({
    features: ['tags', 'instruction', 'category', 'intent', 'response', '__index_level_0__'],
    num_rows: 2555
})}


In [79]:
set(ds_dataset['train']['intent'])

{'activate_card',
 'activate_card_international_usage',
 'apply_for_loan',
 'apply_for_mortgage',
 'block_card',
 'cancel_card',
 'cancel_loan',
 'cancel_mortgage',
 'cancel_transfer',
 'check_card_annual_fee',
 'check_current_balance_on_card',
 'check_fees',
 'check_loan_payments',
 'check_mortgage_payments',
 'check_recent_transactions',
 'close_account',
 'create_account',
 'customer_service',
 'dispute_ATM_withdrawal',
 'find_ATM',
 'find_branch',
 'get_password',
 'human_agent',
 'make_transfer',
 'recover_swallowed_card',
 'set_up_password'}

In [80]:
set(ds_dataset['test']['intent'])

{'activate_card',
 'activate_card_international_usage',
 'apply_for_loan',
 'apply_for_mortgage',
 'block_card',
 'cancel_card',
 'cancel_loan',
 'cancel_mortgage',
 'cancel_transfer',
 'check_card_annual_fee',
 'check_current_balance_on_card',
 'check_fees',
 'check_loan_payments',
 'check_mortgage_payments',
 'check_recent_transactions',
 'close_account',
 'create_account',
 'customer_service',
 'dispute_ATM_withdrawal',
 'find_ATM',
 'find_branch',
 'get_password',
 'human_agent',
 'make_transfer',
 'recover_swallowed_card',
 'set_up_password'}

In [83]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # You can choose smaller or larger versions of GPT-2
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Resize the tokenizer if necessary (in case you added special tokens)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [86]:
from peft import get_peft_model, LoraConfig

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,          # Rank for LoRA layers
    lora_alpha=32, # Scaling factor for LoRA
    lora_dropout=0.1,  # Dropout for LoRA layers
    target_modules=["c_proj"],  # These are the attention layers to apply LoRA to
)

# Convert GPT-2 to a PEFT model with LoRA
lora_model = get_peft_model(model, lora_config)



In [96]:
def tokenize_function(examples):
    # Concatenate 'instruction' and 'response' with a separator
    inputs = [instruction + " </s> " + response for instruction, response in zip(examples["instruction"], examples["response"])]
    
    # Tokenize the concatenated input-response pair
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True)

    # Create labels by shifting input_ids (set labels to input_ids, same as GPT-2 autoregressive)
    # This is done by copying the 'input_ids' and assigning them as the 'labels' for the model
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # Use the same input_ids as labels

    return model_inputs

# Apply the tokenization to both 'train' and 'test' datasets
train_dataset = ds_dataset['train'].map(tokenize_function, batched=True)
test_dataset = ds_dataset['test'].map(tokenize_function, batched=True)

Map:   0%|          | 0/22990 [00:00<?, ? examples/s]

Map:   0%|          | 0/2555 [00:00<?, ? examples/s]

In [97]:
from transformers import Trainer, TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-lora-finetuned",  # Directory to save model checkpoints
    eval_strategy="steps",         # When to evaluate the model
    num_train_epochs=3,                 # Number of epochs to train
    per_device_train_batch_size=2,      # Batch size for training
    save_steps=10_000,                  # Save checkpoint every 10,000 steps
    save_total_limit=2,                 # Keep only the last 2 saved models
    logging_dir="./logs",               # Directory to save logs
    logging_steps=500,                  # Log every 500 steps
    weight_decay=0.01,                  # Apply weight decay for regularization
    warmup_steps=2000,                  # Number of steps to perform learning rate warmup
    learning_rate=5e-5,                 # Learning rate
)

# Initialize the Trainer
trainer = Trainer(
    model=lora_model,                   # Use the LoRA-adapted model
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,        # Training dataset
    processing_class=tokenizer,                # Tokenizer for the model
    eval_dataset=test_dataset
)

In [98]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [1]:
from datasets import load_dataset

# Load the BoolQ dataset
dataset = load_dataset("boolq")

# Display the first few rows of the dataset
print(dataset['train'].to_pandas().head())

README.md:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/3.69M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9427 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3270 [00:00<?, ? examples/s]

                                            question  answer  \
0    do iran and afghanistan speak the same language    True   
1  do good samaritan laws protect those who help ...    True   
2  is windows movie maker part of windows essentials    True   
3  is confectionary sugar the same as powdered sugar    True   
4         is elder scrolls online the same as skyrim   False   

                                             passage  
0  Persian (/ˈpɜːrʒən, -ʃən/), also known by its ...  
1  Good Samaritan laws offer legal protection to ...  
2  Windows Movie Maker (formerly known as Windows...  
3  Powdered sugar, also called confectioners' sug...  
4  As with other games in The Elder Scrolls serie...  
