In [1]:
import pandas as pd
import torch
import warnings
warnings.filterwarnings('ignore')

from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

## Training.

In [None]:
dataset = load_dataset("Kaludi/Customer-Support-Responses")

In [None]:
# Concatenate query and response for fine-tuning
def concatenate_examples(examples):
    return {'text': [q + " " + r for q, r in zip(examples['query'], examples['response'])]}

# Apply the concatenation function to the dataset
dataset = dataset.map(concatenate_examples, batched=True, remove_columns=["query", "response"])

In [None]:
# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2LMHeadModel.from_pretrained('gpt2')

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)


# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=0.01
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
)

In [None]:
# Save the model
trainer.save_model("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2-tokenizer")

('./fine-tuned-gpt2-tokenizer/tokenizer_config.json',
 './fine-tuned-gpt2-tokenizer/special_tokens_map.json',
 './fine-tuned-gpt2-tokenizer/vocab.json',
 './fine-tuned-gpt2-tokenizer/merges.txt',
 './fine-tuned-gpt2-tokenizer/added_tokens.json')

## Evaluating.

In [2]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./fine-tuned-gpt2")

tokenizer = GPT2Tokenizer.from_pretrained("./fine-tuned-gpt2-tokenizer")
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})



In [3]:
# Load the evaluation dataset
eval_data = [
    {"query": "My order hasn't arrived yet.", "response": "We apologize for the inconvenience. Can you please provide your order number?"},
    {"query":"I need to return an item.","response":"Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed."},
    {"query":"I want to change my shipping address.","response":"No problem. Can you please provide your order number and the new shipping address you'd like to use?"},  
]

# Function to generate responses
def generate_response(query):
    inputs = tokenizer.encode(query, return_tensors="pt")
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Calculate Perplexity
def calculate_perplexity(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
    return torch.exp(loss)

# Initialize metrics
bleu_scores = []
rouge_scorer_1 = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Evaluate the model
for example in eval_data:
    query = example["query"]
    reference_response = example["response"]
    generated_response = generate_response(query)
    
    # Calculate Perplexity
    perplexity = calculate_perplexity(model, tokenizer, generated_response)
    
    # Calculate BLEU Score
    reference_tokens = reference_response.split()
    generated_tokens = generated_response.split()
    bleu_score = sentence_bleu([reference_tokens], generated_tokens)
    bleu_scores.append(bleu_score)
    
    # Calculate ROUGE Score
    rouge_score = rouge_scorer_1.score(reference_response, generated_response)
    
    print(f"Query: {query}")
    print(f"Reference Response: {reference_response}")
    print(f"Generated Response: {generated_response}")
    print(f"Perplexity: {perplexity}")
    print(f"BLEU Score: {bleu_score}")
    print(f"ROUGE Score: {rouge_score}")
    print("-" * 50)

# Calculate average BLEU Score
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Query: My order hasn't arrived yet.
Reference Response: We apologize for the inconvenience. Can you please provide your order number?
Generated Response: My order hasn't arrived yet. I'm not sure if it's because I'm not sure what to do with it, or if it's just a little too much for me. I'm not sure if it's because I'm not sure what
Perplexity: 4.518495559692383
BLEU Score: 8.669612184277444e-232
ROUGE Score: {'rouge1': Score(precision=0.0425531914893617, recall=0.16666666666666666, fmeasure=0.06779661016949153), 'rougeL': Score(precision=0.02127659574468085, recall=0.08333333333333333, fmeasure=0.03389830508474576)}
--------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Query: I need to return an item.
Reference Response: Certainly. Please provide your order number and reason for return, and we will provide you with instructions on how to proceed.
Generated Response: I need to return an item.

If you have a problem with the item, please contact us.

If you have a problem with the item, please contact us.

If you have a problem with the item, please contact
Perplexity: 3.548099994659424
BLEU Score: 9.65701126654974e-232
ROUGE Score: {'rouge1': Score(precision=0.13157894736842105, recall=0.23809523809523808, fmeasure=0.1694915254237288), 'rougeL': Score(precision=0.07894736842105263, recall=0.14285714285714285, fmeasure=0.1016949152542373)}
--------------------------------------------------
Query: I want to change my shipping address.
Reference Response: No problem. Can you please provide your order number and the new shipping address you'd like to use?
Generated Response: I want to change my shipping address. I want to change my shipping address.

I wa

**<u>Observations:</u>**

1. Based on the metrics on few data points, it can be seen that GPT-2 is not peforming well.

**<u>Next steps to improve the performance:</u>**

1. Increase dataset size either by generating synthetic data or getting more data.

2. Using a bigger model or different model.