In [9]:
import torch
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    logging,
)
from torchinfo import summary
import json

from collections import Counter
from transformers import logging as transformers_logging

from datasets import load_dataset
from save_results import SaveResults
from read_config import get_args
from utils import set_seed
import warnings
import logging
import random
import numpy as np
import transformers
import matplotlib.pyplot as plt

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on {device}")

Training on cuda


In [3]:
warnings.filterwarnings("ignore", category=FutureWarning, module="huggingface_hub.file_download")
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
transformers_logging.set_verbosity_error()

In [4]:
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
transformers.set_seed(seed)

In [36]:
# Load the dataset
dataset = load_dataset("multi_woz_v22", trust_remote_code=True)

In [37]:
dataset 

DatasetDict({
    train: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 8437
    })
    validation: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialogue_id', 'services', 'turns'],
        num_rows: 1000
    })
})

In [137]:
def extract_singleturn(dialogue):
    dialogues=[]
    for i in range(0, len(dialogue["utterance"]) - 1, 2):  # Each user-bot exchange is 2 turns
        if dialogue["speaker"][i] == 0 and dialogue["speaker"][i + 1] == 1:
            user_utterance = dialogue["utterance"][i].strip()
            bot_response = dialogue["utterance"][i + 1].strip()
            single_turn = f"<|user|> {user_utterance} <|system|> {bot_response} <|endoftext|>"
            dialogues.append(single_turn)
    return dialogues

def extract_multiturn(dialogue,n_turn=4):
    context = ""  # Persistent context across turns
    for id, speaker, utterance in zip(dialogue['turn_id'],dialogue["speaker"], dialogue["utterance"]):
        if int(id) < n_turn :
            if speaker == 0:  # User input
                context += f"<|user|> {utterance.strip()} "
            elif speaker == 1:  # System response
                if context.strip():  # Ensure there's context to process
                    context += f"<|system|> {utterance.strip()} <|endoftext|>"  # Append system response to context Add endo of text token
                    
    return context

def extract_text_segment(tokens, max_length, n, tokenizer):
    start_idx = (n - 1) * max_length
    end_idx = start_idx + max_length

    segment_tokens = tokens[start_idx:end_idx]

    # Decode back to text
    segment_text = tokenizer.decode(segment_tokens, skip_special_tokens=True)

    # Pad if necessary
    if len(segment_tokens) < max_length:
        pad_length = max_length - len(segment_tokens)
        pad_token_id = tokenizer.convert_tokens_to_ids('<|pad|>')
        segment_tokens.extend([pad_token_id] * pad_length)
        segment_text = tokenizer.decode(segment_tokens, skip_special_tokens=True)
    return segment_text




In [139]:
# Example usage
file_dir = 'Data/samuel_data.txt'
with open(file_dir, 'r', encoding='utf-8') as file:
    full_text = file.read()
# Tokenize the text once
tokens = tokenizer(full_text)['input_ids']

extracted_text = extract_text_segment(tokens, 128,5, tokenizer)
print(extracted_text)

 you don't fill it in, you are not allowed to subsequently complain about the schedule. You may notice I've sent this to everyone, so you can see who else is attending, which may help you answer the question about sharing rooms. The deadline is 23:59 Sunday 23rd June as we'll work out the programme the following week. I will send a reminder nearer the time. Any questions, let me know. Looking forward to seeing you in August! Samuel Hi all One of the things I suppose needs to happen before I leave is that a(n un)willing successor takes over the group Twitter account, which I have


In [142]:
def preprocess_data_balanced(examples, max_context_exchanges=12):

    dialogues = []
    block=1
    # Read the text file once
    with open(file_dir, 'r', encoding='utf-8') as file:
        full_text = file.read()

    # Tokenize the text once
    tokens = tokenizer(full_text)['input_ids']
    for dialogue in examples['turns']:
        single_list=extract_singleturn(dialogue)
        dialogues.extend(single_list)
        if block%2==0:
            multi_list=extract_multiturn(dialogue,n_turn=4)
            dialogues.append(multi_list)
            sam_text = extract_text_segment(tokens, 128, block, tokenizer)
            dialogues.append(sam_text)
        if block%3==0:
            multi_list=extract_multiturn(dialogue,n_turn=8)
            dialogues.append(multi_list)
            
        block+=1
   

    return dialogues,block

In [143]:
example= dataset['test']
preprocess_data_balanced(example)

(['<|user|> I need train reservations from norwich to cambridge <|system|> I have 133 trains matching your request. Is there a specific day and time you would like to travel? <|endoftext|>',
  "<|user|> I'd like to leave on Monday and arrive by 18:00. <|system|> There are 12 trains for the day and time you request. Would you like to book it now? <|endoftext|>",
  '<|user|> Before booking, I would also like to know the travel time, price, and departure time please. <|system|> There are 12 trains meeting your needs with the first leaving at 05:16 and the last one leaving at 16:16. Do you want to book one of these? <|endoftext|>',
  '<|user|> No hold off on booking for now. Can you help me find an attraction called cineworld cinema? <|system|> Yes it is a cinema located in the south part of town what information would you like on it? <|endoftext|>',
  '<|user|> Yes, that was all I needed. Thank you very much! <|system|> Thank you for using our system. <|endoftext|>',
  '<|user|> Hello, I 

In [77]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {'additional_special_tokens': ['<|user|>', '<|system|>'], 'pad_token': '<|pad|>'}
tokenizer.add_special_tokens(special_tokens_dict)

3

In [None]:
examples = dataset['train']

dialogues = []
for dialogue in examples['turns']:
    dialogue_text = ""
    for turn_id, speaker, utterance in zip(dialogue["turn_id"], dialogue["speaker"], dialogue["utterance"]):
        if speaker == 0:  # User input
            dialogue_text += f"<|user|> {utterance.strip()} "
        elif speaker == 1:  # System response
            dialogue_text += f"<|system|> {utterance.strip()} "
    dialogues.append(dialogue_text + "<|endoftext|>")
print("The number of dialogues in the dataset: ", len(dialogues))

In [None]:
def plot_token_histogram(dialogues, tokenizer, num_bins=10):

    # Tokenizing the dialogues and counting tokens in each dialogue
    num_tokens_per_dialogue = [len(tokenizer.tokenize(dialogue)) for dialogue in dialogues]

    # Create histogram bins
    token_min = min(num_tokens_per_dialogue)
    token_max = max(num_tokens_per_dialogue)
    bins = np.linspace(token_min, token_max, num_bins + 1)  # Create `num_bins` equally spaced bins

    # Count occurrences of each token count
    token_counts, _ = np.histogram(num_tokens_per_dialogue, bins=bins)

    # Count occurrences of [USER] in each dialogue
    num_user_tokens_per_dialogue = [dialogue.count('<|user|>') for dialogue in dialogues]
    user_token_counts = Counter(num_user_tokens_per_dialogue)

    # Create a figure with two subplots
    fig, axs = plt.subplots(1, 2, figsize=(12, 5))

    # Plot the histogram of token counts using bins
    axs[0].bar(bins[:-1], token_counts, width=np.diff(bins), align='edge')  # Plot with bins
    axs[0].set_xlabel('Number of Tokens')
    axs[0].set_ylabel('Number of Dialogues')
    axs[0].set_title('Histogram of Number of Tokens per Dialogue')

    # Plot the histogram of [USER] token counts
    axs[1].bar(user_token_counts.keys(), user_token_counts.values())
    axs[1].set_xlabel('Number of <|user|> Tokens')
    axs[1].set_ylabel('Number of Dialogues')
    axs[1].set_title('Histogram of Number of [USER] Tokens per Dialogue')

    # Adjust layout to prevent overlapping
    plt.tight_layout()

    # Show the plot
    plt.show()

plot_token_histogram(dialogues, tokenizer, num_bins=100)

Prepare the data in a dialogue format with maximum 512 tokens per conversation (this number may be increased to allow longer conversations) and maximum 12 user-bot exchanges per conversation. 

Terminology:
- 1 turn = 1 message (either from the user or the bot)
- 1 user-bot exchange = 2-turn conversation (1 message from the user followed by 1 message from the bot)

In [None]:
def preprocess_data_balanced(examples, max_context_exchanges=12):
    """
    This function processes dialogues and generates a balanced mix of single-turn and multi-turn conversations.
    Handles up to 12 user-bot exchanges (24 turns) in conversations within 512 tokens, ensuring efficient use of context.

    Args:
    - examples: The dataset examples containing dialogues.
    - max_context_exchanges: Maximum number of user-bot exchanges to include in the sliding window (12 user-bot exchanges).

    Returns:
    - tokenized_inputs: Tokenized inputs for training the model.
    """
    dialogues = []

    for dialogue in examples['turns']:
        num_turns = len(dialogue["utterance"])

        # Step 1: Single-turn dialogue (focus on concise bot responses, 1 user-bot exchange = 2 turns)
        for i in range(0, num_turns - 1, 2):  # Each user-bot exchange is 2 turns
            if dialogue["speaker"][i] == 0 and dialogue["speaker"][i + 1] == 1:
                user_utterance = dialogue["utterance"][i].strip()
                bot_response = dialogue["utterance"][i + 1].strip()
                single_turn = f"<|user|> {user_utterance} <|system|> {bot_response} <|endoftext|>"
                dialogues.append(single_turn)

        # Step 2: Multi-turn conversations (sliding window approach with only 1 user-bot exchange overlap)
        # Select windows starting with at least 2 user-bot exchanges (4 turns) up to max_context_exchanges
        for num_exchanges in range(2, min(max_context_exchanges + 1, num_turns // 2 + 1)):  # Sliding window in terms of user-bot exchanges
            for start_turn_index in range(0, num_turns - (num_exchanges * 2) + 1, 2 * (num_exchanges - 1)):  # Ensure valid window sizes
                dialogue_text = ""
                for exchange_index in range(num_exchanges):  # Each iteration captures 1 user-bot exchange (2 turns)
                    user_utterance = dialogue["utterance"][start_turn_index + 2 * exchange_index].strip()
                    bot_response = dialogue["utterance"][start_turn_index + 2 * exchange_index + 1].strip()
                    dialogue_text += f"<|user|> {user_utterance} <|system|> {bot_response} "
                dialogues.append(dialogue_text + "<|endoftext|>")

    # Tokenize the combined dialogue list
    tokenized_inputs = tokenizer(dialogues, padding="max_length", truncation=True, max_length=512)  # Max length set to 512
    
    # Add labels as a copy of input_ids
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()

    return tokenized_inputs

In [None]:
train_data = dataset['train'].map(lambda x: preprocess_data_balanced(x), batched=True, remove_columns=['dialogue_id', 'services', 'turns'])
val_data = dataset['validation'].map(lambda x: preprocess_data_balanced(x), batched=True, remove_columns=['dialogue_id', 'services', 'turns'])

train_data = train_data.shuffle(seed=42)
val_data = val_data.shuffle(seed=42)

In [None]:
train_data

In [None]:
print("There are", train_data.num_rows, "conversations in total.")

Detokenize a few examples from the tokenized train_data:

In [None]:
for i in range(2):  # Adjust the range if you want more or fewer examples
    input_ids = train_data[i]['input_ids']
    
    # Filter out the padding tokens manually
    input_ids_no_pad = [token_id for token_id in input_ids if token_id != tokenizer.pad_token_id]
    
    # Detokenize the sequence without the padding tokens
    detokenized_sentence = tokenizer.decode(input_ids_no_pad, skip_special_tokens=False)
    
    print(f"Example {i + 1}: {detokenized_sentence}")

# The GPT model

In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

In [None]:
summary(model, input_data=torch.zeros((1, 512), dtype=torch.long), device='cpu')

In [None]:
training_args = TrainingArguments(
    output_dir="ensemble_results/MyGPT2chat",  # Directory to save the model checkpoints and other outputs.
    max_steps=1000,  # Total number of training steps. The model will stop training once this number is reached.
    optim="adamw_torch",  # Optimizer to use during training. 'adamw_torch' refers to AdamW implemented in PyTorch.
    learning_rate=5e-5,  # Learning rate used for the optimizer, which controls how much to adjust the weights with respect to the gradient.
    weight_decay=0.01,  # Weight decay (L2 regularization) to prevent overfitting by penalizing large weights.
    per_device_train_batch_size=16,  # Number of samples per batch for training on each device (e.g., GPU).
    per_device_eval_batch_size=16,  # Number of samples per batch for evaluation on each device.
    gradient_accumulation_steps=4,  # Number of steps to accumulate gradients before updating model weights, allowing larger effective batch sizes.
    gradient_checkpointing=True,  # Save memory by checkpointing gradients, which trades compute time for memory.
    warmup_steps=100,  # Number of warmup steps during which the learning rate linearly increases from 0 to the specified value.
    lr_scheduler_type="linear",  # Learning rate schedule, with 'linear' meaning it decreases linearly after the warmup phase.
    evaluation_strategy="steps",  # Perform evaluation at regular steps, as opposed to other strategies like 'epoch'.
    eval_steps=50,  # Number of training steps between evaluations (to check performance on the validation set).
    logging_steps=50,  # Number of steps between logging events, used to monitor training progress.
    log_level="info",  # The verbosity of logging, 'passive' logging will only occur if you manually enable it.
    save_steps=100,  # Number of steps between saving model checkpoints.
    save_total_limit=2,  # Maximum number of model checkpoints to keep. Older checkpoints will be deleted when this limit is exceeded.
    disable_tqdm=False,  # Whether or not to disable the progress bar ('tqdm'). False means the progress bar will be displayed.
    report_to="none",  # This ensures no reporting to any integrations
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data)

In [None]:
train_now = "TRUE"

In [None]:
if train_now == "TRUE":
    trainer.train()
    model.save_pretrained("ensemble_results/MyGPT2chat")
    tokenizer.save_pretrained("ensemble_results/MyGPT2chat")
    log_history = trainer.state.log_history
    with open("ensemble_results/MyGPT2chat/"+'log_history.json', 'w') as f: json.dump(log_history, f)
else:
    with open("ensemble_results/MyGPT2chat/"+'log_history.json', 'r') as f: log_history = json.load(f) 

In [None]:
steps = sorted(set(log['step'] for log in log_history if 'step' in log))
losses = [log['loss'] for log in log_history if 'loss' in log]
val_losses = [log['eval_loss'] for log in log_history if 'eval_loss' in log]

plt.figure(figsize=(12, 6))
plt.plot(steps, losses, label='Training Loss')
plt.plot(steps, val_losses, label='Validation Loss')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Loss over Steps')
plt.savefig("ensemble_results/MyGPT2chat/loss_curves.png", format='png')
plt.show()