In [1]:
# Step 0: Install dependencies
!pip install transformers datasets torch

# Step 0.5: Import required libraries
from google.colab import drive
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
import torch
import os

# Step 0.75: Mount Google Drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
# Step 1: Load Dataset
dataset_name = "charles828/vertex-ai-customer-support-training-dataset"
dataset = load_dataset(dataset_name)

# Show Dataset
print("Dataset:")
dataset
print("First Line of Train Split:")
print(dataset['train'][0])
print("First Line of Test Split:")
print(dataset['test'][0])

README.md:   0%|          | 0.00/582 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24184 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2688 [00:00<?, ? examples/s]

Dataset:
First Line of Train Split:
{'flags': 'BL', 'instruction': 'I need support modifying the address', 'category': 'SHIPPING', 'intent': 'change_shipping_address', 'response': 'I apologize for the inconvenience caused by the error in modifying your address. To assist you further with modifying your address, please provide me with the correct address details and I will ensure it is updated in our system.', 'text': '### Human: I need support modifying the address### Assistant: I apologize for the inconvenience caused by the error in modifying your address. To assist you further with modifying your address, please provide me with the correct address details and I will ensure it is updated in our system.'}
First Line of Test Split:
{'flags': 'BILMP', 'instruction': 'could you help me to check what delivery methods I have?', 'category': 'DELIVERY', 'intent': 'delivery_options', 'response': 'I\'ll do my best! I\'m here to assist you in discovering the delivery methods available to you. L

In [3]:
# Step 2: Define model_name for flexibility
model_name = "gpt2"  # Replace 'gpt2' with your desired model name

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True
)

# Fix the padding token issue
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token to the end-of-sequence token
model.resize_token_embeddings(len(tokenizer))  # Resize token embeddings
model.to("cuda" if torch.cuda.is_available() else "cpu")  # Move to GPU if available


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
# Step 3: Preprocess Dataset
def preprocess_function(examples, history_len=2):
    processed_examples = {"input_ids": [], "labels": []}

    for conversation in examples["text"]:
        turns = conversation.split("###")  # Split by speaker turns

        # Create context by combining the last `history_len` exchanges
        history = []
        for turn in turns:
            if turn.strip().startswith("Human:") or turn.strip().startswith("Assistant:"):
                history.append(turn.strip())

        # Generate training samples with context
        for i in range(len(history)):
            if history[i].startswith("Assistant:"):  # Only train the model to generate assistant replies
                # Combine the last `history_len` exchanges as input
                context = " ".join(history[max(0, i - history_len):i])
                input_text = f"{context} ### {history[i]}"
                input_ids = tokenizer(input_text, truncation=True, padding="max_length", max_length=512)["input_ids"]
                processed_examples["input_ids"].append(input_ids)
                processed_examples["labels"].append(input_ids)  # Labels are the same as input_ids for causal LM

    return processed_examples

# Tokenize the dataset with the preprocess function
tokenized_dataset = dataset.map(
    lambda examples: preprocess_function(examples),
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/24184 [00:00<?, ? examples/s]

Map:   0%|          | 0/2688 [00:00<?, ? examples/s]

In [5]:
# Step 4: Split into train and test datasets
train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [6]:
# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/chatbot_model",
    save_strategy="steps",
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    logging_dir="/content/drive/MyDrive/logs",
    logging_steps=100,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    report_to="none"
)

In [7]:
# Step 6: Define Callbacks
class SaveCheckpointWithLMHeadCallback(TrainerCallback):
    def __init__(self, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer

    def on_save(self, args, state, control, **kwargs):
        checkpoint_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        model.save_pretrained(checkpoint_dir)
        self.tokenizer.save_pretrained(checkpoint_dir)

# Debug callback to log device placement
class DebugCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print("Model device:", next(model.parameters()).device)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[SaveCheckpointWithLMHeadCallback(tokenizer), DebugCallback()]
)


In [8]:
# Step 7: Resume Training Logic
from transformers.trainer_utils import get_last_checkpoint

last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    trainer.train()

# Save the final model and tokenizer explicitly
model_save_path = "/content/drive/MyDrive/chatbot_model"
print("Saving the final model...")
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)


Model device: cuda:0


Step,Training Loss,Validation Loss
500,0.3545,0.320709
1000,0.305,0.287886
1500,0.2867,0.267681
2000,0.2843,0.255544
2500,0.2603,0.248488
3000,0.2651,0.241461
3500,0.249,0.236317
4000,0.2513,0.232767
4500,0.2481,0.228225
5000,0.2433,0.226021


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


Saving the final model...


('/content/drive/MyDrive/chatbot_model/tokenizer_config.json',
 '/content/drive/MyDrive/chatbot_model/special_tokens_map.json',
 '/content/drive/MyDrive/chatbot_model/vocab.json',
 '/content/drive/MyDrive/chatbot_model/merges.txt',
 '/content/drive/MyDrive/chatbot_model/added_tokens.json',
 '/content/drive/MyDrive/chatbot_model/tokenizer.json')

In [13]:
# Step 8: Chatbot Inference
def chat_with_bot(context, prompt, max_length=None, temperature=0.7, top_p=0.9, history_len=2):
    if not context:  # No prior context
        context = ["### Assistant: Hello! How can I assist you today?"]
    context = context[-history_len:]  # Keep only the last `history_len` exchanges
    full_input = " ".join(context + [f"### Human: {prompt}"])
    max_length = max_length or (len(tokenizer.encode(full_input)) + 200)
    max_length = min(max_length, 512)  # Ensure max_length does not exceed model limit

    inputs = tokenizer.encode(full_input, return_tensors="pt").to(model.device)
    output = model.generate(
        inputs,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response.split("### Human:")[-1].strip()

# Save the tokenizer configuration and dataset mapping
print("Saving the tokenizer configuration...")
tokenizer.save_pretrained(model_save_path)

Saving the tokenizer configuration...


('/content/drive/MyDrive/chatbot_model/tokenizer_config.json',
 '/content/drive/MyDrive/chatbot_model/special_tokens_map.json',
 '/content/drive/MyDrive/chatbot_model/vocab.json',
 '/content/drive/MyDrive/chatbot_model/merges.txt',
 '/content/drive/MyDrive/chatbot_model/added_tokens.json',
 '/content/drive/MyDrive/chatbot_model/tokenizer.json')

In [15]:
# Step 9: Test the Chatbot
print("Chatbot is ready! Type 'exit' to end the chat.")
conversation_context = []
while True:
    try:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            print("Chatbot: Goodbye!")
            break
        elif not user_input.strip():
            print("Chatbot: Please type something!")
        else:
            bot_response = chat_with_bot(conversation_context, user_input)
            conversation_context.append(f"### Human: {user_input}")
            conversation_context.append(f"### Assistant: {bot_response}")
            print(f"Chatbot: {bot_response}")
    except Exception as e:
        print(f"Chatbot: Sorry, an error occurred: {e}")

Chatbot is ready! Type 'exit' to end the chat.
You: what are my delivery options
Chatbot: what are my delivery options? I'm here to guide you through the process of selecting the best delivery option. Let's dive into the details together.

1. Standard Shipping: This option is perfect for non-urgent items. It typically arrives within {{Date Range}} business days. It's a reliable choice for non-urgent items.
2. Expedited Shipping: If you're looking for a faster delivery, this option guarantees delivery within {{Date Range}} business days. It's a great choice for those who prefer a more relaxed pace.
3. Overnight Shipping: For those urgent needs, we offer overnight shipping, ensuring your items arrive the next business day. It's a reliable choice for urgent needs.
4. In-Store Pickup: If you prefer a more personalized experience, you can choose the in-store pickup option. Simply select this option during checkout and select the nearest store location to your location.

Remember, delivery t