In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
from wordcloud import WordCloud
import re
import unicodedata
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [4]:
# Load the Bitext customer-support dataset from the Hugging Face Hub and display it.
# NOTE(review): the recorded run of this cell ended in a RuntimeError caused by an
# HTTP 500 from the Hub's storage service; re-run the cell once the service is
# reachable, since every later cell depends on `data`.
data = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
data

Bitext_Sample_Customer_Support_Training_(…):   0%|          | 0.00/19.2M [00:00<?, ?B/s]

RuntimeError: Data processing error: CAS service error : Reqwest Error: HTTP status server error (500 Internal Server Error), domain: https://cas-server.xethub.hf.co/reconstructions/7c51ebbeb4c96943eccc2e20ae58d53c23958b3e2e096222ac23b46a7cd5297c

# Overview of the Dataset

This hybrid synthetic dataset is designed to be used to fine-tune Large Language Models such as GPT, Mistral and OpenELM, and has been generated using  NLP/NLG technology and our automated Data Labeling (DAL) tools.

## Dataset Specifications

* **Use Case:** Intent Detection
* **Vertical:** Customer Service
* **Intents:** 27 intents assigned to 10 categories
* **Dataset Size:** 26,872 question/answer pairs, around 1,000 per intent
* **Entity Types:** 30 entity/slot types
* **Language Tags:** 12 different types of language generation tags

## Categories and Intents

The categories and intents have been selected from Bitext's collection of 20 vertical-specific datasets, covering the intents that are common across all 20 verticals.

### Verticals Covered:
* Automotive
* Retail Banking
* Education
* Events & Ticketing
* Field Services
* Healthcare
* Hospitality
* Insurance
* Legal Services
* Manufacturing
* Media Streaming
* Mortgages & Loans
* Moving & Storage
* Real Estate/Construction
* Restaurant & Bar Chains
* Retail/E-commerce
* Telecommunications
* Travel
* Utilities
* Wealth Management

## Generation Methodology

The question/answer pairs have been generated using a hybrid methodology that uses natural texts as source text, NLP technology to extract seeds from these texts, and NLG technology to expand the seed texts. All steps in the process are curated by computational linguists.

In [None]:
# change the data format to pandas for easier manipulation and analysis
data.set_format(type='pandas')

# Extract the train set and display the first five rows
# NOTE(review): with the pandas format set, slicing the split with [:] is expected
# to return a DataFrame holding every row — confirm against the datasets docs.
df = data['train'][:]
df.head()

# Data Exploration

In [None]:
"""Basic Inspection"""

# Simple overview of the dataset showing the number of columns, null values and data types for each column
df.info()

In [None]:
"""Missing Data Analysis"""

# confirm number of null values for each column
def check_missing_values(dataframe):
  """Print per-column null counts and how many instructions/responses are blank.

  A value counts as an empty string when nothing is left after stripping
  surrounding whitespace. Expects 'instruction' and 'response' columns.
  """
  null_counts = dataframe.isnull().sum()
  print("Missing Values per column:\n")
  print(null_counts)
  print("\n")

  # whitespace-only entries are treated as empty too
  blank_counts = {
      name: dataframe[name].str.strip().eq('').sum()
      for name in ('instruction', 'response')
  }
  print(f"Number of empty strings in instructions column: {blank_counts['instruction']}")
  print(f"Number of empty strings in  responses column: {blank_counts['response']}")

check_missing_values(df)

In [None]:
"""Duplicate detection"""

# check for duplicated rows to prevent training bias
def check_duplicates(df):
  """Print how many rows of `df` repeat an earlier row exactly."""
  duplicate_count = df.duplicated().sum()
  print(f"Number of duplicate rows: {duplicate_count}")

check_duplicates(df)

In [None]:
# set figsize for matplotlib graphs
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
"""Category and Intent Analysis"""


def check_distribution(df, column:str, xlabel:str, ylabel:str, title:str):
  """Print the value counts of `column` and show them as a labelled bar chart.

  Args:
    df: frame containing `column`.
    column: name of the column whose values are counted.
    xlabel: x-axis label of the chart.
    ylabel: y-axis label of the chart.
    title: chart title, also used as the heading of the printed table.
  """
  counts = df[column].value_counts()
  print(f"===== {title} =====\n")
  print(counts)
  print("\n")

  # label the Axes returned by pandas directly instead of the pyplot "current axes"
  axes = counts.plot.bar()
  axes.set_xlabel(xlabel)
  axes.set_ylabel(ylabel)
  axes.set_title(title)
  plt.show()



check_distribution(df, 'category', 'Category', 'Count', 'Category Distribution')
check_distribution(df, 'intent', 'Intent', 'Count', 'Intent Distribution')

In [None]:
"""Text Length Analysis"""

# character counts for the instruction and response columns
for text_col in ('instruction', 'response'):
    df[f'{text_col}_length'] = df[text_col].str.len()

# word counts (whitespace-separated) for the same two columns
for text_col in ('instruction', 'response'):
    df[f'{text_col}_word_count'] = df[text_col].str.split().str.len()


# summary statistics: character lengths first, then word counts
for heading, suffix in (
    ("\n--- Character Length Statistics ---", 'length'),
    ("\n--- Word Count Statistics ---", 'word_count'),
):
    print(heading)
    for label, text_col in (("\nInstructions:", 'instruction'), ("\nResponses:", 'response')):
        print(label)
        print(df[f'{text_col}_{suffix}'].describe())

In [None]:
"""Vocabulary Wordcloud"""

# Word clouds of the vocabulary used in customer instructions vs. agent responses
instruction_text = " ".join(df["instruction"].values)
response_text = " ".join(df['response'].values)

# same canvas size for both clouds; only the background colour differs
cloud_size = dict(width=800, height=400)
instruction_wordcloud = WordCloud(background_color='white', **cloud_size).generate(instruction_text)
response_wordcloud = WordCloud(background_color='black', **cloud_size).generate(response_text)

# one row, two panels: instructions on the left, responses on the right
fig, panels = plt.subplots(1, 2, figsize=(12, 6))
for panel, cloud, panel_title in zip(
    panels,
    (instruction_wordcloud, response_wordcloud),
    ("Instruction", "Response"),
):
    panel.imshow(cloud, interpolation='bilinear')
    panel.axis('off')
    panel.set_title(panel_title)
plt.show()

In [None]:
"""Data Quality Check for corrupt or malformed data"""

# count texts that are too short to be meaningful
short_instructions = df['instruction'].str.split().str.len().lt(3).sum()  # fewer than 3 words
short_responses = df['response'].str.split().str.len().lt(5).sum()        # fewer than 5 words
print("\n1. Very short texts:")
print(f"   Instructions < 3 words: {short_instructions}")
print(f"   Responses < 5 words: {short_responses}")

# check for Special characters (excluding placeholders)
def has_unusual_chars(text):
    """Return True when `text` contains an unusual character.

    {{...}} placeholders are removed first. A character is unusual when it is
    not a word character, whitespace, or one of . , ! ? ' -
    """
    without_placeholders = re.sub(r'\{\{[^}]+\}\}', '', text)
    first_unusual = re.search(r'[^\w\s.,!?\'-]', without_placeholders)
    return first_unusual is not None

# count how many texts in each column contain at least one unusual character
unusual_instructions = df['instruction'].apply(has_unusual_chars).sum()
unusual_responses = df['response'].apply(has_unusual_chars).sum()
# numbered "2." so the printed report continues from "1. Very short texts"
print(f"\n2. Unusual characters (non-standard):")
print(f"   Instructions: {unusual_instructions}")
print(f"   Responses: {unusual_responses}")

In [None]:
"""Display unusual characters that may potentially break training"""
# Find specific unusual characters
def find_unusual_chars(text):
    """Return the set of distinct unusual characters found in `text`.

    {{...}} placeholders are removed before searching. A character is unusual
    when it is not a word character, whitespace, or one of . , ! ? ' -
    """
    without_placeholders = re.sub(r'\{\{[^}]+\}\}', '', text)
    return {hit.group() for hit in re.finditer(r'[^\w\s.,!?\'-]', without_placeholders)}

# Unusual characters in responses
def print_unusual_characters(column):
  """Print the sorted distinct unusual characters found across `df[column]`.

  Reads the notebook-level `df` and delegates the per-text search to
  `find_unusual_chars`.
  """
  # merge the per-text sets into one; an empty column yields an empty set
  all_unusual = set().union(*(find_unusual_chars(text) for text in df[column]))
  print(f"Unusual characters in {column}: {sorted(all_unusual)}\n\n")



print_unusual_characters('instruction')
print_unusual_characters('response')




# Key Insights and decisions from data analysis

1. **Basic Inspection** - There is no null data in the dataset, so no null handling is needed in preprocessing.

2. **Missing Data Analysis** - Confirms that there is no null data and that there are no empty strings in the dataset.

3. **Duplicate detection**  - No duplicates in the dataset, therefore no need for handling duplicates in preprocessing.

4. **Category and Intent Analysis** - There is some imbalance in the category distribution: the most common category, "ACCOUNT", makes up 22% of the data, while the least common, "CANCEL", accounts for 3.5%, an imbalance ratio of about 6.3x. The intent distribution, however, is much more balanced: each of the 27 intents has roughly 1,000 instructions, with the lowest at 950.

    **Insight:** Use a stratified split for intents during training and validation

5. **Text Length Analysis** - Customer instructions average around 8-10 words, while agent responses are significantly longer, averaging 105 words. The distribution shows most instructions fall between 7-11 words and responses between 72-124 words.

    **Insight:** These patterns inform the tokenizer configuration: **MAX_INPUT_LENGTH=128 tokens** should cover all the instructions, while **MAX_OUTPUT_LENGTH=512 tokens** accommodates the longer, more detailed responses without unnecessary padding or information loss, and also accommodates the maximum word count of 402 words.


6. **Data Quality Check:** About 104 examples have instructions with fewer than 3 words, 1,038 instructions contain unusual characters ['#', '$'], and 10,868 responses (40% of the dataset) contain special characters, including ['"', '#', '$', '&', '(', ')', '*', '+', '/', ':', ';', '>', '@', '[', ']', '`', '{', '}', '¡', '–', '—', ''', '☺', '✨', '️', '🌟', '👍', '💡', '💪', '🔐', '🔒', '🗝', '😊', '🙁', '🙏', '🛡', '🤗', '🤝'].

    **Insight:** Remove the 104 short instructions, as they represent only 0.4% of the dataset and are likely incomplete queries. For special characters, normalize double quotes to single (" to '), weird apostrophes to consistent ones (` to '), and double dashes to single (—– to -) for consistency. Keep functional characters like '$', '#', '@', '()', and ':' as they're used for prices, order numbers, emails, and formatting. Emojis can be retained, as they add warmth and approachability to customer service responses. The tokenizer will handle them appropriately during encoding.
  


# Data Preprocessing

## Preprocessing Steps NOT Needed for FLAN-T5

Modern transformer models like FLAN-T5 are designed to learn from minimally processed text. The following traditional NLP preprocessing steps are **unnecessary and can actually hurt performance**:

1. **Lowercasing** - The model learns that capitalization carries meaning (e.g., "ORDER" vs "order" may indicate urgency or emphasis). Lowercasing removes this signal.

2. **Removing Punctuation** - Punctuation conveys tone and meaning ("Help!" vs "Help." vs "Help?"). The model uses these cues for context.

3. **Removing Stopwords** - Words like "the", "a", "is" are crucial for grammar and sentence structure. Removing them breaks natural language patterns.

4. **Stemming/Lemmatization** - Converting "running" → "run" is unnecessary as the model's subword tokenizer already handles word variations and morphology.

5. **Removing Numbers** - Numbers are important in customer service (order IDs, amounts, dates). The model needs to learn when and how to use them.

6. **Aggressive Special Character Removal** - Characters like $, #, @, (), : have semantic meaning (prices, tags, emails, phone numbers). Keep them.

**What You DO Need:**
- Unicode normalization (NFKC) for consistency
- Remove extra whitespace
- Remove control characters
- Drop incomplete examples (<3 words)
- Normalizing punctuations into a standard format

**Philosophy:** FLAN-T5's tokenizer and architecture are designed to handle raw, natural text. Over-preprocessing removes information the model can learn from, reducing its ability to understand context and nuance.

In [None]:
# Normalize characters that could confuse training. Most of the "unusual" characters
# found during exploration are valid text that the detection regex simply did not
# whitelist, so only typographic variants and invisible characters are rewritten.

# NFKC leaves curly quotes, backticks and dashes untouched, so they are mapped to
# their ASCII equivalents explicitly.
_PUNCTUATION_MAP = str.maketrans({
    '\u2018': "'",  # left single quotation mark
    '\u2019': "'",  # right single quotation mark / typographic apostrophe
    '`': "'",       # backtick used as an apostrophe
    '\u201c': '"',  # left double quotation mark
    '\u201d': '"',  # right double quotation mark
    '\u2010': '-',  # unicode hyphen
    '\u2013': '-',  # en dash
    '\u2014': '-',  # em dash
})

def normalize_special_chars(text):
    """Return `text` in a consistent unicode and punctuation form.

    Steps, in order:
      1. NFKC normalization (folds compatibility forms such as ligatures and
         full-width characters).
      2. Typographic quotes, backticks and hyphen/dash variants are replaced by
         ASCII ', " and -.
      3. Characters in the unicode "C" (other/control) categories are removed,
         except newline, tab and space.

    Currency signs, '#', '@', brackets and emoji are left untouched.
    """
    text = unicodedata.normalize('NFKC', text)
    text = text.translate(_PUNCTUATION_MAP)
    # drop invisible control/format characters (null bytes, form feeds, ...) but keep layout whitespace
    return ''.join(
        char for char in text
        if char in '\n\t ' or unicodedata.category(char)[0] != 'C'
    )

In [None]:
def remove_extra_whitespace(text):
    """Collapse every run of whitespace into a single space and trim both ends."""
    # split() with no argument breaks on whitespace runs and discards empty edge pieces
    return ' '.join(text.split())

In [None]:
# function to clean and normalize the data
def clean_and_normalize(df):
  """Return a cleaned copy of `df` with 'instruction_clean' and 'response_clean' columns.

  The input frame is left untouched. The text columns are normalized with
  `normalize_special_chars`; rows whose normalized instruction has fewer than
  3 words or whose normalized response has fewer than 5 words are dropped; the
  surviving text then has its whitespace collapsed by `remove_extra_whitespace`.
  The number of removed rows and the resulting shape are printed.

  Args:
    df: frame with 'instruction' and 'response' string columns.

  Returns:
    A new DataFrame holding the surviving rows (original index labels kept).
  """
  # work on a copy so the caller's frame does not silently gain columns
  cleaned = df.copy()

  # normalize unicode characters in the data
  cleaned['instruction_clean'] = cleaned['instruction'].apply(normalize_special_chars)
  cleaned['response_clean'] = cleaned['response'].apply(normalize_special_chars)

  # remove really short instructions and responses
  initial_rows = cleaned.shape[0]
  long_enough = (
      (cleaned['instruction_clean'].str.split().str.len() >= 3)   # instructions need at least 3 words
      & (cleaned['response_clean'].str.split().str.len() >= 5)    # responses need at least 5 words
  )
  # explicit copy: the column assignments below must not write to a view of the unfiltered frame
  cleaned = cleaned.loc[long_enough].copy()
  final_rows = cleaned.shape[0]
  print(f"Removed {initial_rows - final_rows} rows\n")
  print(f"New Shape: {cleaned.shape}\n")

  # remove extra white space to ensure consistent spacing in the data
  cleaned['instruction_clean'] = cleaned['instruction_clean'].apply(remove_extra_whitespace)
  cleaned['response_clean'] = cleaned['response_clean'].apply(remove_extra_whitespace)

  return cleaned

df_clean = clean_and_normalize(df)

In [None]:
# Create sentence pairs to match the desired input of the model
def create_sentence_pairs(df):
  """Build the prompt/target pairs used for fine-tuning.

  Args:
    df: frame with 'instruction_clean' and 'response_clean' columns.

  Returns:
    A list with one dict per row, in row order: 'input' is the instruction
    preceded by the task prefix, 'output' is the response unchanged.
  """
  # the prefix describes the task, since the model was trained on data in this structure
  prefix = "Answer the customer service query: "
  # zip over the two columns instead of iterrows(), which builds a Series per row
  return [
      {'input': f"{prefix}{instruction}", 'output': response}
      for instruction, response in zip(df['instruction_clean'], df['response_clean'])
  ]

sentence_pairs = create_sentence_pairs(df_clean)
sentence_pairs[0:2]


In [None]:
MAX_LENGTH = 512 # max length (in tokens) for model responses; passed to the tokenizer in tokenize_data

# Initialize the google flan-t5 tokenizer from the "google/flan-t5-base" checkpoint
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
print(f"Vocab size: {tokenizer.vocab_size}")

def tokenize_data(sentence_pairs, max_input_length=128, max_output_length=None):
    """Tokenize prompt/response pairs into padded TensorFlow tensors.

    Prompts and responses get separate length budgets, so the short prompts are
    not padded out to the much larger response budget.

    Args:
        sentence_pairs: list of dicts with 'input' and 'output' strings.
        max_input_length: token budget for the prompts (padded/truncated to this
            length). The default of 128 follows the text-length analysis above.
            NOTE(review): confirm no prompt exceeds it once the prefix is added.
        max_output_length: token budget for the responses; defaults to the
            notebook-level MAX_LENGTH.

    Returns:
        The tokenizer's encoding of the prompts ('input_ids', 'attention_mask')
        with an added 'labels' tensor holding the response token ids, where
        padding positions are replaced by -100.
    """
    if max_output_length is None:
        max_output_length = MAX_LENGTH

    inputs = [pair['input'] for pair in sentence_pairs]
    outputs = [pair['output'] for pair in sentence_pairs]

    # encode the prompts
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )

    # encode the responses separately so they can use their own max_length
    target_encodings = tokenizer(
        text_target=outputs,
        max_length=max_output_length,
        padding="max_length",
        truncation=True,
        return_tensors="tf",
    )

    #  Mask padding tokens in labels with -100 instead of 0 to exclude them from loss computation
    labels = target_encodings['input_ids'].numpy()
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs['labels'] = tf.constant(labels)

    return model_inputs

# tokenize the sentence pairs
model_inputs = tokenize_data(sentence_pairs)
model_inputs


In [None]:
# split into training and validation sets
intents = df_clean['intent'].values

# scikit-learn selects rows with integer index arrays, which TensorFlow tensors do
# not accept, so the tensors are converted to NumPy arrays before splitting.
input_ids_np = np.asarray(model_inputs['input_ids'])
attention_mask_np = np.asarray(model_inputs['attention_mask'])
labels_np = np.asarray(model_inputs['labels'])

# stratify on intents so train and validation keep the same intent proportions
train_inputs, val_inputs, train_attention, val_attention, train_labels, val_labels = train_test_split(
    input_ids_np, attention_mask_np, labels_np,
    test_size=0.2, random_state=42, stratify=intents,
)

train_data = {
    'input_ids': train_inputs,
    'attention_mask': train_attention,
    'labels': train_labels
}

val_data = {
    'input_ids': val_inputs,
    'attention_mask': val_attention,
    'labels': val_labels
}

# a dict has no .shape attribute; display the shape of every array in both splits
{
    split_name: {key: array.shape for key, array in split.items()}
    for split_name, split in (('train', train_data), ('val', val_data))
}

In [None]:
# TODO: potential train test split confirm later

In [None]:
# # restore code to normal huggging face format for hugging face training
# data.reset_format()

In [None]:
# def tokenize(batch):
#   temp = tokenizer(batch['instruction'], batch['response'], truncation=True, padding=True)
#   # batch['input_ids'] = temp['input_ids']
#   # batch['attention_mask'] = temp['attention_mask']
#   return temp

In [None]:
# TODO: map tokenizer to entire dataset

# Build Model

In [None]:
# Load the pretrained weights from the same checkpoint the tokenizer was created from.
# The original line used AutoModel and model_ckpt, neither of which is defined in this
# notebook; T5ForConditionalGeneration is already imported in the first cell.
# NOTE(review): T5ForConditionalGeneration is the PyTorch class, while tokenize_data
# builds TensorFlow tensors — confirm which framework the training loop should use.
model_ckpt = "google/flan-t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_ckpt)