# 03 - Text Preprocessing for Chatbots

**Duration:** 2-3 hours | **Difficulty:** Beginner-Intermediate

## 🎯 Learning Objectives
- Master text cleaning and normalization techniques
- Understand tokenization strategies (word, character, subword)
- Build vocabulary and implement encoding/decoding
- Create a complete text preprocessing pipeline

## 📚 Contents
1. Text Cleaning and Normalization
2. Tokenization Strategies
3. Vocabulary Building
4. Encoding and Padding
5. Complete Preprocessing Pipeline

In [None]:
import torch
import re
import string
import json
import unicodedata
from collections import Counter, defaultdict
from typing import List, Dict, Tuple, Optional
import matplotlib.pyplot as plt

# Import our custom utilities
import sys
sys.path.append('../utils')
from text_utils import SimpleTokenizer, clean_text, pad_sequences

print("Text preprocessing setup complete!")

## 1. Text Cleaning and Normalization

Before we can train chatbots, we need to clean and normalize text data.

In [None]:
# Load sample conversation data
# Pass the encoding explicitly: JSON text is UTF-8, and relying on the
# platform's default locale encoding can mis-decode non-ASCII characters.
with open('../data/conversations/simple_qa_pairs.json', 'r', encoding='utf-8') as f:
    conversation_data = json.load(f)

# Extract texts for preprocessing: every pair contributes its query
# followed by its response, so texts alternates query / response.
texts = []
for item in conversation_data:
    texts.append(item['query'])
    texts.append(item['response'])

print(f"Loaded {len(conversation_data)} conversation pairs")
print(f"Total texts: {len(texts)}")
print("\nFirst few examples:")
# Slice rather than index with range(3) so that a dataset holding fewer
# than three pairs prints what it has instead of raising IndexError.
for item in conversation_data[:3]:
    print(f"Q: {item['query']}")
    print(f"A: {item['response']}\n")

In [None]:
# Text cleaning examples
sample_texts = [
    "Hello!!!   How are you? 😊",
    "What's YOUR name???",
    "I'm   learning   PyTorch...   ",
    "Can you help with ML/AI topics?"
]

print("Text Cleaning Examples:")
print("=" * 50)

for text in sample_texts:
    # Run the three cleaning variants up front; each one lowercases and
    # enables exactly one additional clean_text option.
    basic_clean = clean_text(text, lowercase=True, remove_extra_whitespace=True)
    no_punct = clean_text(text, lowercase=True, remove_punctuation=True)
    no_special = clean_text(text, lowercase=True, remove_special_chars=True)

    # Report the raw input next to each variant (repr exposes whitespace)
    print(f"Original: {repr(text)}")
    print(f"Basic:    {repr(basic_clean)}")
    print(f"No punct: {repr(no_punct)}")
    print(f"No spec:  {repr(no_special)}")
    print()

In [None]:
# Advanced text normalization

# Contractions and their expansions. The whole-word special cases
# ("can't", "won't") come before the generic "n't" suffix so that they win
# when the alternation built below is tried left to right.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "n't": " not",
    "'re": " are",
    "'ve": " have",
    "'ll": " will",
    "'d": " would",
    "'m": " am"
}

# Case-insensitive so that "Won't" / "CAN'T" are expanded like their
# lowercase forms instead of falling through to the generic "n't" rule.
_CONTRACTION_RE = re.compile(
    "|".join(re.escape(contraction) for contraction in _CONTRACTIONS),
    re.IGNORECASE,
)

# Require "://" after the scheme and "." after "www" so ordinary words such
# as "httpd" or "awww" are not mistaken for links.
_URL_RE = re.compile(r'\bhttps?://\S+|\bwww\.\S+', re.IGNORECASE)

# A run of the same '!', '?' or '.' character repeated two or more times.
_REPEATED_PUNCT_RE = re.compile(r'([!?.])\1+')


def advanced_text_cleaning(text: str) -> str:
    """
    Advanced text cleaning for chatbot preprocessing.

    Steps, in order: normalize typographic apostrophes, expand common
    English contractions (case-insensitively), remove URLs, apply
    ``clean_text`` with lowercasing and extra-whitespace removal, and
    collapse runs of repeated ``!``, ``?`` or ``.`` to a single character.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Treat the typographic apostrophe (U+2019) like the ASCII one so that
    # contractions typed with "smart quotes" are expanded as well.
    text = text.replace("\u2019", "'")

    # Expand contractions in a single pass. Matching ignores case; the
    # expansion is looked up by the lowercased match.
    text = _CONTRACTION_RE.sub(lambda m: _CONTRACTIONS[m.group(0).lower()], text)

    # Remove URLs before the basic cleaning step so that the whitespace
    # handling requested from clean_text below sees the gap a removed URL
    # leaves behind.
    text = _URL_RE.sub('', text)

    # Basic cleaning
    text = clean_text(text, lowercase=True, remove_extra_whitespace=True)

    # Handle repeated punctuation ("!!!" -> "!", "???" -> "?", "..." -> ".")
    text = _REPEATED_PUNCT_RE.sub(r'\1', text)

    return text.strip()

# Test advanced cleaning
test_cases = [
    "I can't believe it's working!!!",
    "What's your name??? I'm curious...",
    "Check out this link: https://pytorch.org",
    "Won't you help me??? Please!!!"
]

print("Advanced Text Cleaning:")
# map() is lazy, so each input is cleaned right before its pair is printed
for text, cleaned in zip(test_cases, map(advanced_text_cleaning, test_cases)):
    print(f"Original: {text}")
    print(f"Cleaned:  {cleaned}\n")

## 2. Tokenization Strategies

Tokenization breaks text into smaller units (tokens) that models can process.

In [None]:
# Word-level tokenization
def word_tokenize(text: str) -> List[str]:
    """Return the word tokens of *text* after advanced cleaning.

    The text is first passed through ``advanced_text_cleaning``; every
    maximal run of regex word characters in the result becomes one token,
    so whitespace and punctuation act as separators and are dropped.
    """
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, advanced_text_cleaning(text))

# Character-level tokenization
def char_tokenize(text: str) -> List[str]:
    """Return the cleaned *text* as a list of single-character tokens.

    Cleaning is done by ``advanced_text_cleaning``; every character that
    survives it (spaces and punctuation included) is its own token.
    """
    cleaned = advanced_text_cleaning(text)
    return [*cleaned]

# Subword tokenization (simplified)
def simple_subword_tokenize(text: str, max_word_length: int = 6) -> List[str]:
    """Simplified subword tokenization.

    The text is split into words with ``word_tokenize``; each word is then
    cut into consecutive chunks of at most ``max_word_length`` characters.
    The first chunk of a word is emitted as-is and every following chunk is
    prefixed with ``'##'`` to mark it as a continuation.

    Args:
        text: Raw input text.
        max_word_length: Maximum number of characters per chunk; must be
            at least 1.

    Returns:
        The list of subword tokens, in reading order.

    Raises:
        ValueError: If ``max_word_length`` is smaller than 1.
    """
    # A non-positive chunk size is meaningless: 0 makes range() fail with an
    # unrelated-looking message and a negative value would silently drop
    # every word. Reject both up front with a clear error.
    if max_word_length < 1:
        raise ValueError(f"max_word_length must be >= 1, got {max_word_length}")

    tokens = []
    for word in word_tokenize(text):
        # A word no longer than max_word_length yields exactly one chunk
        # (the word itself), so short and long words share this loop.
        for start in range(0, len(word), max_word_length):
            chunk = word[start:start + max_word_length]
            tokens.append(chunk if start == 0 else '##' + chunk)  # '##' = continuation marker

    return tokens

# Compare tokenization strategies
sample_text = "Hello! What's machine learning and how does tokenization work?"

# Tokenize the same sentence at three granularities
word_tokens, char_tokens, subword_tokens = (
    word_tokenize(sample_text),
    char_tokenize(sample_text),
    simple_subword_tokenize(sample_text),
)

print(f"Original text: {sample_text}")
print(f"\nWord tokens ({len(word_tokens)}): {word_tokens}")
# Only the first 20 characters are shown to keep the output short
print(f"\nCharacter tokens ({len(char_tokens)}): {char_tokens[:20]}...")
print(f"\nSubword tokens ({len(subword_tokens)}): {subword_tokens}")

## 3. Vocabulary Building

Create a vocabulary mapping between tokens and numerical IDs.

In [None]:
# Build vocabulary from our conversation data
def build_vocab_from_texts(texts: List[str], vocab_size: int = 1000, min_freq: int = 2) -> SimpleTokenizer:
    """
    Create a SimpleTokenizer and build its vocabulary from *texts*.

    Every text is run through ``advanced_text_cleaning`` before it is
    handed to the tokenizer.

    Args:
        texts: Raw texts to learn the vocabulary from.
        vocab_size: Passed to the ``SimpleTokenizer`` constructor.
        min_freq: Forwarded to ``SimpleTokenizer.build_vocabulary``
            (presumably the minimum token count required for inclusion;
            confirm against text_utils).

    Returns:
        The tokenizer after ``build_vocabulary`` has been called on it.
    """
    tokenizer = SimpleTokenizer(vocab_size=vocab_size)

    # Normalize the corpus, then let the tokenizer learn its vocabulary
    tokenizer.build_vocabulary(
        list(map(advanced_text_cleaning, texts)),
        min_freq=min_freq,
    )

    return tokenizer

# Build vocabulary from our conversation data
tokenizer = build_vocab_from_texts(texts, vocab_size=500, min_freq=1)

print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
print(f"Special tokens: {tokenizer.special_tokens}")

# Show some vocabulary examples
print("\nSample vocabulary (first 20 tokens):")
# Preview at most 20 entries, fewer when the vocabulary is smaller
preview_count = min(20, len(tokenizer.id_to_token))
for i in range(preview_count):
    token = tokenizer.id_to_token[i]
    print(f"ID {i:2d}: '{token}'")

In [None]:
# Test tokenization and encoding
test_sentences = [
    "Hello, how are you?",
    "What is machine learning?",
    "Can you help me with programming?"
]

print("Tokenization and Encoding Examples:")
print("=" * 50)

for sentence in test_sentences:
    # Round trip: text -> tokens, text -> IDs, IDs -> text
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.encode(sentence, add_special_tokens=True, max_length=15)
    decoded = tokenizer.decode(token_ids, skip_special_tokens=True)

    # Show every stage so the effect of each step is visible
    print(f"Original: {sentence}")
    print(f"Tokens:   {tokens}")
    print(f"IDs:      {token_ids}")
    print(f"Decoded:  {decoded}")
    print()

## 4. Encoding and Padding

Convert text to tensors and handle variable-length sequences.

In [None]:
# Batch processing example
batch_sentences = [
    "Hi there!",
    "How are you doing today?",
    "What can you help me with?",
    "Thanks!",
    "I need assistance with machine learning and deep learning concepts."
]

print("Batch Processing Example:")
print("=" * 40)

# Encode each sentence (variable-length ID lists), then report each one
encoded_sentences = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in batch_sentences
]
for sentence, encoded in zip(batch_sentences, encoded_sentences):
    print(f"'{sentence}' -> {encoded} (length: {len(encoded)})")

# Pad sequences to same length, using the ID of the tokenizer's pad token
max_length = 15
pad_token = tokenizer.special_tokens["pad_token"]
pad_id = tokenizer.token_to_id[pad_token]

padded_tensor = pad_sequences(encoded_sentences, max_length=max_length, pad_value=pad_id)

print(f"\nPadded tensor shape: {padded_tensor.shape}")
print(f"Padded tensor:")
print(padded_tensor)

# Create attention mask: 1 where the ID differs from pad_id, 0 elsewhere
attention_mask = (padded_tensor != pad_id).long()
print(f"\nAttention mask:")
print(attention_mask)

In [None]:
# Analyze text statistics
def analyze_text_lengths(texts: List[str], tokenizer: "SimpleTokenizer") -> Dict:
    """
    Analyze token length statistics for a collection of texts.

    Each text is encoded with ``tokenizer.encode(text,
    add_special_tokens=True)`` and the length of the returned sequence is
    recorded, so the counts include whatever special tokens the tokenizer
    adds.

    Args:
        texts: Texts to measure.
        tokenizer: Object providing ``encode(text, add_special_tokens=...)``.

    Returns:
        A dict with keys ``'min_length'``, ``'max_length'``,
        ``'avg_length'`` and ``'lengths'`` (per-text lengths in input
        order). For an empty ``texts`` the three statistics are 0 and
        ``'lengths'`` is an empty list.
    """
    lengths = [len(tokenizer.encode(text, add_special_tokens=True)) for text in texts]

    # min()/max() and the average are undefined for an empty collection;
    # return neutral values instead of raising ValueError/ZeroDivisionError.
    if not lengths:
        return {
            'min_length': 0,
            'max_length': 0,
            'avg_length': 0.0,
            'lengths': []
        }

    return {
        'min_length': min(lengths),
        'max_length': max(lengths),
        'avg_length': sum(lengths) / len(lengths),
        'lengths': lengths
    }

# Analyze our conversation data
stats = analyze_text_lengths(texts, tokenizer)

print(f"Text Length Statistics:")
print(f"Min length: {stats['min_length']} tokens")
print(f"Max length: {stats['max_length']} tokens")
print(f"Avg length: {stats['avg_length']:.2f} tokens")

# Plot length distribution
plt.figure(figsize=(10, 4))

# Left panel: histogram of token lengths
plt.subplot(1, 2, 1)
plt.hist(stats['lengths'], bins=20, alpha=0.7, edgecolor='black')
plt.xlabel('Token Length')
plt.ylabel('Frequency')
plt.title('Token Length Distribution')
plt.grid(True, alpha=0.3)

# Right panel: cumulative distribution
plt.subplot(1, 2, 2)
sorted_lengths = sorted(stats['lengths'])
num_lengths = len(sorted_lengths)
# Empirical CDF: the k-th smallest length (1-based) covers k of the n
# samples, so the share is k / n. Counting from 1 makes the curve end at
# exactly 100% instead of starting at 0% and stopping one step short.
percentiles = [rank / num_lengths * 100 for rank in range(1, num_lengths + 1)]
plt.plot(sorted_lengths, percentiles)
plt.xlabel('Token Length')
plt.ylabel('Cumulative %')
plt.title('Cumulative Length Distribution')
plt.grid(True, alpha=0.3)

# Add percentile lines: the length at sorted position int(0.95 * n) is at
# least as long as 95% of the texts.
p95_idx = int(0.95 * len(sorted_lengths))
p95_length = sorted_lengths[p95_idx]
plt.axvline(p95_length, color='red', linestyle='--', label=f'95th percentile: {p95_length}')
plt.legend()

plt.tight_layout()
plt.show()

print(f"\nRecommended max_length for 95% coverage: {p95_length} tokens")

## 5. Complete Preprocessing Pipeline

Put everything together into a reusable preprocessing pipeline.

In [None]:
class ChatbotPreprocessor:
    """
    End-to-end text-to-tensor preprocessing for chatbot data.

    Wraps a tokenizer built by ``build_vocab_from_texts`` behind a
    fit / transform / decode workflow.
    """

    def __init__(self, vocab_size: int = 1000, max_length: int = 128):
        # Settings forwarded to the tokenizer builder, encoder and padder
        self.vocab_size, self.max_length = vocab_size, max_length
        # Both are set by fit(); a None tokenizer means "not fitted yet"
        self.tokenizer = None
        self.pad_id = None

    def fit(self, texts: List[str], min_freq: int = 2):
        """
        Build the tokenizer from *texts* and remember its padding ID.

        Args:
            texts: Training texts used to build the vocabulary.
            min_freq: Forwarded to ``build_vocab_from_texts``.

        Returns:
            This preprocessor, so calls can be chained.
        """
        print(f"Fitting preprocessor on {len(texts)} texts...")

        fitted = build_vocab_from_texts(texts, self.vocab_size, min_freq)
        self.tokenizer = fitted
        # Look up the ID assigned to the tokenizer's pad token
        self.pad_id = fitted.token_to_id[fitted.special_tokens["pad_token"]]

        print(f"Vocabulary size: {self.tokenizer.get_vocab_size()}")

        return self

    def transform(self, texts: List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode *texts* and pad them into tensors.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: (input_ids, attention_mask),
            where the mask is 1 wherever input_ids differs from the pad ID.

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.tokenizer is None:
            raise ValueError("Preprocessor not fitted. Call fit() first.")

        # Encode every text, limited to max_length by the tokenizer
        encoded_texts = [
            self.tokenizer.encode(text, add_special_tokens=True, max_length=self.max_length)
            for text in texts
        ]

        # Bring all sequences to max_length using the pad ID
        input_ids = pad_sequences(encoded_texts, max_length=self.max_length, pad_value=self.pad_id)

        return input_ids, (input_ids != self.pad_id).long()

    def fit_transform(self, texts: List[str], **fit_params) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Call fit() with *fit_params*, then transform() on the same texts.
        """
        self.fit(texts, **fit_params)
        return self.transform(texts)

    def decode(self, token_ids: torch.Tensor, skip_special_tokens: bool = True) -> List[str]:
        """
        Turn rows of token IDs back into texts, one string per row.

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.tokenizer is None:
            raise ValueError("Preprocessor not fitted.")

        # Each row is converted to a plain list before decoding
        return [
            self.tokenizer.decode(row.tolist(), skip_special_tokens=skip_special_tokens)
            for row in token_ids
        ]

# Test the complete pipeline
preprocessor = ChatbotPreprocessor(vocab_size=500, max_length=32)

# Split the data: first 40 texts for fitting, the following 5 for testing
train_texts, test_texts = texts[:40], texts[40:45]

# Fit on training data
preprocessor.fit(train_texts)

# Transform test data
input_ids, attention_mask = preprocessor.transform(test_texts)

print(f"\nTransformed data:")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention mask shape: {attention_mask.shape}")

# Decode back to verify
decoded_texts = preprocessor.decode(input_ids)

print(f"\nOriginal vs Decoded:")
# Compare only the first three pairs side by side
first_originals, first_decoded = test_texts[:3], decoded_texts[:3]
for orig, decoded in zip(first_originals, first_decoded):
    print(f"Original: {orig}")
    print(f"Decoded:  {decoded}")
    print()

## 🎉 Congratulations!

You've mastered text preprocessing for chatbot development:

✅ **Text Cleaning**: Normalization and cleaning techniques  
✅ **Tokenization**: Word, character, and subword strategies  
✅ **Vocabulary**: Building and managing token vocabularies  
✅ **Encoding**: Converting text to numerical representations  
✅ **Batch Processing**: Handling variable-length sequences  
✅ **Complete Pipeline**: End-to-end preprocessing system  

## 🚀 Next Steps

In the next notebook, we'll use these preprocessing techniques to build neural networks for text classification:
- Multi-layer perceptrons for text
- Training and evaluation
- Model architectures

**Ready to build models?** Continue to [`04_neural_networks_basics.ipynb`](04_neural_networks_basics.ipynb)!