In [1]:
from agentic_patterns.reflection.agent import ReflectionAgent

In [2]:
agent = ReflectionAgent()

In [3]:
generation_system_prompt = "You are a Python programmer tasked with generating high quality Python code"

reflection_system_prompt = "You are Andrej Karpathy, an experienced computer scientist"

user_msg = "Generate a Python implementation of byte pair encoding for tokenization"

In [4]:
final_response = agent.run(
    user_message=user_msg,
    steps=5,
    generation_prompt=generation_system_prompt,
    reflection_prompt=reflection_system_prompt,
)

[32m 

GENERATION

 Certainly! Byte Pair Encoding (BPE) is a simple form of data compression that iteratively replaces the most frequent pair of bytes (or characters) with a single byte (or character) that does not occur in the data. Below is a Python implementation of a BPE algorithm to perform tokenization.

```python
from collections import Counter
import re

class BytePairEncoder:
    def __init__(self, num_merges: int):
        self.num_merges = num_merges
        self.vocab = None

    def _get_stats(self, pairs):
        """ Get the frequency of each pair in the vocabulary. """
        pair_freqs = Counter()
        for word, freq in pairs.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pair = (symbols[i], symbols[i + 1])
                pair_freqs[pair] += freq
        return pair_freqs

    def _merge_vocab(self, pairs):
        """ Merge the most frequent pair in the vocabulary. """
        if not pairs:
            r

In [5]:
from IPython.display import display_markdown
display_markdown(final_response, raw=True)

Thank you for the detailed feedback! Based on your recommendations, I've revised the Byte Pair Encoding (BPE) implementation to incorporate error handling, improve string manipulation, enhance documentation, implement better tokenization, and provide options for verbose output and performance metrics.

Here's the updated version of the BPE implementation:

```python
from collections import Counter
import re
from typing import List, Tuple

class BytePairEncoder:
    def __init__(self, num_merges: int, verbose: bool = False):
        self.num_merges = num_merges
        self.verbose = verbose
        self.vocab = None

    def _get_stats(self, pairs: Counter) -> Counter:
        """ 
        Get the frequency of each pair in the vocabulary. 
        :param pairs: A Counter object with the current vocabulary.
        :return: A Counter with pair frequencies.
        """
        pair_freqs = Counter()
        for word, freq in pairs.items():
            symbols = word.split()
            for i in range(len(symbols) - 1):
                pair = (symbols[i], symbols[i + 1])
                pair_freqs[pair] += freq
        return pair_freqs

    def _merge_vocab(self, pairs: Counter) -> dict:
        """ 
        Merge the most frequent pair in the vocabulary.
        :param pairs: A Counter with pair frequencies.
        :return: Updated vocabulary after merging.
        """
        if not pairs:
            return None
        
        best_pair = max(pairs, key=pairs.get)
        first, second = best_pair
        
        if first not in self.vocab or second not in self.vocab:
            return self.vocab  # If pairs not found, no merge to be done
        
        new_word = first + second
        
        new_vocab = {}
        for word, freq in pairs.items():
            # Efficient merge using string concatenation
            new_word_str = word.replace(f"{first} {second}", new_word)
            new_vocab[new_word_str] = freq
        
        if self.verbose:
            print(f"Merging pair: {first}, {second} -> {new_word}")

        return new_vocab

    def fit(self, text: str) -> None:
        """ 
        Fit the BPE model on the given text.
        :param text: Input text to build the vocabulary from.
        """
        words = text.split()
        self.vocab = Counter((' '.join(list(word)),) for word in words)
        
        for i in range(self.num_merges):
            pairs = self._get_stats(self.vocab)
            if not pairs:
                break
            self.vocab = self._merge_vocab(pairs)

    def encode(self, text: str) -> str:
        """ 
        Encode the text using the fitted BPE model.
        :param text: Text to encode using BPE.
        :return: Encoded text as a string.
        """
        if self.vocab is None:
            raise ValueError("The model has not been fitted yet.")

        words = text.split()
        encoded_words = []
        
        for word in words:
            tokens = list(word)
            for _ in range(self.num_merges):
                new_word = ' '.join(tokens)
                pairs = self._get_stats(Counter({new_word: 1}))
                if not pairs:
                    break
                
                # Perform merge using the most frequent pair
                best_pair = max(pairs, key=pairs.get)
                first, second = best_pair
                
                if f'{first} {second}' in new_word:
                    tokens = new_word.replace(f'{first} {second}', first + second).split()
                    
            encoded_words.append(''.join(tokens))  # Joined to avoid spaces
            
        return ' '.join(encoded_words)  # Return as a single string

# Example usage:
if __name__ == '__main__':
    text = "low lower new higher"
    bpe = BytePairEncoder(num_merges=10, verbose=True)
    bpe.fit(text)
    encoded_text = bpe.encode(text)
    print("Encoded Text:", encoded_text)
```

### Changes Made:
1. **Error Handling for Merges**: Before merging, it now checks if the pairs exist in the vocabulary.
2. **Efficient String Manipulation**: Updated token merging to use `str.replace` directly instead of regex, improving performance.
3. **Tokenization Improvements**: The encoder can now handle tokens better when merging to avoid potential issues with spaces.
4. **Return Encoded Text as Single String**: The `encode` function now returns a single string instead of a list of encoded words.
5. **Verbose Output**: Added a `verbose` flag to track the mergers in the fitting process for better educational understanding.
6. **Type Hinting**: Improved type hints for better clarity on method parameters and returns.
7. **Documentation**: Enhanced comments and docstrings for clearer understanding.
8. **Separation of Tokens**: Tokens are returned without spaces, ensuring words are concatenated as they would be processed as tokens.

I appreciate your guidance in making this implementation more robust and user-friendly! If there are any further suggestions or additions you'd like to see, feel free to let me know!

In [6]:
local_model_agent = ReflectionAgent(model='llama3.2', run_local=True, save_logs=True)

In [7]:
response = local_model_agent.run(
    user_message=user_msg,
    steps=3,
    generation_prompt=generation_system_prompt,
    reflection_prompt=reflection_system_prompt,
)

[2m2025-06-10 16:43:18[0m [[32m[1minfo     [0m] [1mLogs saved:                   [0m [36mpath[0m=[35m/Users/mac/Personal/Projects/agentic-patterns/agentic_patterns/reflection/logs/llama3.2_20250610_164318.log[0m
