In [None]:
class CharTokenizer:
    """Character-level tokenizer with a dedicated token for leading indentation.

    The vocabulary consists of four special tokens with fixed ids
    (``<pad>``=0, ``<bos>``=1, ``<eos>``=2, ``<indent>``=3) followed by every
    distinct character found in ``texts``, in sorted order.

    Attributes:
        indent_spaces: number of leading spaces represented by one ``<indent>``.
        special_tokens: the special token strings, in id order.
        stoi: mapping from token string to integer id.
        itos: mapping from integer id to token string.
        vocab_size: total number of tokens (special tokens + characters).
    """

    def __init__(self, texts, indent_spaces=4):
        """Build the vocabulary from an iterable of strings.

        Args:
            texts: iterable of strings whose characters form the vocabulary.
            indent_spaces: how many leading spaces make up one indent level.

        Raises:
            ValueError: if ``indent_spaces`` is smaller than 1 (a zero width
                would make the indent-splitting in ``encode`` never terminate).
        """
        if indent_spaces < 1:
            raise ValueError(
                f"indent_spaces must be a positive integer, got {indent_spaces!r}"
            )
        self.indent_spaces = indent_spaces

        # Special tokens occupy the first ids so they stay stable across corpora.
        self.special_tokens = ["<pad>", "<bos>", "<eos>", "<indent>"]
        self.stoi = {tok: i for i, tok in enumerate(self.special_tokens)}
        self.itos = {i: tok for tok, i in self.stoi.items()}

        # Collect the set of distinct characters across all texts.
        chars = set()
        for text in texts:
            chars.update(text)

        # Characters are numbered after the special tokens, in sorted order so
        # the mapping is deterministic for a given corpus.
        offset = len(self.stoi)
        for i, ch in enumerate(sorted(chars)):
            self.stoi[ch] = i + offset
            self.itos[i + offset] = ch

        self.vocab_size = len(self.stoi)

    def encode(self, text, add_special_tokens=True):
        """Convert ``text`` into a list of token ids.

        A run of spaces at the start of a line is split into as many
        ``<indent>`` tokens as fit, followed by the leftover spaces as plain
        space tokens. Spaces anywhere else in a line are always encoded as
        plain space tokens.

        Args:
            text: the string to encode.
            add_special_tokens: when true, wrap the ids in ``<bos>`` / ``<eos>``.

        Returns:
            A list of integer token ids.

        Raises:
            KeyError: if ``text`` contains a character that is not in the vocabulary.
        """
        ids = []

        if add_special_tokens:
            ids.append(self.stoi["<bos>"])

        indent_id = self.stoi["<indent>"]
        length = len(text)
        pos = 0
        at_line_start = True  # the beginning of the text counts as a line start
        while pos < length:
            ch = text[pos]
            if ch == " " and at_line_start:
                # Measure the whole run of leading spaces.
                run_end = pos
                while run_end < length and text[run_end] == " ":
                    run_end += 1

                n_indents, n_spaces = divmod(run_end - pos, self.indent_spaces)
                ids.extend([indent_id] * n_indents)
                if n_spaces:
                    # Only look up the space token when it is actually needed.
                    ids.extend([self.stoi[" "]] * n_spaces)

                pos = run_end
                at_line_start = False
            else:
                ids.append(self.stoi[ch])
                at_line_start = ch == "\n"
                pos += 1

        if add_special_tokens:
            ids.append(self.stoi["<eos>"])

        return ids

    def decode(self, ids):
        """Convert token ids back into a string.

        ``<pad>``, ``<bos>`` and ``<eos>`` are dropped, ``<indent>`` expands to
        ``indent_spaces`` spaces, and ids that are not in the vocabulary are
        silently skipped.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The decoded string.
        """
        skipped = ("<bos>", "<eos>", "<pad>")
        indent_text = " " * self.indent_spaces

        pieces = []
        for token_id in ids:
            token = self.itos.get(token_id, "")  # unknown ids decode to nothing
            if token in skipped:
                continue
            pieces.append(indent_text if token == "<indent>" else token)
        # Joining once avoids the quadratic cost of repeated string concatenation.
        return "".join(pieces)


In [6]:
import pandas as pd
# Load the bug-fix dataset; the cells below read its "buggy_code" and "fixed_code" columns.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv("code_bug_fix_pairs.csv")

In [14]:
# Gather every snippet (buggy first, then fixed, row by row) as the corpus
# from which the tokenizer builds its vocabulary.
texts = [
    snippet
    for pair in zip(df["buggy_code"], df["fixed_code"])
    for snippet in pair
]

tokenizer = CharTokenizer(texts)

print("Vocab size:", tokenizer.vocab_size)
print([*tokenizer.stoi.items()])



Vocab size: 57
[('<pad>', 0), ('<bos>', 1), ('<eos>', 2), ('<indent>', 3), ('\n', 4), (' ', 5), ('#', 6), ("'", 7), ('(', 8), (')', 9), ('*', 10), ('+', 11), (',', 12), ('-', 13), ('0', 14), ('1', 15), ('2', 16), ('3', 17), ('4', 18), ('5', 19), ('6', 20), ('7', 21), ('8', 22), ('9', 23), (':', 24), ('=', 25), ('>', 26), ('D', 27), ('F', 28), ('H', 29), ('I', 30), ('M', 31), ('S', 32), ('T', 33), ('[', 34), (']', 35), ('_', 36), ('a', 37), ('b', 38), ('c', 39), ('d', 40), ('e', 41), ('f', 42), ('g', 43), ('h', 44), ('i', 45), ('l', 46), ('m', 47), ('n', 48), ('o', 49), ('p', 50), ('r', 51), ('s', 52), ('t', 53), ('u', 54), ('w', 55), ('x', 56)]


In [17]:
# Round-trip sanity check on the first buggy snippet: decode(encode(x)) must give x back.
sample = df["buggy_code"].iloc[0]
# repr() makes newlines and spaces visible, so whitespace differences are easy to spot.
print("ORIGINAL:", repr(sample), sep="\n")

encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

print("ENCODED:", encoded[:50], sep="\n")  # only the first 50 token ids
print("DECODED:", repr(decoded), sep="\n")

assert decoded == sample
print("Yippee reversiblityy")

ORIGINAL:
'x = [1, 2, 3]\nprint x\n# Sample ID: 1'
ENCODED:
[1, 56, 5, 25, 5, 34, 15, 12, 5, 16, 12, 5, 17, 35, 4, 50, 51, 45, 48, 53, 5, 56, 4, 6, 5, 32, 37, 47, 50, 46, 41, 5, 30, 27, 24, 5, 15, 2]
DECODED:
'x = [1, 2, 3]\nprint x\n# Sample ID: 1'
Yippee reversiblityy
