In [191]:
class CharTokenizer:
    """Character-level tokenizer with a dedicated token for leading indentation.

    Every distinct character seen in the training texts gets its own id.
    Four special tokens occupy the first ids: ``<pad>`` (0), ``<bos>`` (1),
    ``<eos>`` (2) and ``<indent>`` (3). At the start of a line, each full
    group of ``indent_spaces`` spaces is encoded as a single ``<indent>``
    token; all other spaces are encoded as ordinary space characters.
    """

    def __init__(self, texts, indent_spaces=4):
        """Build the vocabulary from every distinct character in ``texts``.

        Args:
            texts: iterable of strings whose characters form the vocabulary.
            indent_spaces: number of spaces one ``<indent>`` token stands for.

        Raises:
            ValueError: if ``indent_spaces`` is smaller than 1 (an indent of
                zero width could never be consumed while encoding).
        """
        if indent_spaces < 1:
            raise ValueError(f"indent_spaces must be >= 1, got {indent_spaces!r}")
        self.indent_spaces = indent_spaces

        # Special tokens (fixed IDs): padding, beginning of sequence, end of sequence, indent
        self.special_tokens = ["<pad>", "<bos>", "<eos>", "<indent>"]
        self.stoi = {tok: i for i, tok in enumerate(self.special_tokens)}  # string -> index
        self.itos = {i: tok for tok, i in self.stoi.items()}  # index -> string

        # Collect the set of unique characters across all texts
        chars = set()
        for text in texts:
            chars.update(text)

        # Assign IDs after the special tokens; sorting makes the mapping deterministic
        offset = len(self.stoi)
        for i, ch in enumerate(sorted(chars)):
            self.stoi[ch] = i + offset
            self.itos[i + offset] = ch

        self.vocab_size = len(self.stoi)

    def _char_id(self, ch):
        """Return the id of a single character, with a descriptive error if unknown."""
        try:
            return self.stoi[ch]
        except KeyError:
            raise KeyError(f"character {ch!r} is not in the tokenizer vocabulary") from None

    def encode(self, text, add_special_tokens=True):
        """Convert ``text`` into a list of token ids.

        Args:
            text: string to encode.
            add_special_tokens: when true, wrap the ids in ``<bos>`` ... ``<eos>``.

        Returns:
            list of int token ids.

        Raises:
            KeyError: if ``text`` contains a character missing from the vocabulary.
        """
        ids = []

        if add_special_tokens:
            ids.append(self.stoi["<bos>"])

        indent_id = self.stoi["<indent>"]
        n = len(text)
        i = 0
        while i < n:
            ch = text[i]
            if ch == " ":
                # A run of spaces is indentation only if it opens the text or follows a newline
                at_line_start = i == 0 or text[i - 1] == "\n"

                run_start = i
                while i < n and text[i] == " ":
                    i += 1
                count = i - run_start

                if at_line_start:
                    # Each full group of indent_spaces becomes one <indent>; the rest stay spaces
                    n_indents, count = divmod(count, self.indent_spaces)
                    ids.extend([indent_id] * n_indents)

                # Leftover (or mid-line) spaces are ordinary characters
                if count:
                    ids.extend([self._char_id(" ")] * count)
            else:
                ids.append(self._char_id(ch))
                i += 1

        if add_special_tokens:
            ids.append(self.stoi["<eos>"])

        return ids

    def decode(self, ids):
        """Convert token ids back into text.

        ``<pad>``, ``<bos>`` and ``<eos>`` are dropped, ``<indent>`` expands to
        ``indent_spaces`` spaces, and ids that are not in the vocabulary are
        skipped.

        Args:
            ids: iterable of token ids. Elements exposing ``.item()`` (scalar
                wrappers such as 0-d tensors) are unwrapped first, because such
                objects would otherwise never match the integer keys.

        Returns:
            the decoded string.
        """
        skipped = ("<bos>", "<eos>", "<pad>")
        indent_text = " " * self.indent_spaces

        pieces = []  # joined once at the end instead of repeated string concatenation
        for raw_id in ids:
            if hasattr(raw_id, "item"):
                raw_id = raw_id.item()
            token = self.itos.get(raw_id, "")
            if token in skipped:
                continue
            pieces.append(indent_text if token == "<indent>" else token)
        return "".join(pieces)


In [192]:
import pandas as pd
# Load the buggy/fixed code pairs; later cells read the "buggy_code" and "fixed_code" columns.
df = pd.read_csv("code_bug_fix_pairs.csv")

In [193]:
import re
def clean_code_logic(text):
    """Return ``text`` cut off at the first "# Sample ID" marker, whitespace-stripped.

    Args:
        text: a code snippet; anything that is not a ``str`` is treated as missing.

    Returns:
        The part of ``text`` before the marker (or all of it when the marker
        is absent) with surrounding whitespace removed, or "" for non-string input.
    """
    if isinstance(text, str):
        # partition() yields the whole string as the head when the marker is absent
        head, _, _ = text.partition("# Sample ID")
        return head.strip()
    return ""

# --- Step 1: add cleaned copies of both code columns to the DataFrame ---
print("Cleaning data and building custom vocabulary...")

df["buggy_clean"] = df["buggy_code"].apply(clean_code_logic)
df["fixed_clean"] = df["fixed_code"].apply(clean_code_logic)

# --- Step 2: pool every cleaned snippet into one list (buggy first, then fixed) ---
texts = [*df["buggy_clean"].tolist(), *df["fixed_clean"].tolist()]

print(f"Collected {len(texts)} cleaned code snippets.")

# --- Step 3: build the character vocabulary over the pooled snippets ---
tokenizer = CharTokenizer(texts)

print("Vocab size:", tokenizer.vocab_size)
print(list(tokenizer.stoi.items()))



Cleaning data and building custom vocabulary...
Collected 2000 cleaned code snippets.
Vocab size: 49
[('<pad>', 0), ('<bos>', 1), ('<eos>', 2), ('<indent>', 3), ('\n', 4), (' ', 5), ("'", 6), ('(', 7), (')', 8), ('*', 9), ('+', 10), (',', 11), ('-', 12), ('0', 13), ('1', 14), ('2', 15), ('3', 16), ('4', 17), ('5', 18), (':', 19), ('=', 20), ('>', 21), ('F', 22), ('H', 23), ('M', 24), ('T', 25), ('[', 26), (']', 27), ('_', 28), ('a', 29), ('b', 30), ('c', 31), ('d', 32), ('e', 33), ('f', 34), ('g', 35), ('h', 36), ('i', 37), ('l', 38), ('m', 39), ('n', 40), ('o', 41), ('p', 42), ('r', 43), ('s', 44), ('t', 45), ('u', 46), ('w', 47), ('x', 48)]


In [194]:
# Round-trip sanity check: encoding then decoding the first cleaned buggy
# snippet must reproduce it exactly.
sample = df.iloc[0]["buggy_clean"]
print("ORIGINAL:", repr(sample), sep="\n")

encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

print("ENCODED:", encoded[:50], sep="\n")  # at most the first 50 token ids
print("DECODED:", repr(decoded), sep="\n")

assert decoded == sample
print("Yippee reversiblityy")

ORIGINAL:
'x = [1, 2, 3]\nprint x'
ENCODED:
[1, 48, 5, 20, 5, 26, 14, 11, 5, 15, 11, 5, 16, 27, 4, 42, 43, 37, 40, 45, 5, 48, 2]
DECODED:
'x = [1, 2, 3]\nprint x'
Yippee reversiblityy


In [195]:
import torch
from torch.utils.data import Dataset



class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset over a single tokenized text.

    Each sample is an ``(input, target)`` pair of 1-D tensors of length
    ``max_length``, where the target is the input window shifted one token
    to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into windows.

        Args:
            txt: text to tokenize.
            tokenizer: object whose ``encode(txt)`` returns a list of token ids.
            max_length: number of tokens in every input/target window.
            stride: distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt)

        # A window may start only where a full shifted target still fits;
        # texts with no more than max_length tokens therefore yield no samples.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows produced from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [196]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0,
                         tokenizer=None):
    """Wrap ``txt`` in a ``GPTDatasetV1`` and return a ``DataLoader`` over it.

    Args:
        txt: text to tokenize and window.
        batch_size: number of windows per batch.
        max_length: window length in tokens.
        stride: offset between the starts of consecutive windows.
        shuffle: passed to ``DataLoader``.
        drop_last: passed to ``DataLoader``.
        num_workers: passed to ``DataLoader``.
        tokenizer: tokenizer to encode ``txt`` with. Pass an already-built
            tokenizer to avoid rebuilding the vocabulary on every call and to
            guarantee the ids match the ones used elsewhere. When ``None``, a
            ``CharTokenizer`` is built from the notebook-level ``texts`` list,
            which must already be defined.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding ``(inputs, targets)`` batches.
    """
    if tokenizer is None:
        # Backward-compatible fallback: relies on the global `texts` defined by an earlier cell
        tokenizer = CharTokenizer(texts)

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [197]:
# Non-overlapping 4-token windows over the first cleaned buggy snippet;
# drop_last=False keeps a final batch smaller than batch_size.
dataloader = create_dataloader_v1(
    df.iloc[0]["buggy_clean"],
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("inputs\n", inputs)
print("targets\n", targets)

inputs
 tensor([[ 1, 48,  5, 20],
        [ 5, 26, 14, 11],
        [ 5, 15, 11,  5],
        [16, 27,  4, 42],
        [43, 37, 40, 45]])
targets
 tensor([[48,  5, 20,  5],
        [26, 14, 11,  5],
        [15, 11,  5, 16],
        [27,  4, 42, 43],
        [37, 40, 45,  5]])


In [198]:
# Embedding table with one row per vocabulary entry.
vocab_size = tokenizer.vocab_size
output_dim = 2 * 256

torch.manual_seed(123)  # fixed seed so the initial weights are reproducible
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ..., -0.0315, -1.0640,  0.9417],
        [-1.3152, -0.0677, -0.1350,  ..., -0.4840, -0.2713, -0.0774],
        [ 0.5229,  0.1553,  0.5247,  ..., -0.4098,  0.4978, -0.3721],
        ...,
        [ 0.3152, -0.7450, -0.9576,  ..., -0.8263,  0.1970,  0.5988],
        [-0.8103, -0.3605, -0.7001,  ...,  0.4064, -0.4669,  0.4912],
        [ 0.6639, -2.1862,  0.0922,  ...,  1.2086, -0.4484,  0.0459]],
       requires_grad=True)


In [199]:



# Same snippet again, using the loader defaults for drop_last and num_workers;
# stride == max_length gives non-overlapping windows.
max_length = 4
dataloader = create_dataloader_v1(
    df.iloc[0]["buggy_clean"],
    batch_size=4,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("inputs\n", inputs)
print("targets\n", targets)

inputs
 tensor([[ 1, 48,  5, 20],
        [ 5, 26, 14, 11],
        [ 5, 15, 11,  5],
        [16, 27,  4, 42]])
targets
 tensor([[48,  5, 20,  5],
        [26, 14, 11,  5],
        [15, 11,  5, 16],
        [27,  4, 42, 43]])


In [200]:
# Look up one embedding vector for every token id in the `inputs` batch.
token_embeddings = embedding_layer(inputs)
# Bare last expression: the notebook displays the resulting shape.
token_embeddings.shape

torch.Size([4, 4, 512])