## Dataset

In [60]:
!jupyter nbconvert --to html tokenizing.ipynb
!open tokenizing.html


[NbConvertApp] Converting notebook tokenizing.ipynb to html
  return _pygments_highlight(
[NbConvertApp] Writing 360122 bytes to tokenizing.html


In [6]:
import pandas as pd
data = pd.read_csv("code_bug_fix_pairs.csv")
data.head()

Unnamed: 0,id,buggy_code,fixed_code,commit_message,commit_url,date
0,1,"x = [1, 2, 3]\nprint x\n# Sample ID: 1","x = [1, 2, 3]\nprint(x)\n# Sample ID: 1",Improved readability with proper indentation,https://github.com/open-source-repo/commit/a5a...,2024-12-16
1,2,"list = [1, 2, 3, 4]\nfor i in list\n print(...","lst = [1, 2, 3, 4]\nfor i in lst:\n print(i...",Corrected conditional operator mistake,https://github.com/open-source-repo/commit/f47...,2024-01-03
2,3,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...,Resolved off-by-one error in loop,https://github.com/open-source-repo/commit/e89...,2023-09-05
3,4,def foo()\n print('Missing colon in functio...,def foo():\n print('Fixed missing colon in ...,Added missing parentheses for print function,https://github.com/open-source-repo/commit/bd7...,2024-09-15
4,5,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...,Fixed bug in recursive function call,https://github.com/open-source-repo/commit/d66...,2024-01-24


## Tokenising

In [2]:
class CharTokenizer:
    def __init__(self, txt, indent_spaces=4):
        self.indent_spaces = indent_spaces

        # Special tokens (fixed IDs)
        self.special_tokens = ["<pad>", "<bos>", "<eos>", "<indent>","<sep>"] # padding, beginning of sequence, end of sequence, indent
        self.str = {tok: i for i, tok in enumerate(self.special_tokens)} # map string to index
        self.itm = {i: tok for tok, i in self.str.items()} # map index to string

        # Collect characters
        chars = set() # set of unique characters
        for text in txt:
            chars.update(text) # add all the unique characters

        # Assign IDs
        offset = len(self.str) # offset by 4, ie numbers taken up by special tokens already
        for i, ch in enumerate(sorted(chars)): 
            self.str[ch] = i + offset # map the characters to index
            self.itm[i + offset] = ch # reverse map

        self.vocab_size = len(self.str)

    def encode(self, text, add_special_tokens=True):
        ids = []

        if add_special_tokens:
            ids.append(self.str["<bos>"])

        i = 0
        while i < len(text):
            # Handle indentation (only at line start)
            if text[i] == " ":
                count = 0
                while i < len(text) and text[i] == " ":
                    count += 1
                    i += 1
                # you kinda reverse engineer from the amount of spaces how many indents there are

                while count >= self.indent_spaces: # when count bigger than 4 it counts as an indent
                    ids.append(self.str["<indent>"]) # add token for indent
                    count -= self.indent_spaces # reduce count by 4

                # leftover spaces
                ids.extend([self.str[" "]] * count) # add remaining spaces as formatting spaces basically
            else:
                ids.append(self.str[text[i]])
                i += 1

        if add_special_tokens:
            ids.append(self.str["<eos>"])

        return ids

    def decode(self, ids):
        text = "" #initialize string
        for i in ids:
            token = self.itm.get(i, "") # get the token from ids (the index)
            if token == "<bos>" or token == "<eos>" or token == "<pad>":
                continue
            elif token == "<indent>":
                text += " " * self.indent_spaces #. add 4 spaces if there was an indent
            else:
                text += token #just add the token to the string
        return text


In [3]:
import pandas as pd
df = pd.read_csv("code_bug_fix_pairs.csv")

In [9]:

def clean_code_logic(text):
    if not isinstance(text, str):
            return ""

    marker = "# Sample ID"
    index = text.find(marker)

    if index == -1:
        return text.strip()

    return text[:index].strip()

# --- Step 1: Clean the DataFrame first ---
print("Cleaning data and building custom vocabulary...")

df['buggy_clean'] = df['buggy_code'].apply(clean_code_logic)
df['fixed_clean'] = df['fixed_code'].apply(clean_code_logic)

# --- Step 2: Gather cleaned tokens into a list ---
# Using .tolist() is much faster than iter rows()
texts = df['buggy_clean'].tolist() + df['fixed_clean'].tolist()

print(f"Collected {len(texts)} cleaned code snippets.")



tokenizer = CharTokenizer(texts)

print("Vocab size:", tokenizer.vocab_size)
print(list(tokenizer.str.items()))



Cleaning data and building custom vocabulary...
Collected 2000 cleaned code snippets.
Vocab size: 50
[('<pad>', 0), ('<bos>', 1), ('<eos>', 2), ('<indent>', 3), ('<sep>', 4), ('\n', 5), (' ', 6), ("'", 7), ('(', 8), (')', 9), ('*', 10), ('+', 11), (',', 12), ('-', 13), ('0', 14), ('1', 15), ('2', 16), ('3', 17), ('4', 18), ('5', 19), (':', 20), ('=', 21), ('>', 22), ('F', 23), ('H', 24), ('M', 25), ('T', 26), ('[', 27), (']', 28), ('_', 29), ('a', 30), ('b', 31), ('c', 32), ('d', 33), ('e', 34), ('f', 35), ('g', 36), ('h', 37), ('i', 38), ('l', 39), ('m', 40), ('n', 41), ('o', 42), ('p', 43), ('r', 44), ('s', 45), ('t', 46), ('u', 47), ('w', 48), ('x', 49)]


In [5]:
sample = df.iloc[0]["buggy_clean"]
print("ORIGINAL:")
print(repr(sample))

encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

print("ENCODED:")
print(encoded[:50])  # print first 50 tokens
print("DECODED:")
print(repr(decoded))


assert decoded == sample
print("Decodes tokens back into text!")

ORIGINAL:
'x = [1, 2, 3]\nprint x'
ENCODED:
[1, 48, 5, 20, 5, 26, 14, 11, 5, 15, 11, 5, 16, 27, 4, 42, 43, 37, 40, 45, 5, 48, 2]
DECODED:
'x = [1, 2, 3]\nprint x'
Decodes tokens back into text!


## Embeddings Layer


In [66]:
print(df.columns.tolist())
df.head(3)


['id', 'buggy_code', 'fixed_code', 'commit_message', 'commit_url', 'date', 'buggy_clean', 'fixed_clean']


Unnamed: 0,id,buggy_code,fixed_code,commit_message,commit_url,date,buggy_clean,fixed_clean
0,1,"x = [1, 2, 3]\nprint x\n# Sample ID: 1","x = [1, 2, 3]\nprint(x)\n# Sample ID: 1",Improved readability with proper indentation,https://github.com/open-source-repo/commit/a5a...,2024-12-16,"x = [1, 2, 3]\nprint x","x = [1, 2, 3]\nprint(x)"
1,2,"list = [1, 2, 3, 4]\nfor i in list\n print(...","lst = [1, 2, 3, 4]\nfor i in lst:\n print(i...",Corrected conditional operator mistake,https://github.com/open-source-repo/commit/f47...,2024-01-03,"list = [1, 2, 3, 4]\nfor i in list\n print(i)","lst = [1, 2, 3, 4]\nfor i in lst:\n print(i)"
2,3,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...,Resolved off-by-one error in loop,https://github.com/open-source-repo/commit/e89...,2023-09-05,def factorial(n):\n if n == 1\n retu...,def factorial(n):\n if n == 1:\n ret...


In [103]:
import torch
import torch.nn as nn

d_model = 256

PAD_ID = tokenizer.str["<pad>"]
BOS_ID = tokenizer.str["<bos>"]
EOS_ID = tokenizer.str["<eos>"]

# ADD THIS TOKEN IN YOUR TOKENIZER SPECIAL TOKENS
SEP_ID = tokenizer.str["<sep>"]  # must exist

token_embedding = nn.Embedding(
    num_embeddings=tokenizer.vocab_size,
    embedding_dim=d_model,
    padding_idx=PAD_ID
)

# Optional: force PAD row to zeros
with torch.no_grad():
    token_embedding.weight[PAD_ID].zero_()

print("Vocab size:", tokenizer.vocab_size)
print("Embedding dim:", d_model)
print("PAD_ID:", PAD_ID, "SEP_ID:", SEP_ID)

# ---- USE REAL DATASET HERE ----
sample_buggy = df.iloc[0]["buggy_code"]
sample_clean = df.iloc[0]["fixed_code"]

buggy_ids = tokenizer.encode(sample_buggy, add_special_tokens=False)
clean_ids = tokenizer.encode(sample_clean, add_special_tokens=False)

# Build decoder-only correction sequence: <bos> buggy <sep> clean <eos>
ids = [BOS_ID] + buggy_ids + [SEP_ID] + clean_ids + [EOS_ID]

x = torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # (1, T)
emb = token_embedding(x)

print("Sequence length:", len(ids))
print("x shape:", x.shape)
print("emb shape:", emb.shape)
print(emb[0, :5, :8])

# Check sep exists and roughly where it is
sep_pos = (x[0] == SEP_ID).nonzero(as_tuple=True)[0].item()
print("SEP position:", sep_pos, "=> buggy_len:", len(buggy_ids), "clean_len:", len(clean_ids))
print("Expected sep_pos:", 1 + len(buggy_ids), "Actual:", sep_pos)


pad_norm = token_embedding.weight[PAD_ID].norm().item()
print("PAD embedding norm:", pad_norm)

Vocab size: 50
Embedding dim: 256
PAD_ID: 0 SEP_ID: 4
Sequence length: 68
x shape: torch.Size([1, 68])
emb shape: torch.Size([1, 68, 256])
tensor([[ 1.0598, -0.4031, -1.7674,  0.2216,  0.4563,  0.7601,  1.5383,  0.1208],
        [ 0.9320, -0.3950, -0.3423,  1.1899, -0.0314,  0.3191,  2.6929, -0.7372],
        [-0.0220, -1.2752,  0.8734,  0.3941, -0.5526, -0.0518, -0.7748,  0.9775],
        [ 1.3041, -1.1076, -0.1412,  0.0630,  1.7751, -0.2507,  1.2804,  1.7896],
        [-0.0220, -1.2752,  0.8734,  0.3941, -0.5526, -0.0518, -0.7748,  0.9775]],
       grad_fn=<SliceBackward0>)
SEP position: 33 => buggy_len: 32 clean_len: 33
Expected sep_pos: 33 Actual: 33
PAD embedding norm: 0.0


In [71]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.utils.rnn as rnn

PAD_ID = tokenizer.str["<pad>"]
BOS_ID = tokenizer.str["<bos>"]
EOS_ID = tokenizer.str["<eos>"]
SEP_ID = tokenizer.str["<sep>"]

class SyntaxCorrectionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256, buggy_col="buggy_clean", fixed_col="fixed_clean"):
        self.df = df.reset_index(drop=True)
        self.tok = tokenizer
        self.max_len = max_len
        self.buggy_col = buggy_col
        self.fixed_col = fixed_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        buggy = self.df.iloc[idx][self.buggy_col]
        fixed = self.df.iloc[idx][self.fixed_col]

        buggy_ids = self.tok.encode(buggy, add_special_tokens=False)
        fixed_ids = self.tok.encode(fixed, add_special_tokens=False)

        ids = [BOS_ID] + buggy_ids + [SEP_ID] + fixed_ids + [EOS_ID]

        # truncate
        ids = ids[: self.max_len]

        # labels: ignore everything up to SEP (inclusive)
        labels = ids.copy()
        if SEP_ID in ids:
            sep_pos = ids.index(SEP_ID)
            labels[:sep_pos + 1] = [-100] * (sep_pos + 1)
        else:
            labels = [-100] * len(ids)

        return torch.tensor(ids, dtype=torch.long), torch.tensor(labels, dtype=torch.long)

#below function pads variable-length sequences, masks padding tokens, and prevents padded positions from contributing to the loss

def collate_fn(batch):
    input_ids, labels = zip(*batch)

    input_ids = rnn.pad_sequence(input_ids, batch_first=True, padding_value=PAD_ID)
    labels = rnn.pad_sequence(labels, batch_first=True, padding_value=-100)

    attention_mask = (input_ids != PAD_ID).long()  # 1=real, 0=pad
    return input_ids, labels, attention_mask

train_ds = SyntaxCorrectionDataset(df, tokenizer, max_len=256)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=collate_fn, drop_last=True)

# --- sanity check one batch ---
input_ids, labels, attention_mask = next(iter(train_loader))

print("input_ids:", input_ids.shape)
print("labels:", labels.shape)
print("attention_mask:", attention_mask.shape)

# Verify: labels ignored before SEP for the first item
first = 0
sep_positions = (input_ids[first] == SEP_ID).nonzero(as_tuple=True)[0]
print("SEP positions (first item):", sep_positions.tolist())

if len(sep_positions) > 0:
    s = sep_positions[0].item()
    print("Ignored count up to SEP:", (labels[first, :s+1] == -100).sum().item(), "expected", s+1)
    print("First supervised label index:", (labels[first] != -100).nonzero(as_tuple=True)[0][0].item())


input_ids: torch.Size([16, 152])
labels: torch.Size([16, 152])
attention_mask: torch.Size([16, 152])
SEP positions (first item): [42]
Ignored count up to SEP: 43 expected 43
First supervised label index: 43


continue with mode.forward()