Import necessary libraries

In [4]:
import tensorflow as tf 
import pandas as pd
import re
from tensorflow.keras import layers
import numpy as np

Jude's code: tokenizer from scratch

In [2]:
class CharTokenizer:
    """Character-level tokenizer: text -> list of integer IDs and back.

    IDs 0-3 are reserved for the special tokens ``<pad>``, ``<bos>``,
    ``<eos>`` and ``<indent>``. Every distinct character found in the
    texts passed to the constructor gets the next free ID, assigned in
    sorted character order.

    Attributes:
        indent_spaces: number of consecutive spaces represented by one
            ``<indent>`` token.
        special_tokens: the reserved token strings, in ID order.
        stoi: mapping token/character -> ID.
        itos: mapping ID -> token/character.
        vocab_size: total number of entries in ``stoi``.
    """

    def __init__(self, texts, indent_spaces=4):
        """Build the vocabulary from an iterable of strings.

        Args:
            texts: iterable of strings; each distinct character gets an ID.
            indent_spaces: how many consecutive spaces collapse into a
                single ``<indent>`` token. Must be at least 1.

        Raises:
            ValueError: if ``indent_spaces`` is smaller than 1. A value of
                0 (or below) would make ``encode`` emit ``<indent>``
                tokens forever on the first space it meets.
        """
        if indent_spaces < 1:
            raise ValueError(
                f"indent_spaces must be >= 1, got {indent_spaces!r}"
            )
        self.indent_spaces = indent_spaces

        # Special tokens (fixed IDs): padding, beginning of sequence,
        # end of sequence, indent
        self.special_tokens = ["<pad>", "<bos>", "<eos>", "<indent>"]
        self.stoi = {tok: i for i, tok in enumerate(self.special_tokens)}
        self.itos = {i: tok for tok, i in self.stoi.items()}

        # Collect the set of unique characters across all texts
        chars = set()
        for text in texts:
            chars.update(text)

        # Character IDs start right after the special tokens; sorting makes
        # the assignment deterministic for a given set of characters.
        for idx, ch in enumerate(sorted(chars), start=len(self.stoi)):
            self.stoi[ch] = idx
            self.itos[idx] = ch

        self.vocab_size = len(self.stoi)

    def encode(self, text, add_special_tokens=True):
        """Convert ``text`` into a list of token IDs.

        Every run of consecutive spaces -- anywhere in the text, not only
        at the start of a line -- is split into as many ``<indent>``
        tokens as fit, followed by the leftover spaces as plain space
        characters.

        Args:
            text: the string to encode.
            add_special_tokens: when true, wrap the result in ``<bos>``
                and ``<eos>``.

        Returns:
            list of int token IDs.

        Raises:
            KeyError: if ``text`` contains a character that is not in the
                vocabulary.
        """
        ids = []

        if add_special_tokens:
            ids.append(self.stoi["<bos>"])

        i = 0
        length = len(text)
        while i < length:
            if text[i] == " ":
                # Measure the whole run of spaces starting here
                run_start = i
                while i < length and text[i] == " ":
                    i += 1

                n_indents, n_spaces = divmod(i - run_start, self.indent_spaces)
                ids.extend([self.stoi["<indent>"]] * n_indents)

                # Only look up " " when leftover spaces exist, so a run that
                # is an exact multiple of indent_spaces does not require the
                # space character to be in the vocabulary.
                if n_spaces:
                    ids.extend([self.stoi[" "]] * n_spaces)
            else:
                ids.append(self.stoi[text[i]])
                i += 1

        if add_special_tokens:
            ids.append(self.stoi["<eos>"])

        return ids

    def decode(self, ids):
        """Convert an iterable of token IDs back into text.

        ``<pad>``, ``<bos>`` and ``<eos>`` are dropped, ``<indent>``
        expands to ``indent_spaces`` spaces, and IDs missing from the
        vocabulary are silently skipped.

        Args:
            ids: iterable of token IDs.

        Returns:
            the decoded string.
        """
        dropped = ("<bos>", "<eos>", "<pad>")
        indent = " " * self.indent_spaces

        pieces = []  # joined once at the end instead of repeated string +=
        for token_id in ids:
            token = self.itos.get(token_id, "")
            if token in dropped:
                continue
            pieces.append(indent if token == "<indent>" else token)
        return "".join(pieces)

def clean_code_logic(text):
    """Return ``text`` cut off at the first "# Sample ID" marker, stripped.

    Anything from the marker onward is discarded and surrounding
    whitespace is removed from what remains. If the marker does not
    occur, the whole text is kept (still stripped). Non-string input
    yields an empty string.
    """
    if isinstance(text, str):
        # partition() leaves the full string in the first slot when the
        # marker is absent, so both cases share one code path.
        kept, _, _ = text.partition("# Sample ID")
        return kept.strip()

    return ""

# --- Execution ---

# Read the buggy/fixed snippet pairs (path is relative to the working directory)
df = pd.read_csv("code_bug_fix_pairs.csv")

# Step 1: Clean both code columns before the vocabulary is built
print("Cleaning data and building custom vocabulary...")

df = df.assign(
    buggy_clean=df["buggy_code"].apply(clean_code_logic),
    fixed_clean=df["fixed_code"].apply(clean_code_logic),
)

# Step 2: Pool all cleaned snippets into one list (buggy first, then fixed)
texts = [*df["buggy_clean"], *df["fixed_clean"]]
print(f"Collected {len(texts)} cleaned code snippets.")

# Build the tokenizer vocabulary from every cleaned snippet
tokenizer = CharTokenizer(texts)

print("Vocab size:", tokenizer.vocab_size)
print(list(tokenizer.stoi.items()))

# Step 3: Round-trip check -- encoding then decoding the first cleaned
# buggy snippet must give back exactly the same string
sample = df["buggy_clean"].iloc[0]
print("\nORIGINAL:", repr(sample), sep="\n")

encoded = tokenizer.encode(sample)
decoded = tokenizer.decode(encoded)

print("\nENCODED (First 50 tokens):", encoded[:50], sep="\n")

print("\nDECODED:", repr(decoded), sep="\n")

assert decoded == sample
print("\nYippee reversiblityy")

Cleaning data and building custom vocabulary...
Collected 2000 cleaned code snippets.
Vocab size: 49
[('<pad>', 0), ('<bos>', 1), ('<eos>', 2), ('<indent>', 3), ('\n', 4), (' ', 5), ("'", 6), ('(', 7), (')', 8), ('*', 9), ('+', 10), (',', 11), ('-', 12), ('0', 13), ('1', 14), ('2', 15), ('3', 16), ('4', 17), ('5', 18), (':', 19), ('=', 20), ('>', 21), ('F', 22), ('H', 23), ('M', 24), ('T', 25), ('[', 26), (']', 27), ('_', 28), ('a', 29), ('b', 30), ('c', 31), ('d', 32), ('e', 33), ('f', 34), ('g', 35), ('h', 36), ('i', 37), ('l', 38), ('m', 39), ('n', 40), ('o', 41), ('p', 42), ('r', 43), ('s', 44), ('t', 45), ('u', 46), ('w', 47), ('x', 48)]

ORIGINAL:
'x = [1, 2, 3]\nprint x'

ENCODED (First 50 tokens):
[1, 48, 5, 20, 5, 26, 14, 11, 5, 15, 11, 5, 16, 27, 4, 42, 43, 37, 40, 45, 5, 48, 2]

DECODED:
'x = [1, 2, 3]\nprint x'

Yippee reversiblityy


Embedding process: turn the integer token IDs into dense, high-dimensional vectors that represent the meaning of the characters. We use an embedding layer for this and then pass the embedded sequence through a GRU layer.

In [None]:
# Hyperparameters for the demo layers
vocab_size = tokenizer.vocab_size  # one embedding row per token ID
embedding_dim = 128  # number of values in each character vector
rnn_units = 256  # number of units in the GRU

# Embedding layer: looks up a vector of size embedding_dim for each token ID
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

# GRU layer: returns the output at every time step (return_sequences) and,
# separately, its final hidden state (return_state)
rnn_layer = layers.GRU(units=rnn_units, return_sequences=True, return_state=True)

# Example input: wrapping the encoded sample in a list gives a batch of size 1
sample_ids = tf.constant([encoded])
embedded_output = embedding_layer(sample_ids)  # IDs -> vectors
rnn_output, rnn_state = rnn_layer(embedded_output)  # vectors -> GRU outputs + final state

# Shape is (batch size, sequence length in tokens, embedding dimension)
print("\nEMBEDDING OUTPUT SHAPE:", embedded_output.shape)


EMBEDDING OUTPUT SHAPE: (1, 23, 128)
