In [2]:
import sys
import os
import pandas as pd
import pickle

In [3]:
# ----------------------------
# 1. Add project root to path
# ----------------------------
# The parent of the current working directory is treated as the project root,
# which makes the `bpe` package importable from this notebook.
project_root = os.path.abspath("..")
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)
from bpe.bpe_tokenizer import BPETokenizer

In [4]:
# ----------------------------
# 2. Load cleaned dataset
# ----------------------------
# The path is relative to the working directory; the resulting frame `df` is
# what the sampling cell draws its rows from.
dataset_csv = "../data/clean_dataset.csv"
df = pd.read_csv(dataset_csv)

In [5]:
# ----------------------------
# 3. Use only a subset for faster training
# ----------------------------
# Upper bound on the number of rows used for tokenizer training.
# Start with 5000 rows; if still slow, try 1000
SAMPLE_SIZE = 5000

# DataFrame.sample draws without replacement by default and raises ValueError
# when asked for more rows than the frame holds, so cap the request at len(df).
# The fixed random_state keeps the subset reproducible across runs.
sample_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)

# astype(str) guarantees plain strings for the tokenizer.
# NOTE(review): it would also turn any missing value into the literal "nan";
# presumably the cleaned dataset has none in these columns — verify.
code_texts = sample_df['code'].astype(str).tolist()
doc_texts = sample_df['docstring'].astype(str).tolist()

In [6]:
# ----------------------------
# 4. Train BPE Tokenizers (fewer merges = faster)
# ----------------------------
# Both tokenizers are built with the same merge count; each is trained only on
# its own list of texts (code vs. docstrings).
merge_count = 500  # can later increase to 1000–2000

bpe_code = BPETokenizer(num_merges=merge_count)
bpe_code.train(code_texts)

bpe_doc = BPETokenizer(num_merges=merge_count)
bpe_doc.train(doc_texts)

In [7]:
# ----------------------------
# 5. Save models for later LM training
# ----------------------------
# Each trained tokenizer object is pickled whole to its own file, code
# tokenizer first, then the docstring tokenizer.
for pickle_path, tokenizer in (
    ("../bpe/bpe_code.pkl", bpe_code),
    ("../bpe/bpe_doc.pkl", bpe_doc),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(tokenizer, f)

In [8]:
# ----------------------------
# 6. Test encoding/decoding
# ----------------------------
# Round-trip a short snippet through the code tokenizer: encode to tokens,
# decode back to text, and show all three stages.
sample_code = "def add_numbers(a, b): return a+b"
encoded = bpe_code.encode(sample_code)
decoded = bpe_code.decode(encoded)

print("Original:", sample_code)
print("Encoded:", encoded)
print("Decoded:", decoded)

# Make the check explicit instead of relying on eyeballing the printed output:
# the cell fails if decoding does not reproduce the input exactly. Placed after
# the prints so the diagnostics are visible when it fails.
assert decoded == sample_code, "BPE round trip did not reproduce the input text"

Original: def add_numbers(a, b): return a+b
Encoded: ['def', ' ', 'ad', 'd_', 'n', 'um', 'b', 'er', 's', '(', 'a', ',', ' ', 'b', '):', ' ', 'return', ' ', 'a', '+', 'b', '</w>']
Decoded: def add_numbers(a, b): return a+b
