In [1]:
pip install -U transformers datasets peft accelerate sacrebleu evaluate tokenizers


Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting peft
  Downloading peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting tokenizers
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.met

In [2]:
import os, json, random, glob, torch
random.seed(42)
os.environ["WANDB_DISABLED"] = "true"
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
import sacrebleu

2025-11-01 17:03:37.856154: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762016618.054654      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762016618.109530      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import json, glob, random
random.seed(42)
candidates = glob.glob("/kaggle/input/**/sourceData.jsonl", recursive=True) + glob.glob("sourceData.jsonl")
if not candidates: raise FileNotFoundError("upload sourceData.jsonl")
src = candidates[0]
records = []
with open(src, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            try:
                o = json.loads(line)
                if "pseudo" in o and "code" in o: records.append(o)
            except: pass
random.shuffle(records)
n = len(records)
split = min(2000, int(0.8 * n))
train, val = records[:split], records[split:split+500]
train_file = "/kaggle/working/train.jsonl"
eval_file = "/kaggle/working/eval.jsonl"
with open(train_file, "w", encoding="utf-8") as f:
    for r in train: f.write(json.dumps(r) + "\n")
with open(eval_file, "w", encoding="utf-8") as f:
    for r in val: f.write(json.dumps(r) + "\n")
print("train:", len(train), "eval:", len(val))

train: 2000 eval: 500


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=False
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 1,622,016 || all params: 126,061,824 || trainable%: 1.2867




In [5]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

output_dir = "/kaggle/working/gpt2-lora-improved"

def tokenize_function(examples):
    texts = [f"<|pseudocode|>{p}<|code|>{c}<|end|>" for p, c in zip(examples["pseudo"], examples["code"])]
    return tokenizer(texts, truncation=True, max_length=384, padding="max_length")

dataset = load_dataset("json", data_files={"train": train_file, "eval": eval_file})
tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    logging_dir="/kaggle/working/logs",
    warmup_steps=50,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

os.environ["WANDB_DISABLED"] = "true"

trainer.train()
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"GPT-2 model saved to {output_dir}")

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,1.411,1.210845
400,1.2309,1.076782
600,1.1481,1.03564


GPT-2 model saved to /kaggle/working/gpt2-lora-improved


In [6]:
import sacrebleu, json, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir).to("cuda" if torch.cuda.is_available() else "cpu")
print("Working on calculating BLEU")
hyps, refs = [], []
with open(eval_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 50: break
        ex = json.loads(line)
        prompt = "<|pseudocode|>" + ex["pseudo"].strip() + "<|code|>"
        ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
        out = model.generate(ids, max_length=256, num_return_sequences=1, do_sample=False, pad_token_id=tokenizer.eos_token_id)
        pred = tokenizer.decode(out[0], skip_special_tokens=True)
        gen = pred.split("<|code|>",1)[1].strip() if "<|code|>" in pred else pred.strip()
        hyps.append(gen)
        refs.append([ex["code"].strip()])

bleu = sacrebleu.corpus_bleu(hyps, refs)
print("BLEU:", bleu.score)
with open("/kaggle/working/predictions.jsonl","w") as f:
    for r,h in zip(refs,hyps): f.write(json.dumps({"reference":r[0],"prediction":h})+"\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Working on calculating BLEU
BLEU: 33.542066273429306


# Testing our model

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "/kaggle/working/gpt2-lora-improved"

tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir).to("cuda" if torch.cuda.is_available() else "cpu")

def generate_code(pseudocode, max_length=256, temperature=0.7):
    prompt = f"<|pseudocode|>{pseudocode.strip()}<|code|>"
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        inputs,
        max_length=max_length,
        temperature=temperature,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=False)
    # Stop at the first <|end|> token
    if "<|end|>" in generated:
        generated = generated.split("<|end|>")[0]

    # Extract only code part
    code = generated.split("<|code|>", 1)[1].strip() if "<|code|>" in generated else generated.strip()

    return code


# ==== TEST EXAMPLES ====
examples = [
    ("Factorial Function", """
function to calculate factorial of a number
input: n (integer)
if n is 0 or 1, return 1
else multiply n by factorial of n-1
"""),
    ("Find Maximum", """
function to find maximum element in a list
input: arr (list of numbers)
initialize max_val to first element
loop through each element in arr
if element is greater than max_val, update max_val
return max_val
"""),
    ("Palindrome Check", """
function to check if a string is palindrome
input: s (string)
reverse the string and store in reversed_s
if s equals reversed_s, return True
else return False
""")
]

for title, pseudo in examples:
    print("=" * 50)
    print(f"TEST EXAMPLE: {title}")
    print("=" * 50)
    print("PSEUDOCODE:")
    print(pseudo)
    print("\nGENERATED CODE:")
    print(generate_code(pseudo))
    print()


TEST EXAMPLE: Factorial Function
PSEUDOCODE:

function to calculate factorial of a number
input: n (integer)
if n is 0 or 1, return 1
else multiply n by factorial of n-1


GENERATED CODE:
def main():
    if n == 0 or 1:
             return 1
            else:
                n - 1

TEST EXAMPLE: Find Maximum
PSEUDOCODE:

function to find maximum element in a list
input: arr (list of numbers)
initialize max_val to first element
loop through each element in arr
if element is greater than max_val, update max_val
return max_val


GENERATED CODE:
def main():
      = 0
     max_val = 0
      arr(int, input().split())
      max_val = input().split()
     if max_val == max_val:
            return max_val

TEST EXAMPLE: Palindrome Check
PSEUDOCODE:

function to check if a string is palindrome
input: s (string)
reverse the string and store in reversed_s
if s equals reversed_s, return True
else return False


GENERATED CODE:
def main():
     = 0
     s = input().split()
     s = reverse().split()

In [11]:
import shutil

model_dir_gpt2 = "/kaggle/working/gpt2-lora-improved"
zip_path_gpt2 = "/kaggle/working/gpt2-lora-improved"
shutil.make_archive(zip_path_gpt2, 'zip', model_dir_gpt2)

print(f"GPT-2 model zipped: {zip_path_gpt2}.zip")


GPT-2 model zipped: /kaggle/working/gpt2-lora-improved.zip


# Fine Tuning CodeParrrot

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "codeparrot/codeparrot-small"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=False
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/457M [00:00<?, ?B/s]

trainable params: 1,622,016 || all params: 112,630,272 || trainable%: 1.4401




In [9]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

output_dir_codeparrot = "/kaggle/working/codeparrot-lora-improved"

def tokenize_function(examples):
    texts = [f"<|pseudocode|>{p}<|code|>{c}<|end|>" for p, c in zip(examples["pseudo"], examples["code"])]
    return tokenizer(texts, truncation=True, max_length=384, padding="max_length")

dataset = load_dataset("json", data_files={"train": train_file, "eval": eval_file})
tokenized = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args = TrainingArguments(
    output_dir=output_dir_codeparrot,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    fp16=True,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    logging_dir="/kaggle/working/logs_codeparrot",
    warmup_steps=50,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
model.save_pretrained(output_dir_codeparrot)
tokenizer.save_pretrained(output_dir_codeparrot)
print(f"CodeParrot model saved to {output_dir_codeparrot}")


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 0, 'bos_token_id': 0, 'pad_token_id': 0}.


Step,Training Loss,Validation Loss
200,1.7688,1.521128
400,1.4802,1.32333
600,1.3915,1.256053


CodeParrot model saved to /kaggle/working/codeparrot-lora-improved


In [16]:
import json, time, torch, sacrebleu
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

device = "cuda" if torch.cuda.is_available() else "cpu"
base = "codeparrot/codeparrot-small"
adapter_dir = "/kaggle/working/codeparrot-lora-improved"
eval_file = "/kaggle/working/eval.jsonl"

tokenizer = AutoTokenizer.from_pretrained(base)
base_model = AutoModelForCausalLM.from_pretrained(base).to(device)
model = PeftModel.from_pretrained(base_model, adapter_dir).to(device)
model.eval()

hyps, refs = [], []
with open(eval_file, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if not line.strip(): continue
        ex = json.loads(line)
        prompt = f"<|pseudocode|>{ex['pseudo'].strip()}<|code|>"
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=384).to(device)

        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        pred = tokenizer.decode(out[0], skip_special_tokens=False)
        if "<|end|>" in pred: pred = pred.split("<|end|>")[0]
        gen = pred.split("<|code|>", 1)[1].strip() if "<|code|>" in pred else pred.strip()

        hyps.append(gen)
        refs.append(ex.get("code","").strip())

        if (i+1) % 10 == 0:
            print(f"[{i+1}] examples processed", flush=True)
        if i >= 99:
            break

bleu = sacrebleu.corpus_bleu(hyps, [refs])
print("BLEU:", bleu.score)

with open("/kaggle/working/codeparrot_predictions.jsonl", "w", encoding="utf-8") as out:
    for r, h in zip(refs, hyps):
        out.write(json.dumps({"reference": r, "prediction": h}, ensure_ascii=False) + "\n")
print("Predictions saved to /kaggle/working/codeparrot_predictions.jsonl")


[10] examples processed
[20] examples processed
[30] examples processed
[40] examples processed
[50] examples processed
[60] examples processed
[70] examples processed
[80] examples processed
[90] examples processed
[100] examples processed
BLEU: 40.26321037762566
Predictions saved to /kaggle/working/codeparrot_predictions.jsonl


In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

device = "cuda" if torch.cuda.is_available() else "cpu"
output_dir_codeparrot = "/kaggle/working/codeparrot-lora-improved"

tokenizer = AutoTokenizer.from_pretrained(output_dir_codeparrot)
model = AutoModelForCausalLM.from_pretrained(output_dir_codeparrot).to(device)

def generate_code(pseudocode, max_length=384):
    prompt = f"<|pseudocode|>{pseudocode.strip()}<|code|>"
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs,
        max_length=max_length,
        temperature=0.7,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    gen = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Stop at first <|end|> if exists
    if "<|end|>" in gen:
        gen = gen.split("<|end|>")[0]

    #Extract only the part after <|code|>
    code = gen.split("<|code|>", 1)[1].strip() if "<|code|>" in gen else gen.strip()
    return code


# ==== TEST CASES ====
tests = {
    "Factorial": """function to calculate factorial of a number
input: n
if n is 0 or 1 return 1
else return n * factorial(n-1)""",

    "Find Maximum": """function to find maximum in list
input: arr
initialize max to first element
loop through arr
if element > max update max
return max""",

    "Palindrome Check": """function to check palindrome
input: s
reverse the string
if s equals reversed return True
else return False"""
}

for name, pseudo in tests.items():
    print("\n" + "="*60)
    print(f"TEST: {name}")
    print("="*60)
    print(generate_code(pseudo))



TEST: Factorial
def factorial(n):
    if n == 0 or n == 1:
        return 1
    else:
        return n * factorial(n-1)

TEST: Find Maximum
def maxmaxmax():
    max = 0
    maxmax = 0
    maxmax = 0
    maxmax = 0
    maxmax = 0
    maxmax = 0
    maxmax = 0
    maxmax = maxmax()
    maxmax = maxmax()
    maxmax = maxmax()
    maxmax = maxmax()
    maxmax = maxmax()
    maxmax = maxmax()

TEST: Palindrome Check
def palindrome(s):
    if s == reversed(s):
        return True
        else:
            return False


In [14]:
import shutil
model_dir = "/kaggle/working/codeparrot-lora-improved"
zip_path = "/kaggle/working/codeparrot-lora-improved"
shutil.make_archive(zip_path, 'zip', model_dir)
print(f"CodeParrot model zipped: {zip_path}.zip")


CodeParrot model zipped: /kaggle/working/codeparrot-lora-improved.zip


In [None]:
print("\n" + "="*80)
print("COMPARISON SUMMARY")
