In [1]:
#Install transformers and torch
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate -q

Looking in indexes: https://download.pytorch.org/whl/cu124


In [2]:
#Load tokenizer
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


In [3]:
from datasets import Dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from evaluate import load
from tqdm import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
#Load Dataset
train_df = pd.read_csv("ft_train.csv")
val_df = pd.read_csv("ft_valid.csv")
test_df = pd.read_csv("ft_test.csv")

train_df = train_df.head(50)
val_df = val_df.head(50)
test_df = test_df.head(50)

print(train_df.columns)
print(len(train_df))
print(val_df.columns)
print(len(val_df))
print(test_df.columns)
print(len(test_df))

Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
50
Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
50
Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
50


In [19]:
#Preprocessing, creating the mask
MASK_TOKEN = "<mask>"

def mask_if_condition(row):
    original_code = row["cleaned_method"]
    target = row["target_block"]
    masked_code = original_code.replace(target, MASK_TOKEN)
    return masked_code

train_df["masked_method"] = train_df.apply(mask_if_condition, axis=1)
val_df["masked_method"] = val_df.apply(mask_if_condition, axis=1)
test_df["masked_method"] = test_df.apply(mask_if_condition, axis=1)

train_df = train_df[["masked_method", "target_block"]]
val_df = val_df[["masked_method", "target_block"]]
test_df = test_df[["masked_method", "target_block"]]

model_checkpoint = "Salesforce/codet5-small"
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["masked_method"], max_length=512, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_block"], max_length=128, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]



Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [20]:
#Training Setup
training_args = TrainingArguments(
    output_dir="./codeT5-if-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [21]:
#Training the Model
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,1.968799
2,No log,1.336459
3,No log,0.636953
4,No log,0.168401
5,No log,0.071818


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=65, training_loss=1.5324284480168269, metrics={'train_runtime': 998.8132, 'train_samples_per_second': 0.25, 'train_steps_per_second': 0.065, 'total_flos': 33835450368000.0, 'train_loss': 1.5324284480168269, 'epoch': 5.0})

In [22]:
#Evaluating the Model
metrics = trainer.evaluate()


In [23]:
!pip install sacrebleu
!pip install codebleu

Collecting tree-sitter<0.23.0,>=0.22.0 (from codebleu)
  Using cached tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Using cached tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (544 kB)
Installing collected packages: tree-sitter
  Attempting uninstall: tree-sitter
    Found existing installation: tree_sitter 0.2.0
    Uninstalling tree_sitter-0.2.0:
      Successfully uninstalled tree_sitter-0.2.0
Successfully installed tree-sitter-0.22.3


In [24]:
!pip install tree_sitter==0.2.0
!git clone -q https://github.com/microsoft/CodeXGLUE.git

Collecting tree_sitter==0.2.0
  Using cached tree_sitter-0.2.0-cp311-cp311-linux_x86_64.whl
Installing collected packages: tree_sitter
  Attempting uninstall: tree_sitter
    Found existing installation: tree-sitter 0.22.3
    Uninstalling tree-sitter-0.22.3:
      Successfully uninstalled tree-sitter-0.22.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
codebleu 0.7.0 requires tree-sitter<0.23.0,>=0.22.0, but you have tree-sitter 0.2.0 which is incompatible.[0m[31m
[0mSuccessfully installed tree_sitter-0.2.0
fatal: destination path 'CodeXGLUE' already exists and is not an empty directory.


In [25]:
#Predict and Evaluate on Test Set
import os
import subprocess
import pandas as pd
from tqdm import tqdm
from evaluate import load

sacrebleu = load("evaluate-metric/sacrebleu")

predictions_file = '/content/predictions-1000.txt'
targets_file = '/content/targets-1000.txt'

predictions = []
targets = []

results = []

for row in tqdm(test_df.itertuples(index=False)):
    input_text = row.masked_method
    expected = row.target_block
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    predicted = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(predicted.strip())
    targets.append(expected.strip())

    is_exact = predicted.strip() == expected.strip()

    bleu_score = sacrebleu.compute(predictions=[predicted], references=[[expected]])["score"]

    results.append({
        "Input function with masked if condition": input_text,
        "Prediction is correct (exact match)": is_exact,
        "Expected if condition": expected,
        "Predicted if condition": predicted,
        "BLEU-4 prediction score (0-100)": round(bleu_score, 2)
    })

results_df = pd.DataFrame(results)
results_df.to_csv("testset-results.csv", index=False)

num_correct = sum([r["Prediction is correct (exact match)"] for r in results])
exact_match_accuracy = num_correct / len(results) * 100

print(f"Exact Match Accuracy: {exact_match_accuracy:.2f}%")
print("Saved results to testset-results.csv")

predictions = [r["Predicted if condition"] for r in results]
references = [[r["Expected if condition"]] for r in results]

average_bleu = sacrebleu.compute(predictions=predictions, references=references)["score"]
print(f"Average BLEU-4 Score: {average_bleu:.2f}")

!cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs /content/targets-1000.txt --hyp /content/predictions-1000.txt --lang python --params 0.25,0.25,0.25,0.25

50it [01:16,  1.52s/it]


Exact Match Accuracy: 36.00%
Saved results to testset-results.csv
Average BLEU-4 Score: 58.36
ngram match: 0.1949789135290786, weighted ngram match: 0.20179560922377052, syntax_match: 0.22033898305084745, dataflow_match: 0.0
CodeBLEU score:  0.15427837645092413
