In [1]:
#Install transformers and torch
!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
!pip install transformers datasets evaluate -q

Looking in indexes: https://download.pytorch.org/whl/cu124


In [2]:
#Load tokenizer
from transformers import T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import RobertaTokenizer
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


In [3]:
from datasets import Dataset
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from evaluate import load
from tqdm import tqdm
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
#Load Dataset
train_df = pd.read_csv("ft_train.csv")
val_df = pd.read_csv("ft_valid.csv")
test_df = pd.read_csv("ft_test.csv")

train_df = train_df.head(25)
val_df = val_df.head(25)
test_df = test_df.head(25)

print(train_df.columns)
print(len(train_df))
print(val_df.columns)
print(len(val_df))
print(test_df.columns)
print(len(test_df))

Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
25
Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
25
Index(['cleaned_method', 'target_block', 'tokens_in_method'], dtype='object')
25


In [25]:
#Preprocessing, creating the mask
MASK_TOKEN = "<mask>"

def mask_if_condition(row):
    original_code = row["cleaned_method"]
    target = row["target_block"]
    masked_code = original_code.replace(target, MASK_TOKEN)
    return masked_code

train_df["masked_method"] = train_df.apply(mask_if_condition, axis=1)
val_df["masked_method"] = val_df.apply(mask_if_condition, axis=1)
test_df["masked_method"] = test_df.apply(mask_if_condition, axis=1)

train_df = train_df[["masked_method", "target_block"]]
val_df = val_df[["masked_method", "target_block"]]
test_df = test_df[["masked_method", "target_block"]]

model_checkpoint = "Salesforce/codet5-small"
tokenizer = RobertaTokenizer.from_pretrained(model_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["masked_method"], max_length=512, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_block"], max_length=128, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    print(f"Input text tokenized: {tokenizer(input_text)}")
    print(f"Target block tokenized: {tokenizer(expected)}")
    return model_inputs

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/25 [00:00<?, ? examples/s]



Input text tokenized: {'input_ids': [1, 536, 855, 12, 2890, 16, 1056, 33, 5510, 16, 2021, 33, 7036, 16, 2305, 67, 5836, 67, 4324, 33, 5510, 16, 2305, 67, 4538, 87, 33, 5510, 4672, 203, 565, 775, 30, 203, 3639, 327, 365, 6315, 896, 12, 1883, 16, 2021, 13, 203, 565, 1335, 31261, 18, 24128, 668, 487, 425, 30, 203, 3639, 309, 6369, 67, 4208, 49, 30, 203, 5411, 613, 18, 1376, 12, 203, 7734, 315, 896, 30, 425, 18, 19088, 5095, 87, 425, 18, 701, 1636, 5095, 87, 425, 18, 2150, 5095, 87, 8480, 5095, 87, 6, 203, 7734, 738, 261, 73, 18, 19088, 16, 425, 18, 701, 1636, 16, 425, 18, 2150, 16, 8480, 12, 73, 3719, 203, 5411, 262, 203, 3639, 309, 2305, 67, 4538, 87, 471, 353, 67, 4538, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 309, 2305, 67, 5836, 67, 4324, 471, 353, 67, 2135, 370, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 1002, 203, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Input text tokenized: {'input_ids': [1, 536, 855, 12, 2890, 16, 1056, 33, 5510, 16, 2021, 33, 7036, 16, 2305, 67, 5836, 67, 4324, 33, 5510, 16, 2305, 67, 4538, 87, 33, 5510, 4672, 203, 565, 775, 30, 203, 3639, 327, 365, 6315, 896, 12, 1883, 16, 2021, 13, 203, 565, 1335, 31261, 18, 24128, 668, 487, 425, 30, 203, 3639, 309, 6369, 67, 4208, 49, 30, 203, 5411, 613, 18, 1376, 12, 203, 7734, 315, 896, 30, 425, 18, 19088, 5095, 87, 425, 18, 701, 1636, 5095, 87, 425, 18, 2150, 5095, 87, 8480, 5095, 87, 6, 203, 7734, 738, 261, 73, 18, 19088, 16, 425, 18, 701, 1636, 16, 425, 18, 2150, 16, 8480, 12, 73, 3719, 203, 5411, 262, 203, 3639, 309, 2305, 67, 4538, 87, 471, 353, 67, 4538, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 309, 2305, 67, 5836, 67, 4324, 471, 353, 67, 2135, 370, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 1002, 203, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Input text tokenized: {'input_ids': [1, 536, 855, 12, 2890, 16, 1056, 33, 5510, 16, 2021, 33, 7036, 16, 2305, 67, 5836, 67, 4324, 33, 5510, 16, 2305, 67, 4538, 87, 33, 5510, 4672, 203, 565, 775, 30, 203, 3639, 327, 365, 6315, 896, 12, 1883, 16, 2021, 13, 203, 565, 1335, 31261, 18, 24128, 668, 487, 425, 30, 203, 3639, 309, 6369, 67, 4208, 49, 30, 203, 5411, 613, 18, 1376, 12, 203, 7734, 315, 896, 30, 425, 18, 19088, 5095, 87, 425, 18, 701, 1636, 5095, 87, 425, 18, 2150, 5095, 87, 8480, 5095, 87, 6, 203, 7734, 738, 261, 73, 18, 19088, 16, 425, 18, 701, 1636, 16, 425, 18, 2150, 16, 8480, 12, 73, 3719, 203, 5411, 262, 203, 3639, 309, 2305, 67, 4538, 87, 471, 353, 67, 4538, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 309, 2305, 67, 5836, 67, 4324, 471, 353, 67, 2135, 370, 12, 73, 4672, 203, 5411, 327, 5378, 203, 3639, 1002, 203, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [26]:
#Training Setup
training_args = TrainingArguments(
    output_dir="./codeT5-if-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

  trainer = Trainer(


In [27]:
#Training the Model
trainer.train()


Epoch,Training Loss,Validation Loss
1,No log,2.297465
2,No log,1.988649
3,No log,1.690122
4,No log,1.464521
5,No log,1.386525


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=35, training_loss=2.6596448625837055, metrics={'train_runtime': 547.7042, 'train_samples_per_second': 0.228, 'train_steps_per_second': 0.064, 'total_flos': 16917725184000.0, 'train_loss': 2.6596448625837055, 'epoch': 5.0})

In [28]:
#Evaluating the Model
metrics = trainer.evaluate()


In [29]:
!pip install sacrebleu
!pip install codebleu

Collecting tree-sitter<0.23.0,>=0.22.0 (from codebleu)
  Using cached tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Using cached tree_sitter-0.22.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (544 kB)
Installing collected packages: tree-sitter
  Attempting uninstall: tree-sitter
    Found existing installation: tree-sitter 0.23.0
    Uninstalling tree-sitter-0.23.0:
      Successfully uninstalled tree-sitter-0.23.0
Successfully installed tree-sitter-0.22.3


In [10]:
!pip install tree_sitter==0.23.0
!pip install tree-sitter-python==0.23.0

Collecting tree_sitter==0.23.0
  Using cached tree_sitter-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Using cached tree_sitter-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (559 kB)
Installing collected packages: tree_sitter
  Attempting uninstall: tree_sitter
    Found existing installation: tree-sitter 0.22.3
    Uninstalling tree-sitter-0.22.3:
      Successfully uninstalled tree-sitter-0.22.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
codebleu 0.7.0 requires tree-sitter<0.23.0,>=0.22.0, but you have tree-sitter 0.23.0 which is incompatible.[0m[31m
[0mSuccessfully installed tree_sitter-0.23.0


In [38]:
#Predict and Evaluate on Test Set
import os
import subprocess
import pandas as pd
from tqdm import tqdm
from evaluate import load

# Initialize BLEU and CodeBLEU evaluators
sacrebleu = load("evaluate-metric/sacrebleu")

# Set up paths for predictions and targets
predictions_file = '/content/predictions-1000.txt'
targets_file = '/content/targets-1000.txt'

# Initialize lists to store results
predictions = []
targets = []

results = []

# Loop through the test dataset and compute predictions
for row in tqdm(test_df.itertuples(index=False)):
    input_text = row.masked_method
    expected = row.target_block
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    outputs = model.generate(**inputs, max_length=128)
    predicted = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(predicted.strip())
    targets.append(expected.strip())

    # Check if prediction is correct (exact match)
    is_exact = predicted.strip() == expected.strip()

    # Compute BLEU-4 score using sacreBLEU
    bleu_score = sacrebleu.compute(predictions=[predicted], references=[[expected]])["score"]

    # Append results
    results.append({
        "Input function with masked if condition": input_text,
        "Prediction is correct (exact match)": is_exact,
        "Expected if condition": expected,
        "Predicted if condition": predicted,
        "BLEU-4 prediction score (0-100)": round(bleu_score, 2)
    })

# Write predictions and expected outputs to files for CodeBLEU
with open(predictions_file, 'w') as f_pred, open(targets_file, 'w') as f_tgt:
    for pred, tgt in zip(predictions, targets):
        f_pred.write(pred + '\n')
        f_tgt.write(tgt + '\n')

# Run the CodeBLEU evaluation (this may take some time)
codebleu_command = f"""
cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ &&
python calc_code_bleu.py --refs {targets_file} --hyp {predictions_file} --lang java --params 0.25,0.25,0.25,0.25
"""

# Capture the output from the CodeBLEU command
codebleu_output = subprocess.run(codebleu_command, shell=True, capture_output=True, text=True)

# Check if the CodeBLEU output contains the score and extract it
codebleu_score = 0
for line in codebleu_output.stdout.splitlines():
    if "CodeBLEU" in line:  # Look for the line that contains the CodeBLEU score
        # Assuming the format "CodeBLEU score: X" or similar, extract the score
        try:
            codebleu_score = float(line.split()[-1])  # This assumes the score is the last word
        except ValueError:
            codebleu_score = 0

# Add the CodeBLEU score to each result
for result in results:
    result["CodeBLEU prediction score (0-100)"] = round(codebleu_score, 2)

# Save results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv("testset-results.csv", index=False)

print("Saved results to testset-results.csv")


25it [00:18,  1.32it/s]

Saved results to testset-results.csv



