In [None]:
import os
# Configure CUDA device selection through environment variables. This cell
# runs before the cell that imports torch.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
from tqdm import tqdm
import torch
import re
import torch.nn as nn
from c2nl.eval.bleu import corpus_bleu
from c2nl.eval.rouge import Rouge
from c2nl.eval.meteor import Meteor
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoModel, AutoTokenizer, TrainingArguments, Trainer, get_linear_schedule_with_warmup, T5ForConditionalGeneration
from torch.utils.data import DataLoader
import numpy as np
import random

In [None]:
def set_seed(seed_value):
    """Seed every random number source used by this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode with benchmarking disabled, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed_value: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Every seeding function takes the same single integer argument.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix the seed used for the whole run.
set_seed(42)

In [None]:
lang = "python"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Java uses the plain CodeT5+ checkpoint; every other language uses the
# bimodal checkpoint. The tokenizer id follows the same rule.
base_model = 'Salesforce/codet5p-220m' if lang == 'java' else 'Salesforce/codet5p-220m-bimodal'
base_model_tokenizer = 'Salesforce/codet5p-220m' if lang == 'java' else 'Salesforce/codet5p-220m-bimodal'

# Checkpoint ids containing 'bimodal' or 'python' are loaded through AutoModel;
# all others are loaded as a T5 conditional-generation model.
if 'bimodal' not in base_model and 'python' not in base_model:
    print("using t5 conditional generation model")
    model = T5ForConditionalGeneration.from_pretrained(base_model, trust_remote_code=True).to(device)
else:
    print("using auto model")
    model = AutoModel.from_pretrained(base_model, trust_remote_code=True).to(device)

# Checkpoint locations and tokenization limits used by the cells below.
checkpoint_dir = "./codet5p_checkpoints"
checkpoint_name = "codet5p_ft_lang_{}_backbone".format(lang)
max_input_length = 512
max_target_length = 128

In [None]:
# `device` falls back to the CPU when CUDA is unavailable, and
# torch.cuda.get_device_name only accepts CUDA devices, so guard the lookup
# to keep this cell runnable on CPU-only hosts.
if device.type == "cuda":
    print(torch.cuda.get_device_name(device))
else:
    print(f"CUDA not available; running on {device}")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model_tokenizer)

# Per-language data files: `code.original` holds the sources and
# `javadoc.original` the reference summaries, for the train/dev/test splits.
train_source_dir = f"./data/{lang}/train/code.original"
train_target_dir = f"./data/{lang}/train/javadoc.original"
validation_source_dir = f"./data/{lang}/dev/code.original"
validation_target_dir = f"./data/{lang}/dev/javadoc.original"
test_source_dir = f"./data/{lang}/test/code.original"
test_target_dir = f"./data/{lang}/test/javadoc.original"

In [None]:
# load my own data

# Read both files with an explicit UTF-8 encoding (the same encoding the
# evaluation helpers use) and close them via context managers.
with open(train_source_dir, 'r', encoding="utf-8") as code_file:
    codes = code_file.readlines()
with open(train_target_dir, 'r', encoding="utf-8") as doc_file:
    docs = doc_file.readlines()

# Line i of the code file is paired with line i of the summary file, so the
# two files must have the same number of lines.
if len(codes) != len(docs):
    raise ValueError(
        f"Training source/target size mismatch: {len(codes)} code lines vs {len(docs)} summary lines"
    )

train_inputs = tokenizer(codes, max_length=max_input_length, padding="max_length", truncation=True)
labels = tokenizer(docs, max_length=max_target_length, padding="max_length", truncation=True)

# Replace padding positions in the targets with -100 so the loss ignores them.
train_inputs["labels"] = [
    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
]
train_inputs["labels_attention_mask"] = labels["attention_mask"].copy()

In [None]:
# Build the training Dataset and have it return torch tensors for the four
# columns the training loop reads from each batch.
train_data = Dataset.from_dict(train_inputs).with_format(
    type='torch',
    columns=['input_ids', 'attention_mask', 'labels', 'labels_attention_mask'],
)
train_data

In [None]:
# Shuffled mini-batches of 15 examples for the training loop.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=15)

In [None]:
# Optimisation hyper-parameters.
learning_rate, warmup_steps, num_epochs = 5e-5, 10000, 200

# The scheduler is stepped once per batch, so the schedule covers every
# batch of every epoch.
total_steps = num_epochs * len(train_loader)

In [None]:
def eval_bleu(model, device, src_dir, tgt_dir, tokenizer,
              batch_size=32, max_source_length=512, max_gen_length=50):
    """Generate one summary per source line and score it against the references.

    The model is put into eval mode and is left in eval mode on return.

    Args:
        model: Model exposing ``generate``.
        device: Device the tokenized inputs are moved to.
        src_dir: Path of the source file, one example per line.
        tgt_dir: Path of the reference-summary file, aligned line by line
            with ``src_dir``.
        tokenizer: Tokenizer used to encode sources and decode generations.
        batch_size: Number of sources generated per ``generate`` call.
        max_source_length: Truncation length applied to tokenized sources.
        max_gen_length: ``max_length`` passed to ``generate``.

    Returns:
        The second value returned by ``corpus_bleu`` for the generated
        hypotheses and the references.

    Raises:
        ValueError: If the two files have a different number of lines.
    """
    model.eval()
    with open(src_dir, encoding="utf-8") as src_file:
        source_codes = src_file.readlines()
    with open(tgt_dir, encoding="utf-8") as tgt_file:
        targets = tgt_file.readlines()

    # Fail before the expensive generation pass if the corpus is misaligned.
    if len(source_codes) != len(targets):
        raise ValueError(
            f"Source/target size mismatch: {len(source_codes)} sources vs {len(targets)} targets"
        )

    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True,
                            max_length=max_source_length)
        # The batch is padded, so pass the padding mask explicitly instead of
        # relying on generate() to infer it.
        generated_ids = model.generate(encoded.input_ids.to(device),
                                       attention_mask=encoded.attention_mask.to(device),
                                       max_length=max_gen_length)
        all_summaries.extend(
            tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids
        )

    hypotheses = {}
    for idx, summary in enumerate(all_summaries):
        text = summary.rstrip().lower()
        # Re-attach the final period as a separate " ." token. Only strip a
        # character when it really is a period, so a summary that ends
        # without one does not lose its last character.
        if text.endswith('.'):
            text = text[:-1]
        hypotheses[idx] = [text + ' .']
    references = dict(enumerate([[target.rstrip().lower()] for target in targets]))
    _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
    return bleu
    

In [None]:
# AdamW over all model parameters, with a linear warm-up / linear decay
# learning-rate schedule spanning `total_steps` optimizer steps.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

In [None]:
steps = 0
curr_epoch = 0

# os.listdir raises FileNotFoundError if the directory is missing, which is
# the normal state on a first run, so create it up front.
os.makedirs(checkpoint_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so that [-1] is a
# deterministic choice.
checkpoints = sorted(f for f in os.listdir(checkpoint_dir) if f.startswith(checkpoint_name+".pt"))
if len(checkpoints) > 0:
    checkpoint = torch.load(os.path.join(checkpoint_dir, checkpoints[-1]), map_location=device)
    # Copy the saved weights INTO the existing model object rather than
    # rebinding `model` to a new one: `optimizer` was built from this object's
    # parameters, and a new object would never be updated by optimizer.step().
    model.load_state_dict(model.from_pretrained(checkpoint_name).state_dict())
    model.to(device)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    curr_epoch = checkpoint['epoch']
    print("Loaded checkpoint: ", checkpoints[-1], " at epoch ", curr_epoch)
    print(f"current loss: {checkpoint['loss']}")
else:
    print("No checkpoints found")

In [None]:
# Score the current model on the test split before (further) training.
# Skipped when the loaded checkpoint has already reached `num_epochs`.
if num_epochs > curr_epoch:
    print("Test BLEU: ", eval_bleu(model=model,
                                   device=device,
                                   src_dir=test_source_dir,
                                   tgt_dir=test_target_dir,
                                   tokenizer=tokenizer))

In [None]:
def train(model, device, train_loader, optimizer, scheduler, num_epochs, curr_epoch=0):
    """Fine-tune ``model`` and write a checkpoint after every epoch.

    Besides its arguments, this function reads the notebook-level names
    ``checkpoint_dir``, ``checkpoint_name``, ``validation_source_dir``,
    ``validation_target_dir`` and ``tokenizer``, and calls ``eval_bleu``.

    Args:
        model: Model whose forward pass returns an object with a ``loss``.
        device: Device each batch is moved to.
        train_loader: Iterable of batches with the keys ``input_ids``,
            ``attention_mask``, ``labels`` and ``labels_attention_mask``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        num_epochs: Total number of epochs (exclusive upper bound).
        curr_epoch: Epoch index to start from; non-zero when resuming.
    """
    # torch.save does not create missing directories. Create it now so that a
    # missing folder is not discovered only after a full epoch has run.
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name+".pt")

    for epoch in range(curr_epoch, num_epochs):
        model.train()
        total_loss = 0

        # Wrap the train_loader with tqdm for a progress bar
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

        for batch in progress_bar:
            # Load batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            labels_attention_mask = batch['labels_attention_mask'].to(device)

            # Clear gradients left over from the previous step on both the
            # module and the optimizer's parameter groups.
            model.zero_grad()
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            decoder_attention_mask=labels_attention_mask)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()  # Update the learning rate

            batch_loss = loss.item()
            total_loss += batch_loss

            # Update the progress bar with the current loss
            progress_bar.set_postfix({'loss': batch_loss})

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_epoch_loss = total_loss / max(len(train_loader), 1)

        # 'epoch' stores the number of completed epochs, which is the index
        # the next run resumes from.
        torch.save({
            'epoch': epoch+1,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_epoch_loss
        }, checkpoint_path)
        model.save_pretrained(checkpoint_name)

        # evaluate model performance every 5 epochs
        if (epoch+1) % 5 == 0:
            bleu = eval_bleu(model, device, validation_source_dir, validation_target_dir, tokenizer)
            print("validation BLEU: ", bleu)
        print(f"Epoch {epoch+1} completed. Average Loss: {avg_epoch_loss}")

In [None]:
# Run fine-tuning, continuing from `curr_epoch` when a checkpoint was loaded.
train(model=model,
      device=device,
      train_loader=train_loader,
      optimizer=optimizer,
      scheduler=scheduler,
      num_epochs=num_epochs,
      curr_epoch=curr_epoch)

In [None]:
# Score the fine-tuned model on the test split; the returned BLEU value is
# shown as the cell output.
eval_bleu(model=model,
          device=device,
          src_dir=test_source_dir,
          tgt_dir=test_target_dir,
          tokenizer=tokenizer)

In [None]:
# Write the final model to the `checkpoint_name` directory, the same
# location the training loop saves to after each epoch.
model.save_pretrained(save_directory=checkpoint_name)

In [None]:
def distinct_with_beam_search(model,
                              device,
                              src_dir,
                              tgt_dir,
                              tokenizer,
                              batch_size=16,
                              beam_size=10,
                              num_return_sequences=8):
    """Generate several beam-search candidates per source and print oracle scores.

    For every source line, ``num_return_sequences`` candidates are generated.
    Each candidate is scored against the source's reference with BLEU,
    ROUGE-L and METEOR, and the best candidate per source (the "oracle") is
    averaged over the corpus and printed as a percentage.

    Args:
        model: Model exposing ``generate``; it is switched to eval mode.
        device: Device the tokenized inputs are moved to.
        src_dir: Path of the source file, one example per line.
        tgt_dir: Path of the reference file, aligned line by line with
            ``src_dir``.
        tokenizer: Tokenizer used to encode sources and decode generations.
        batch_size: Number of sources per ``generate`` call.
        beam_size: ``num_beams`` passed to ``generate``.
        num_return_sequences: Candidates returned per source.

    Returns:
        ``(hypotheses, references)``: two dicts keyed by candidate index, each
        value a one-element list of normalized text. Every reference is
        repeated ``num_return_sequences`` times to line up with its candidates.

    Raises:
        ValueError: If the two files have a different number of lines.
    """
    model.eval()
    with open(src_dir, encoding="utf-8") as src_file:
        source_codes = [code.rstrip() for code in src_file.readlines()]
    with open(tgt_dir, encoding="utf-8") as tgt_file:
        targets = [target.rstrip() for target in tgt_file.readlines()]

    # The per-source reshape below only makes sense if candidates and
    # references line up, so check before the expensive generation pass.
    if len(source_codes) != len(targets):
        raise ValueError(
            f"Source/target size mismatch: {len(source_codes)} sources vs {len(targets)} targets"
        )

    all_summaries = []
    for i in tqdm(range(0, len(source_codes), batch_size)):
        batch = source_codes[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # The batch is padded, so pass the padding mask explicitly instead of
        # relying on generate() to infer it.
        generated_ids = model.generate(encoded["input_ids"].to(device),
                                       attention_mask=encoded["attention_mask"].to(device),
                                       max_length=100,
                                       num_beams=beam_size,
                                       num_return_sequences=num_return_sequences)
        # generate() returns len(batch) * num_return_sequences rows, with the
        # candidates for one source stored contiguously.
        all_summaries.extend(
            tokenizer.decode(ids, skip_special_tokens=True) for ids in generated_ids
        )

    def _flatten_whitespace(text):
        # Collapse runs of newlines, tabs or carriage returns into one space.
        return re.sub(r"\n{1,}|\t{1,}|\r{1,}", " ", text)

    hypotheses = {}
    for idx, summary in enumerate(all_summaries):
        text = summary.strip().lower()
        # Re-attach the final period as a separate " ." token. Only strip a
        # character when it really is a period, so a summary that ends
        # without one does not lose its last character.
        if text.endswith('.'):
            text = text[:-1]
        hypotheses[idx] = [_flatten_whitespace(text + ' .')]

    # repeat targets for each generated sequence
    repeated_targets = []
    for target in targets:
        repeated_targets.extend([target]*num_return_sequences)
    references = dict(enumerate([[_flatten_whitespace(target.strip().lower())] for target in repeated_targets]))

    # calculate oracle scores: one row per source, one column per candidate,
    # then take the best candidate in each row.
    _, bleu, ind_bleu = corpus_bleu(hypotheses, references)
    reshaped_bleu = np.array(list(ind_bleu.values())).reshape(-1, num_return_sequences)
    oracle_bleu = np.max(reshaped_bleu, axis=1)
    print("Oracle bleu: ", np.mean(oracle_bleu) * 100)

    rouge_calculator = Rouge()
    rouge_l, ind_rouge = rouge_calculator.compute_score(references, hypotheses)
    reshaped_rouge = np.array(list(ind_rouge.values())).reshape(-1, num_return_sequences)
    oracle_rouge = np.max(reshaped_rouge, axis=1)
    print("Oracle rouge-l: ", np.mean(oracle_rouge) * 100)

    meteor_calculator = Meteor()
    meteor, ind_meteor = meteor_calculator.compute_score(references, hypotheses)
    # Unlike the BLEU/ROUGE per-item scores, this one is consumed as a plain
    # sequence rather than through .values().
    reshaped_meteor = np.array(list(ind_meteor)).reshape(-1, num_return_sequences)
    oracle_meteor = np.max(reshaped_meteor, axis=1)
    print("Oracle meteor: ", np.mean(oracle_meteor) * 100)

    return hypotheses, references

In [None]:
# Report oracle scores on the test split for each candidate count, using a
# beam as wide as the number of returned sequences.
num_return_sequences_list = [10, 20]
for num_return_sequences in num_return_sequences_list:
    hypotheses, references = distinct_with_beam_search(
        model=model,
        device=device,
        src_dir=test_source_dir,
        tgt_dir=test_target_dir,
        tokenizer=tokenizer,
        batch_size=4,
        beam_size=num_return_sequences,
        num_return_sequences=num_return_sequences,
    )