# Dependencies

In [1]:
# Mount Google Drive at /content/drive so files stored there can be accessed
# from this Colab runtime through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "drive/MyDrive/Master Thesis/compbert"
%ls

/content/drive/MyDrive/Master Thesis/compbert
[0m[01;34mdataset[0m/  requirements.txt  [01;34msaved_models[0m/


In [3]:
! pip install torch transformers numpy torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl.metadata (8.6 kB)
Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torcheval
Successfully installed torcheval-0.0.7


# Imports and arguments

In [4]:
import logging
import os
import random
import json
import torch
import torch.nn as nn
import numpy as np
from math import ceil
from pathlib import Path
from importlib import reload
from torcheval.metrics.functional import binary_auprc, binary_auroc
from torch.utils.data import (
    Dataset, DataLoader, RandomSampler, SequentialSampler
)
from transformers import (
    RobertaModel, RobertaTokenizer,
    get_linear_schedule_with_warmup
)


In [5]:
class Args:
    """
    Run configuration for the notebook.

    Holds the file locations, the train/eval switches and the
    hyper-parameters read by the rest of the notebook. `device` and `n_gpu`
    are detected from the current runtime when the object is created.
    """

    def __init__(self):
        # All project files live below this Google Drive folder.
        project_dir = "/content/drive/MyDrive/Master Thesis/compbert"
        # Train, eval and dev currently all point at the same sample file.
        sample_file = project_dir + "/dataset/" + "data_formatted2_sample.json"

        self.output_dir = project_dir + "/saved_models"
        self.model_name_or_path = "microsoft/unixcoder-base"

        # Phases to run.
        self.do_train = False
        self.do_eval = True

        # Data files.
        self.train_data_file = sample_file
        self.eval_data_file = sample_file
        self.dev_data_file = sample_file

        # Sampling and optimisation settings.
        self.balance_type = "full_set_balanced"
        self.num_train_epochs = 2
        self.code_length = 512
        self.nl_length = 256
        self.train_batch_size = 24
        self.eval_batch_size = 24
        self.learning_rate = 1e-5
        self.max_grad_norm = 1.0
        self.seed = 123456

        # Hardware detection.
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)
        self.n_gpu = torch.cuda.device_count()


args = Args()


# Setup

In [6]:
# Seed every random number generator used in this notebook.
random.seed(args.seed)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
# here cannot change hashing in the already running kernel; it is only
# inherited by processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True

# Set log
# NOTE(review): the reload presumably resets handlers installed by the
# notebook environment so that basicConfig below takes effect - confirm.
reload(logging)
logger = logging.getLogger(__name__)

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

# Model

In [7]:
class Model(nn.Module):
    """
    Sentence-embedding wrapper around a shared encoder.

    The same encoder is used for code and for natural language (NL) token
    ids. The encoder output is averaged over the non-masked positions and
    scaled to unit L2 norm.

    Attributes:
        encoder (nn.Module): Module called as
            `encoder(token_ids, attention_mask=mask)`; the first element of
            its output is used as the per-token representation.

    Methods:
        forward(code_inputs=None, nl_inputs=None):
            Embed `code_inputs` when it is given, otherwise embed
            `nl_inputs`.

            Args:
                code_inputs (torch.Tensor, optional): Tokenized code input
                                                      tensor. Defaults to None.
                nl_inputs (torch.Tensor, optional): Tokenized NL input tensor.
                                                    Defaults to None.

            Returns:
                torch.Tensor: One L2-normalized embedding per input row.
    """
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder

    def forward(self, code_inputs=None, nl_inputs=None):
        # Code inputs take precedence when both arguments are supplied.
        token_ids = code_inputs if code_inputs is not None else nl_inputs

        # Positions holding token id 1 are excluded from attention and from
        # the average (presumably the tokenizer's padding id - confirm).
        mask = token_ids.ne(1)
        hidden = self.encoder(token_ids, attention_mask=mask)[0]

        # Mean over the unmasked positions of every sequence.
        summed = (hidden * mask[:, :, None]).sum(1)
        counts = mask.sum(-1)[:, None]
        pooled = summed / counts

        return torch.nn.functional.normalize(pooled, p=2, dim=1)


# Data classes and functions

In [8]:
class InputFeatures:
    """
    Container for the processed form of one dataset entry.

    Stores the token strings and token ids for both the code and the natural
    language text of an entry, together with its label and its position in
    the source file.

    Attributes:
        code_tokens (list of str): Code tokens, including special tokens.
        code_ids (list of int): Token ids for `code_tokens`.
        nl_tokens (list of str): Natural language tokens, including special
                                 tokens.
        nl_ids (list of int): Token ids for `nl_tokens`.
        changed (bool): Indicator of whether the code has been modified.
        idx (int): Index of the example in the dataset.
    """
    def __init__(self, code_tokens, code_ids, nl_tokens, nl_ids, changed, idx):
        # Code side.
        self.code_tokens, self.code_ids = code_tokens, code_ids
        # Natural language side.
        self.nl_tokens, self.nl_ids = nl_tokens, nl_ids
        # Label and bookkeeping.
        self.changed, self.idx = changed, idx

In [9]:
def convert_examples_to_features(entry, idx, tokenizer):
    """
    Turn one dataset entry into fixed-length token id sequences.

    The code and the natural language text are tokenized, wrapped in the
    special tokens `cls, "<encoder-only>", sep, ..., sep` and padded with the
    tokenizer's pad id up to `args.code_length` / `args.nl_length`. Entries
    that would not fit are rejected rather than truncated.

    Args:
        entry (dict): A dictionary containing 'code', 'nl_input', and 'changed'
                      keys. 'code' and 'nl_input' are the texts to be tokenized,
                      and 'changed' is stored unchanged as the label.
        idx (int): The index of the example in the dataset.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer instance
                  used to convert text to token IDs.

    Returns:
        InputFeatures: The wrapped tokens and padded ids for both inputs plus
                       the 'changed' flag and `idx`, or None if either input
                       has more tokens than its maximum length minus the four
                       special tokens.
    """
    def wrap(tokens):
        # Surround the raw tokens with the four special tokens.
        return [
            tokenizer.cls_token, "<encoder-only>", tokenizer.sep_token,
            *tokens,
            tokenizer.sep_token,
        ]

    def padded_ids(tokens, max_length):
        # Map tokens to ids and right-pad them to exactly max_length.
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids.extend([tokenizer.pad_token_id] * (max_length - len(ids)))
        return ids

    raw_code = tokenizer.tokenize(entry["code"])
    raw_nl = tokenizer.tokenize(entry["nl_input"])

    # Four positions of every sequence are reserved for special tokens.
    if (len(raw_code) + 4 > args.code_length
            or len(raw_nl) + 4 > args.nl_length):
        return None

    code_tokens = wrap(raw_code)
    code_ids = padded_ids(code_tokens, args.code_length)

    nl_tokens = wrap(raw_nl)
    nl_ids = padded_ids(nl_tokens, args.nl_length)

    return InputFeatures(
        code_tokens, code_ids, nl_tokens, nl_ids, entry["changed"], idx
    )


In [10]:
class TextDataset(Dataset):
    """
    A dataset class for handling text examples, including code and natural
    language inputs. This class is responsible for loading, processing, and
    balancing examples from a JSON file according to the specified balance type.

    Attributes:
        examples (list): A list of `InputFeatures` objects representing
                         the processed examples.
    """

    def __init__(self, tokenizer, file_path=None,
                 balance_type="full_set_imbalanced", show_example=False):
        """
        Initializes the dataset by loading examples from a JSON file,
        processing them, and balancing them according to the specified
        balance type.

        Entries that `convert_examples_to_features` rejects (it returns None)
        are counted and left out.

        Args:
            tokenizer (PreTrainedTokenizer): The tokenizer to use for
                                             tokenizing the code and
                                             natural language inputs.
            file_path (str, optional): Path to the JSON file containing the
                                       dataset (a list of entries).
            balance_type (str, optional): Specifies how to balance the dataset.
                                           Options are "full_set_balanced",
                                           "full_set_imbalanced",
                                           "one_per_issue", and
                                           "multiple_per_issue". Defaults
                                           to "full_set_imbalanced".
            show_example (bool, optional): If True, logs the first three
                                            examples for inspection.
                                            Defaults to False.
        """
        self.examples = []

        file_path = Path(file_path)
        with file_path.open("r", encoding="utf-8") as f:
            entries = json.load(f)

        logger.info(f"Dataset size at start: {len(entries)}")

        per_issue = balance_type in {"one_per_issue", "multiple_per_issue"}

        large_examples = 0
        issue = ""
        pos_examples, neg_examples = [], []
        for idx, entry in enumerate(entries):
            # In the per-issue modes a change of the "issue" value closes the
            # current group. This only groups correctly when entries of the
            # same issue are adjacent in the file - TODO confirm for new data.
            if per_issue and issue != entry["issue"]:
                self._add_issue_group(
                    balance_type, pos_examples, neg_examples
                )
                pos_examples, neg_examples = [], []
                issue = entry["issue"]

            feature = convert_examples_to_features(entry, idx, tokenizer)
            if feature is None:
                large_examples += 1
            elif entry["changed"]:
                pos_examples.append(feature)
            else:
                neg_examples.append(feature)

        if per_issue:
            # Close the last group only after its final entry was processed;
            # flushing before processing it would drop that entry.
            self._add_issue_group(balance_type, pos_examples, neg_examples)

        elif balance_type == "full_set_balanced":
            self.examples += balance_multiple(pos_examples, neg_examples)

        elif balance_type == "full_set_imbalanced":
            logger.info(f"  n positive examples = {len(pos_examples)}")
            logger.info(f"  n negative examples = {len(neg_examples)}")
            self.examples += pos_examples + neg_examples

        logger.info(f"Final dataset size: {len(self.examples)}")
        logger.info(
            f"Removed entries due to exceeding token limit: {large_examples}"
        )

        if show_example:
            for idx, example in enumerate(self.examples[:3]):
                logger.info("*** Example ***")
                logger.info(f"idx: {idx}")
                logger.info("code_tokens: {}".format(
                    [x.replace('\u0120', '_') for x in example.code_tokens]))
                logger.info(f"code_ids: {' '.join(map(str, example.code_ids))}")
                logger.info("nl_tokens: {}".format(
                    [x.replace('\u0120', '_') for x in example.nl_tokens]))
                logger.info(f"nl_ids: {' '.join(map(str, example.nl_ids))}")

    def _add_issue_group(self, balance_type, pos_examples, neg_examples):
        """
        Appends the selection for one issue group to `self.examples`.

        For "one_per_issue" one random positive and one random negative are
        added, but only if the group has at least one of each. For
        "multiple_per_issue" the group is balanced with `balance_multiple`.
        """
        if balance_type == "one_per_issue":
            if pos_examples and neg_examples:
                pos_example = random.choice(pos_examples)
                neg_example = random.choice(neg_examples)
                self.examples += [pos_example, neg_example]

        elif balance_type == "multiple_per_issue":
            self.examples += balance_multiple(pos_examples, neg_examples)

    def __len__(self):
        """Returns the number of examples in the dataset."""
        return len(self.examples)

    def __getitem__(self, idx):
        """
        Retrieves the example at the specified index.

        Args:
            idx (int): The index of the example to retrieve.

        Returns:
            dict: "code_input", "nl_input" and "changed" as tensors, and
                  "idx" as the stored index of the example in the source
                  file.
        """
        example = self.examples[idx]
        return {"code_input": torch.tensor(example.code_ids),
                "nl_input": torch.tensor(example.nl_ids),
                "changed": torch.tensor(example.changed),
                "idx": example.idx}


In [11]:
def balance_multiple(pos_examples, neg_examples):
    """
    Equalize the number of positive and negative examples.

    The longer of the two lists is shuffled in place with `np.random.shuffle`
    and cut down to the length of the shorter one. Lists of equal length are
    left untouched.

    Args:
        pos_examples (list): A list of positive examples.
        neg_examples (list): A list of negative examples.

    Returns:
        list: The positives followed by the negatives, with the same number
              of examples from each list.
    """
    target_size = min(len(pos_examples), len(neg_examples))

    if len(neg_examples) > target_size:
        # More negatives than positives: keep a random subset of negatives.
        np.random.shuffle(neg_examples)
        neg_examples = neg_examples[:target_size]
    elif len(pos_examples) > target_size:
        # More positives than negatives: keep a random subset of positives.
        np.random.shuffle(pos_examples)
        pos_examples = pos_examples[:target_size]

    assert len(pos_examples) == len(neg_examples)
    return pos_examples + neg_examples

# Training function

In [12]:
def _build_train_dataloader(tokenizer, show_example=False):
    """Builds a freshly sampled training dataset and a shuffling loader."""
    train_dataset = TextDataset(
        tokenizer,
        args.train_data_file,
        balance_type=args.balance_type,
        show_example=show_example
    )
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=args.train_batch_size,
        num_workers=4
    )
    return train_dataset, train_dataloader


def train(model, tokenizer):
    """
    Trains the model using the specified tokenizer.

    The training set is built from `args.train_data_file` with
    `args.balance_type` and is re-created (re-sampled) at the start of every
    epoch after the first. The dev set from `args.dev_data_file` is loaded
    once, unbalanced. The loss is binary cross entropy on the dot product of
    the code and NL embeddings multiplied by 5. `create_checkpoint` is called
    about five times per epoch and once more at the end of each epoch; it
    evaluates on the dev set and saves the model when the score improves.

    Args:
        model (torch.nn.Module): The model to be trained.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
                                                      preprocess the text data.

    Returns:
        None
    """
    # Get training dataset
    train_dataset, train_dataloader = _build_train_dataloader(
        tokenizer, show_example=True
    )

    # Get evaluation dataset
    dev_dataset = TextDataset(
        tokenizer,
        args.dev_data_file,
        balance_type="full_set_imbalanced"
    )

    # Get optimizer and scheduler. The step count is derived from the first
    # epoch's loader.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        eps=1e-8
    )
    total_steps = len(train_dataloader) * args.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        "  Instantaneous batch size per GPU = "
        # n_gpu is 0 on CPU-only runtimes; avoid dividing by zero.
        f"{args.train_batch_size // max(args.n_gpu, 1)}"
    )
    logger.info(
        f"  Total train batch size = {args.train_batch_size}"
    )
    logger.info(
        "  Total optimization steps = "
        f"{total_steps}"
    )

    # Evaluate roughly five times per epoch; never use an interval of zero.
    evaluation_interval = max(1, ceil(len(train_dataloader) / 5))
    loss_progress = []

    model.zero_grad()

    checkpoint_prefix = "checkpoint-best"
    output_dir_path = Path(args.output_dir).joinpath(checkpoint_prefix)
    os.makedirs(output_dir_path, exist_ok=True)

    # The loss module holds no per-step state, so build it once.
    loss_fct = nn.BCEWithLogitsLoss()

    model.train()
    tr_num, tr_loss, best_score = 0, 0, 0
    for idx in range(args.num_train_epochs):
        n_interval = 0

        if idx != 0:
            # Reset sampling
            _, train_dataloader = _build_train_dataloader(tokenizer)

        for step, batch in enumerate(train_dataloader):
            # Get inputs
            code_inputs = batch["code_input"].to(args.device)
            nl_inputs = batch["nl_input"].to(args.device)
            labels = batch["changed"].to(args.device).float()

            code_vecs = model(code_inputs=code_inputs)
            nl_vecs = model(nl_inputs=nl_inputs)

            # Row-wise dot product of the code and NL embeddings, scaled by 5
            # before the sigmoid inside the loss.
            scores = torch.einsum("ij,ij->i", code_vecs, nl_vecs)
            loss = loss_fct(scores * 5, labels)

            # Report the mean loss since the last report every 100 steps.
            # The running sum is not reset at epoch boundaries.
            tr_loss += loss.item()
            tr_num += 1
            if (step + 1) % 100 == 0:
                loss_report = round(tr_loss / tr_num, 5)
                logger.info(f"epoch {idx} step {step+1} loss {loss_report}")
                tr_loss = 0
                tr_num = 0
                loss_progress.append((idx, step + 1, loss_report))

            # Backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), args.max_grad_norm
            )
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            if (step + 1) % evaluation_interval == 0:
                best_score = create_checkpoint(
                    model,
                    tokenizer,
                    dev_dataset,
                    best_score,
                    output_dir_path,
                    loss_progress,
                    idx,
                    n_interval
                )
                n_interval += 1

        # End-of-epoch evaluation.
        # NOTE(review): this repeats the previous evaluation when the last
        # step of the epoch already hit the evaluation interval.
        best_score = create_checkpoint(
            model,
            tokenizer,
            dev_dataset,
            best_score,
            output_dir_path,
            loss_progress,
            idx,
            n_interval
        )


In [13]:
def create_checkpoint(
    model, tokenizer, dev_dataset, best_score, output_dir_path,
    loss_progress, idx, n_interval
):
    """
    Evaluates the model, keeps the best weights and stores the results.

    The model is evaluated on `dev_dataset`. If its AUPRC exceeds
    `best_score`, the state dict is written to `model.bin` in
    `output_dir_path`. The evaluation results plus `loss_progress` are always
    written to `train_eval_{idx}_{n_interval}.json` in the same directory.

    Args:
        model (torch.nn.Module): The model to be evaluated and saved.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used
                                                      for processing text data.
        dev_dataset (TextDataset): The dataset used for evaluation.
        best_score (float): The current best score for comparison.
        output_dir_path (Path): Directory path to save the checkpoint and
                                evaluation results.
        loss_progress (list): List of loss values recorded during training.
        idx (int): The current epoch index.
        n_interval (int): The current interval index for evaluation.

    Returns:
        float: The updated best score after evaluation.
    """
    results = evaluate(model, tokenizer, dev_dataset)

    # Only the scalar metrics are logged; the per-example lists are skipped.
    per_example_keys = {"predictions", "predictions_mult", "labels", "idxs"}
    for key in results:
        if key in per_example_keys:
            continue
        logger.info("  %s = %s", key, round(results[key], 4))

    # Keep the weights whenever the AUPRC beats the best value seen so far.
    auprc = results["auprc"]
    if auprc > best_score:
        best_score = auprc
        banner = "  " + "*" * 20
        logger.info(banner)
        logger.info(f"  Best auprc: {round(best_score, 4)}")
        logger.info(banner)

        # Unwrap a parallel wrapper that exposes the real model as `.module`.
        model_to_save = getattr(model, "module", model)
        output_path = output_dir_path / "model.bin"
        torch.save(model_to_save.state_dict(), output_path)
        logger.info("Saving model checkpoint to %s", output_path)

    results["loss_progress"] = loss_progress
    results_path = output_dir_path / f"train_eval_{idx}_{n_interval}.json"
    results_path.write_text(json.dumps(results), encoding="utf-8")

    return best_score

# Evaluation function

In [16]:
def evaluate(model, tokenizer, eval_dataset=None):
    """
    Evaluate the model on the given dataset and return evaluation metrics.

    Every example is scored with the dot product of its code and NL
    embeddings. Metrics are computed from these raw scores and from the
    scores multiplied by 5 (the scaling used by the training loss). The
    model's previous train/eval mode is restored before returning, also when
    an error occurs.

    Args:
        model (torch.nn.Module): The model to be evaluated.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used
            for processing text data.
        eval_dataset (TextDataset, optional): The dataset to evaluate the
            model on. If None, an unbalanced dataset is created from
            `args.eval_data_file`.

    Returns:
        dict: "bce" / "bce_mult" (binary cross entropy on raw / scaled
            scores), "auprc" and "auroc" (computed on the sigmoid of the raw
            scores), and the per-example lists "predictions",
            "predictions_mult", "labels" and "idxs".

    Raises:
        ValueError: If the evaluation dataset contains no examples.
    """
    if eval_dataset is None:
        eval_dataset = TextDataset(
            tokenizer, args.eval_data_file, balance_type="full_set_imbalanced"
        )

    if len(eval_dataset) == 0:
        # Without this check the failure would surface later as an opaque
        # error when concatenating an empty list of tensors.
        raise ValueError("Cannot evaluate on an empty dataset.")

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,
        num_workers=4
    )

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info(f"  Num entries = {len(eval_dataset)}")
    logger.info(f"  Batch size = {args.eval_batch_size}")

    # Remember the current mode so that it can be restored afterwards instead
    # of unconditionally switching the model to training mode.
    was_training = model.training
    model.eval()
    code_vecs, nl_vecs, label_vecs, idx_vecs = [], [], [], []
    try:
        with torch.no_grad():
            for batch in eval_dataloader:
                # Get inputs and ground truth labels
                code_inputs = batch["code_input"].to(args.device)
                nl_inputs = batch["nl_input"].to(args.device)
                labels = batch["changed"].to(args.device).float()
                idxs = batch["idx"].to(args.device)

                code_vecs.append(model(code_inputs=code_inputs))
                nl_vecs.append(model(nl_inputs=nl_inputs))
                label_vecs.append(labels)
                idx_vecs.append(idxs)
    finally:
        model.train(was_training)

    code_vecs = torch.cat(code_vecs, dim=0)
    nl_vecs = torch.cat(nl_vecs, dim=0)
    label_vecs = torch.cat(label_vecs, dim=0)
    idx_vecs = torch.cat(idx_vecs, dim=0)

    # Calculate scores (row-wise dot products) and binary cross entropy
    scores = torch.einsum("ij,ij->i", code_vecs, nl_vecs)
    mult_scores = scores * 5

    predictions = torch.special.expit(scores)
    predictions_mult = torch.special.expit(mult_scores)

    bce = nn.functional.binary_cross_entropy_with_logits(
        scores, label_vecs
    )
    bce_mult = nn.functional.binary_cross_entropy_with_logits(
        mult_scores, label_vecs
    )

    auprc = binary_auprc(predictions, label_vecs)
    auroc = binary_auroc(predictions, label_vecs)

    return {"bce": float(bce),
            "bce_mult": float(bce_mult),
            "auprc": float(auprc),
            "auroc": float(auroc),
            "predictions": predictions.tolist(),
            "predictions_mult": predictions_mult.tolist(),
            "labels": label_vecs.tolist(),
            "idxs": idx_vecs.tolist()}


# Run

In [20]:
logger.info("device: %s, n_gpu: %s", args.device, args.n_gpu)

# Build the model: the pretrained encoder wrapped by the embedding model.
tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
model = RobertaModel.from_pretrained(args.model_name_or_path)

model = Model(model)
model.to(args.device)

# Training
if args.do_train:
    train(model, tokenizer)

# Final evaluation of the best checkpoint saved during training.
if args.do_eval:
    input_dir = Path(args.output_dir)
    input_dir = input_dir.joinpath("checkpoint-best/model.bin")
    model_to_load = model.module if hasattr(model, "module") else model
    # The checkpoint is a plain state dict, so restrict unpickling to tensor
    # data; this also avoids the FutureWarning torch.load emits otherwise.
    model_to_load.load_state_dict(torch.load(
        input_dir, map_location=torch.device('cpu'), weights_only=True)
    )
    model.to(args.device)
    result = evaluate(model, tokenizer)
    logger.info("***** Eval results *****")
    for key, value in result.items():
        # Skip the per-example lists; only scalar metrics are logged.
        if key not in {"predictions", "predictions_mult", "labels", "idxs"}:
            logger.info("  %s = %s", key, round(value, 4))

    output_dir = Path(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)
    results_path = output_dir.joinpath("final_eval.json")
    with results_path.open("w", encoding="utf-8") as f:
        f.write(json.dumps(result))

08/28/2024 15:08:09 - INFO - __main__ -   device: cuda, n_gpu: 1
  model_to_load.load_state_dict(torch.load(input_dir, map_location=torch.device('cpu')))
08/28/2024 15:08:10 - INFO - __main__ -   Dataset size at start: 463
08/28/2024 15:08:11 - INFO - __main__ -     n positive examples = 17
08/28/2024 15:08:11 - INFO - __main__ -     n negative examples = 383
08/28/2024 15:08:11 - INFO - __main__ -   Final dataset size: 400
08/28/2024 15:08:11 - INFO - __main__ -   Removed entries due to exceeding token limit: 63
08/28/2024 15:08:11 - INFO - __main__ -   ***** Running evaluation *****
08/28/2024 15:08:11 - INFO - __main__ -     Num entries = 400
08/28/2024 15:08:11 - INFO - __main__ -     Batch size = 24
08/28/2024 15:08:21 - INFO - __main__ -   ***** Eval results *****
08/28/2024 15:08:21 - INFO - __main__ -     bce = 0.6933
08/28/2024 15:08:21 - INFO - __main__ -     bce_mult = 0.7122
08/28/2024 15:08:21 - INFO - __main__ -     auprc = 0.4655
08/28/2024 15:08:21 - INFO - __main__ -  

In [None]:
# Ask Colab to unassign this runtime once all cells above have finished
# (presumably so that an idle session does not keep the GPU allocated).
from google.colab import runtime
runtime.unassign()