In [None]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install evaluate

In [None]:
import os
import random
import re
import torch

import numpy as np
import pandas as pd

from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification

In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used
# further down all live under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
DATASET_DIRECTORY_PATH = '/content/drive/My Drive/MLinter/dataset'

def extract_rules_from_drive():
    '''
    Returns the sorted rule IDs of the `.zip` archives in the dataset directory.

    Only archives sitting directly in DATASET_DIRECTORY_PATH are listed: the
    main loop unzips `{DATASET_DIRECTORY_PATH}/{rule}`, so an archive found in
    a sub-directory would produce a rule ID that cannot be resolved there.

    Returns:
        List of archive names without the `.zip` suffix, in sorted file-name
        order. Empty when the directory is missing or holds no archive.
    '''
    # Keep only the first os.walk() entry (the top-level listing). The default
    # preserves os.walk's "unreadable/missing directory -> nothing" behaviour.
    _, _, files = next(os.walk(DATASET_DIRECTORY_PATH), (None, [], []))

    return [
        get_rule_id_from_file(file)
        for file in sorted(files)
        if file.endswith('.zip')
    ]


def get_rule_id_from_file(file_name):
    '''
    Strips the trailing `.zip` from an archive file name.

    At least one character must precede `.zip`; otherwise the pattern does not
    match and accessing the group on `None` raises AttributeError.
    '''
    zip_name_match = re.search(r'^(.+)\.zip$', file_name)
    return zip_name_match.group(1)

In [None]:
import datetime
import time

def format_time(elapsed):
    '''
    Renders a duration given in seconds as text.

    The value is rounded to whole seconds and formatted by
    datetime.timedelta, e.g. '1:01:01', or '1 day, 0:00:05' once the
    duration reaches a day.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def tokenize_dataset(lines, labels, max_length=64, max_line_length=115):
    '''
    Encodes source lines with the module-level `tokenizer` and packs them with
    their labels into a TensorDataset.

    Lines longer than `max_line_length` characters are skipped together with
    their labels. Every kept line is truncated/padded to `max_length` tokens.

    Args:
        lines: sequence of strings to encode.
        labels: sequence of labels, index-aligned with `lines`.
        max_length: token length handed to the tokenizer for truncation and padding.
        max_line_length: lines with more characters than this are dropped.

    Returns:
        TensorDataset of (input_ids, attention_mask, labels) with one row per
        kept line.

    Raises:
        ValueError: if no line is left after the length filter.
    '''
    token_ids = []
    attention_masks = []
    labels_used = []

    for index, line in enumerate(lines):
        if len(line) > max_line_length:
            continue

        encoding_dict = tokenizer.encode_plus(
            line,
            truncation=True,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )

        token_ids.append(encoding_dict['input_ids'])
        attention_masks.append(encoding_dict['attention_mask'])
        # Keep the label aligned with the line that was actually encoded.
        labels_used.append(labels[index])

    if not token_ids:
        # torch.cat() rejects an empty list with an opaque error; report the
        # real cause instead.
        raise ValueError(
            f'No line left to tokenize: all {len(lines)} line(s) exceed '
            f'{max_line_length} characters or the input is empty.'
        )

    return TensorDataset(
        torch.cat(token_ids, dim=0),
        torch.cat(attention_masks, dim=0),
        torch.tensor(labels_used),
    )

In [None]:
BATCH_SIZE = 16

def build_data_loader_for_dataset_file(dataset_path):
    '''
    Reads a dataset CSV and wraps its tokenized content in a shuffling DataLoader.

    Rows containing missing values are dropped. The `content` column supplies
    the lines and the `value` column (cast to int) the labels.

    Returns:
        (loader, ratio): the DataLoader (batches of BATCH_SIZE, random order)
        and the fraction of remaining rows whose `value` equals 1.
    '''
    print('\t\t\t\tExtracting, tokenizing and building data loader for dataset...', flush=True)
    frame = pd.read_csv(dataset_path).dropna()

    content_lines = frame['content'].to_numpy()
    int_labels = frame['value'].astype(int).to_numpy()
    tensor_dataset = tokenize_dataset(content_lines, int_labels)

    # The ratio is taken over the rows of the CSV, not over the tokenized dataset.
    positive_rows = frame[frame['value'] == 1]
    positive_ratio = len(positive_rows) / len(int_labels)

    loader = DataLoader(
        tensor_dataset,
        batch_size=BATCH_SIZE,
        sampler=RandomSampler(tensor_dataset),
    )
    return loader, positive_ratio

In [None]:
def initialize_model_and_optimizer():
    '''
    Builds a fresh sequence classifier and its AdamW optimizer.

    The classifier is loaded from the 'microsoft/codebert-base' checkpoint with
    two output labels and is moved to the module-level `device`.

    Returns:
        (model, optimizer) tuple.
    '''
    pretrained_options = {
        'num_labels': 2,  # Binary classification; raise for multi-class tasks.
        'output_attentions': False,  # Do not return attention weights.
        'output_hidden_states': False,  # Do not return all hidden states.
    }
    classifier = RobertaForSequenceClassification.from_pretrained(
        'microsoft/codebert-base',
        **pretrained_options
    )

    # Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5.
    # See: https://arxiv.org/pdf/1810.04805.pdf
    adamw = torch.optim.AdamW(classifier.parameters(), lr=5e-5, eps=1e-08)
    classifier.to(device)

    return classifier, adamw

In [None]:
EPOCHS = 4

def train_model(train_file_path):
    '''
    Fine-tunes a freshly initialized model on one training CSV file.

    The file is loaded through build_data_loader_for_dataset_file() and the
    model/optimizer pair comes from initialize_model_and_optimizer(). The
    model is trained for EPOCHS passes over the loader, printing the duration
    and mean batch loss of every epoch.

    Args:
        train_file_path: path of the training CSV.

    Returns:
        The trained model, still in training mode.
    '''
    time_at_train_start = time.time()

    train_loader, _ = build_data_loader_for_dataset_file(train_file_path)

    print('\t\t\t\tTraining model...', flush=True)
    model, optimizer = initialize_model_and_optimizer()
    model.train()
    for epoch in range(EPOCHS):
        print(f'\t\t\t\t\tEpoch {epoch+1} / {EPOCHS}', flush=True)
        time_at_epoch_start = time.time()

        tr_loss = 0
        nb_tr_steps = 0
        for batch in train_loader:
            # Move the whole batch to the device the model lives on.
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            optimizer.zero_grad()
            # Passing labels makes the model return the loss alongside the logits.
            train_output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            train_output.loss.backward()
            optimizer.step()

            tr_loss += train_output.loss.item()
            nb_tr_steps += 1

        training_time = format_time(time.time() - time_at_epoch_start)
        print(f'\t\t\t\t\t\tTraining epoch took: {training_time}', flush=True)
        print(f'\t\t\t\t\t\tTrain loss: {tr_loss / nb_tr_steps:.4f}', flush=True)

    total_train_time = format_time(time.time() - time_at_train_start)
    print(f'\t\t\t\tTotal training time: {total_train_time}', flush=True)

    return model

In [None]:
import evaluate

# Metric scripts kept next to the dataset on the mounted Drive.
SPECIFICITY_FILE = '/content/drive/My Drive/MLinter/utils/specificity.py'
DETAILS_FILE = '/content/drive/My Drive/MLinter/utils/details.py'

# Load both scripts once up front; the returned objects are discarded here.
# NOTE(review): presumably this is what lets 'specificity' and 'details' be
# resolved by name in evaluate.combine(METRICS) — confirm.
evaluate.load(SPECIFICITY_FILE)
evaluate.load(DETAILS_FILE)

# Metric names handed to evaluate.combine() inside evaluate_model().
METRICS = ['accuracy', 'f1', 'precision', 'recall', 'specificity', 'details']

def evaluate_model(model, validation_file_path):
    '''
    Scores `model` on one validation CSV file.

    The model is switched to eval mode and run batch by batch without
    gradients. The arg-max of the logits is the prediction fed, together with
    the reference labels, to the combined METRICS.

    Args:
        model: classifier whose output exposes `.logits`.
        validation_file_path: path of the validation CSV.

    Returns:
        dict mapping each computed metric name to its value rounded to four
        digits, plus 'ratio' (the second value returned by
        build_data_loader_for_dataset_file(), rounded likewise).
    '''
    time_at_validation_start = time.time()

    validation_loader, ratio = build_data_loader_for_dataset_file(validation_file_path)

    print('\t\t\t\t\tEvaluating...', flush=True)
    model.eval()
    combined_metrics = evaluate.combine(METRICS)
    for batch in validation_loader:
        input_ids, input_mask, references = tuple(tensor.to(device) for tensor in batch)

        with torch.no_grad():
            output = model(input_ids, token_type_ids=None, attention_mask=input_mask)

        # Predictions are taken on the CPU copy of the logits.
        cpu_logits = output.logits.detach().cpu()
        predicted_labels = torch.argmax(cpu_logits, dim=-1)

        combined_metrics.add_batch(predictions=predicted_labels, references=references)

    computed = combined_metrics.compute()

    results = {}
    for name in computed:
        rounded = round(computed[name], ndigits=4)
        results[name] = rounded
        print(f'\t\t\t\t\t\t{name}: {rounded}', flush=True)
    results['ratio'] = round(ratio, ndigits=4)

    elapsed = time.time() - time_at_validation_start
    total_validation_time = format_time(elapsed)
    print(f'\t\t\t\tTotal validation time: {total_validation_time}', flush=True)

    return results

In [None]:
# Directory on the mounted Drive that receives one CSV per (rule, method).
RESULTS_DIRECTORY_PATH = '/content/drive/My Drive/MLinter/result'

# Columns written after conf/size/file for every result row, in this order.
METRICS_COMPUTED = ['accuracy', 'f1', 'precision', 'recall', 'specificity', 'TN', 'FP', 'FN', 'TP', 'ratio']

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists, so there is no check-then-create race.
os.makedirs(RESULTS_DIRECTORY_PATH, exist_ok=True)

def write_results(rule, method, conf, size, file, results):
    '''
    Appends one result row to `{RESULTS_DIRECTORY_PATH}/{rule}_{method}.csv`.

    When the file does not exist yet, a header line
    (`conf,size,file` followed by the METRICS_COMPUTED names) is written first.
    The row holds conf, size, file and `results[metric]` for every name in
    METRICS_COMPUTED; a missing metric raises KeyError.
    '''
    results_file_path = f'{RESULTS_DIRECTORY_PATH}/{rule}_{method}.csv'
    needs_header = not os.path.exists(results_file_path)

    # Append mode creates the file when it is missing, so one handle serves
    # both the header and the row.
    with open(results_file_path, 'a') as results_file:
        if needs_header:
            header_cells = ['conf,size,file']
            header_cells.extend(f'{metric}' for metric in METRICS_COMPUTED)
            results_file.write(','.join(header_cells) + '\n')

        row_cells = [f'{conf},{size},{file}']
        row_cells.extend(f'{results[metric]}' for metric in METRICS_COMPUTED)
        results_file.write(','.join(row_cells) + '\n')

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
# `device` is read by the model setup, training and evaluation functions.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

if use_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    print('GPU properties:', torch.cuda.get_device_properties(device))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Load the CodeBERT tokenizer from the 'microsoft/codebert-base' checkpoint.
# tokenize_dataset() reads this module-level `tokenizer` at call time, so this
# cell has to run before any dataset is tokenized.
print('Loading codeBERT tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')

In [None]:
# Local directory into which each rule archive is unzipped.
LOCAL_DATASET_DIRECTORY_PATH = '/content/dataset'

# Every trained model is evaluated once per method; the method name selects the
# `{method}_validation` sub-directory of a dataset size.
VALIDATION_METHODS = ['balanced_corpus', 'ground_truth_with_violation']

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA).
random_state = 20221118
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed_all(random_state)

time_at_start = time.time()

rules = extract_rules_from_drive()

# Split the rules into `execs_number` contiguous slices and keep slice number
# `current_exec` (zero-based). The loop walks the slices from 0 up to
# current_exec, moving the [start, end) window forward each time; the first
# `rules_size % execs_number` slices receive one extra rule.
current_exec = 0
execs_number = 1
rules_size = len(rules)
rules_by_exec = int(rules_size / execs_number)
current_exec_start, current_exec_end = 0, 0
for computing_exec in range(0, current_exec + 1):
    current_exec_start = current_exec_end
    current_exec_end = current_exec_start + rules_by_exec
    if rules_size % execs_number > computing_exec:
        current_exec_end += 1
rules_selected = rules[current_exec_start:current_exec_end]
print(f'{len(rules_selected)} rules selected for this execution.\n')

# Main experiment loop: for every selected rule, unzip its dataset and, for each
# conf / size / numbered training file, train a model, evaluate it with every
# validation method and append the metrics to the result CSVs.
for rule_index, rule in enumerate(rules_selected, 1):
    time_at_rule_start = time.time()
    print('============== ', rule, ' (', rule_index, '/', len(rules_selected), ') ==============', flush=True)

    # IPython shell escapes: drop any previous extraction of this rule, then
    # unzip the rule archive from Drive into the local dataset directory.
    # NOTE(review): the archive path carries no '.zip' suffix — presumably
    # unzip resolves it itself; confirm.
    !rm -rf '{LOCAL_DATASET_DIRECTORY_PATH}/{rule}'
    !unzip -q '{DATASET_DIRECTORY_PATH}/{rule}' -d '{LOCAL_DATASET_DIRECTORY_PATH}'

    # Only the first os.walk() entry is used (see the `break` below), i.e. the
    # immediate sub-directories of the rule directory are the confs.
    for _, confs, _ in os.walk(f'{LOCAL_DATASET_DIRECTORY_PATH}/{rule}'):
        confs.sort(reverse=True)
        for conf in confs:
            time_at_conf_start = time.time()
            print(f'\tConf {conf}', flush=True)

            # Same pattern one level down: immediate sub-directories of a conf
            # are the sizes, visited in reverse sorted order.
            for _, sizes, _ in os.walk(f'{LOCAL_DATASET_DIRECTORY_PATH}/{rule}/{conf}'):
                sizes.sort(reverse=True)
                for size in sizes:
                    time_at_size_start = time.time()
                    print(f'\t\tSize {size}', flush=True)

                    # Training files are named 0.csv, 1.csv, ...; stop at the
                    # first index that does not exist.
                    file_index = 0
                    base_path = f'{LOCAL_DATASET_DIRECTORY_PATH}/{rule}/{conf}/{size}'
                    train_path = f'{base_path}/train'
                    while os.path.exists(f'{train_path}/{file_index}.csv'):
                        time_at_file_start = time.time()
                        print(f'\t\t\tFile {file_index}')

                        train_file_path = f'{train_path}/{file_index}.csv'
                        model = train_model(train_file_path)

                        # The validation file shares the index of the training file.
                        for method in VALIDATION_METHODS:
                            print(f'\n\t\t\t\t{method}...', flush=True)
                            validation_file_path = f'{base_path}/{method}_validation/{file_index}.csv'
                            results = evaluate_model(model, validation_file_path)

                            write_results(rule, method, conf, size, file_index, results)

                        total_file_time = format_time(time.time() - time_at_file_start)
                        print(f'\t\t\tTotal time for file {file_index}: {total_file_time}', flush=True)
                        file_index += 1

                    total_size_time = format_time(time.time() - time_at_size_start)
                    print(f'\t\tTotal time for size {size}: {total_size_time}', flush=True)

                # Do not descend below the conf directory.
                break

            total_conf_time = format_time(time.time() - time_at_conf_start)
            print(f'\tTotal time for conf {conf}: {total_conf_time}', flush=True)

        # Do not descend below the rule directory.
        break

    total_rule_time = format_time(time.time() - time_at_rule_start)
    print(f'============== Total time for {rule}: {total_rule_time} ==============', flush=True)

total_time = format_time(time.time() - time_at_start)
print(f'\nTotal time: {total_time}', flush=True)