In [1]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer


from tqdm.notebook import tqdm
from tqdm.auto import tqdm

from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [2]:
import os
import numpy as np
import pandas as pd
import gensim

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import random
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import KFold
from numpy import mean
import time
import psutil

In [None]:
# Pick the compute device: the first CUDA GPU when one is visible to PyTorch,
# otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)
if device.type == 'cuda':
    # Report which GPU is used and how much of its memory PyTorch holds already.
    print(torch.cuda.get_device_name(0))
    print('Memory usage of GPU:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**2), 'MBs')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**2), 'MBs')
    # Release cached blocks left over from earlier runs in the same kernel.
    torch.cuda.empty_cache()
else:
    # Calling exit(0) here would terminate the notebook kernel without any
    # explanation. `device` is already a valid CPU device, so warn and go on.
    print('WARNING: no CUDA device found; continuing on CPU (training will be slow).')

In [4]:
# Wall-clock reference point. The minutes elapsed since this line ran are
# printed after the cross-validation loop, so the reported "training time"
# also covers data loading and preprocessing.
start_time = time.time()

In [None]:
# Load the issue dataset (path is relative to the working directory) and
# display its (rows, columns) shape as the cell output.
df = pd.read_csv("NLBSE_sample_10k_cleaned.csv")
df.shape

In [7]:
# Rename the raw columns to the names the rest of the notebook uses:
# 'text' -> 'issue_data' and 'labels' -> 'issue_label'. Show one row to verify.
df = df.rename(columns={'text': 'issue_data', 'labels': 'issue_label'})
df.head(1)

Unnamed: 0,issue_data,issue_label
0,add api design document design document outlin...,0


In [None]:
# Turn the numeric class codes into readable names (1 -> 'valid', 0 -> 'invalid').
# A single replace() builds a new column instead of writing strings into the
# integer column through .loc, which depends on implicit dtype upcasting that
# pandas has deprecated. Values other than 0/1 are left untouched, as before.
df['issue_label'] = df['issue_label'].replace({1: 'valid', 0: 'invalid'})

In [None]:
# Show the class balance, then number the distinct labels in order of first
# appearance: label_dict maps label name -> integer id.
print(df['issue_label'].value_counts())
possible_labels = df.issue_label.unique()

label_dict = {name: position for position, name in enumerate(possible_labels)}
print(label_dict)

In [None]:
# Integer class id for every row. map() + an explicit int64 cast guarantees a
# numeric column (needed later by torch.tensor); replace() on a string column
# only produced integers through pandas' deprecated silent downcasting.
# A label missing from label_dict becomes NaN and makes the cast fail loudly.
df['label'] = df['issue_label'].map(label_dict).astype('int64')

In [11]:
# preprocessing can be customized by participants
def preprocess(row):
    """Normalize the text of a single issue.

    Args:
        row: object exposing an ``issue_data`` attribute (e.g. a DataFrame row).

    Returns:
        The text converted to ``str``, lowercased and passed through the
        gensim preprocessing steps listed below, in that order.
    """
    steps_module = gensim.parsing.preprocessing
    pipeline = (
        steps_module.strip_punctuation,           # remove punctuation
        steps_module.strip_multiple_whitespaces,  # collapse whitespace runs, tabs -> spaces
        steps_module.remove_stopwords,            # remove stop-words
        steps_module.stem_text,                   # make stems
        steps_module.strip_multiple_whitespaces,  # tidy whitespace once more
    )

    text = str(row.issue_data).lower()
    for step in pipeline:
        text = step(text)
    return text

In [None]:
# Clean every issue text row by row.
df['issue_data'] = df.apply(preprocess, axis=1)

# Keep only the three columns used from here on and continue with an
# independent copy of that selection.
newDF = df.loc[:, ['issue_label', 'issue_data', 'label']]
df = newDF.copy()
print(df.head(2))

In [13]:
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the RoBERTa tokenizer
#tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

# Load the tokenizer
#tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

In [14]:
#model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict), output_attentions=False,
                                                      #output_hidden_states=False)

# Load the RoBERTa model for sequence classification
#model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_dict), output_attentions=False,
                                                          #output_hidden_states=False)

# Define the model for sequence classification
#model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_dict),
                                                      #output_attentions=False, output_hidden_states=False)

In [15]:
def f1_score_func(preds, labels):
    """Weighted F1 of the arg-max class predictions.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: ndarray of true class ids (flattened before scoring).

    Returns:
        The value of ``f1_score(..., average='weighted')``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def get_roc_auc_func(preds, labels):
    """ROC AUC of the model scores against the true labels.

    Two defects of the previous version are fixed here:
      * ``roc_auc_score`` expects ``(y_true, y_score)``; the arguments were
        passed the other way round, so the predictions were treated as ground
        truth (and scoring failed whenever the model predicted one class only).
      * The scores were thresholded with arg-max first, which collapses the
        ROC curve to a single operating point. Continuous scores are used now.

    Args:
        preds: array-like of shape (n_samples, n_classes) holding per-class
            scores such as the logits returned by ``evaluate``.
        labels: array-like of integer class ids; flattened before scoring.

    Returns:
        The ROC AUC as returned by ``sklearn.metrics.roc_auc_score``.
    """
    scores = np.asarray(preds, dtype=float)
    labels_flat = np.asarray(labels).flatten()

    if scores.shape[1] == 2:
        # For two classes the logit margin is a monotonic function of the
        # softmax probability of class 1, hence it yields the same AUC.
        positive_score = scores[:, 1] - scores[:, 0]
        return roc_auc_score(labels_flat, positive_score)

    # More than two classes: roc_auc_score needs per-class probabilities that
    # sum to one, so apply a numerically stable softmax and score one-vs-rest.
    shifted = scores - scores.max(axis=1, keepdims=True)
    probabilities = np.exp(shifted)
    probabilities /= probabilities.sum(axis=1, keepdims=True)
    return roc_auc_score(labels_flat, probabilities, multi_class='ovr')
    
def evaluate(model, dataloader_val):
    """Score ``model`` on every batch of ``dataloader_val`` without gradients.

    Each batch must be a sequence whose first three items are the input ids,
    the attention mask and the labels; they are moved to the global ``device``.
    The model is called with those three keyword arguments and must return a
    sequence whose item 0 is the loss and item 1 the logits.

    Args:
        model: the classifier to evaluate; switched to eval mode here.
        dataloader_val: sized iterable of batches.

    Returns:
        ``(mean_loss, logits, labels)``: the loss averaged over the number of
        batches, and the logits / labels of all batches concatenated along
        axis 0 as numpy arrays.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in raw_batch)
        model_kwargs = dict(
            input_ids=on_device[0],
            attention_mask=on_device[1],
            labels=on_device[2],
        )

        with torch.no_grad():
            result = model(**model_kwargs)

        batch_loss, batch_logits = result[0], result[1]
        running_loss += batch_loss.item()

        # Collect everything on the CPU as numpy so metrics can be computed later.
        logit_chunks.append(batch_logits.detach().cpu().numpy())
        label_chunks.append(model_kwargs['labels'].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks, axis=0)

    return mean_loss, all_logits, all_labels

In [None]:
# 10-fold cross-validation. Every fold fine-tunes a freshly loaded roberta-base
# classifier on the training split and scores it on the held-out split after
# each epoch.
num_folds = 10
batch_size = 8
epochs = 4
seed_val = 17

auc_scores = []      # per fold: AUC at the epoch with the lowest validation loss
auc_scores_1 = []    # per fold: highest AUC reached in any epoch
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Training never modifies the tokenizer, so it is loaded once instead of once
# per fold.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)


def _encode_split(frame):
    """Tokenize ``frame.issue_data`` and pack ids, masks and labels into a TensorDataset."""
    encoded = tokenizer.batch_encode_plus(frame.issue_data.values, add_special_tokens=True,
                                          return_attention_mask=True, padding='longest',
                                          truncation=True, return_tensors='pt')
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'],
                         torch.tensor(frame.label.values))


for fold, (train_index, test_index) in enumerate(kf.split(df), start=1):
    print("Fold:", fold)

    # Seed every RNG *before* the model is built. Seeding only after model
    # creation leaves any randomly initialised weights of the first fold
    # dependent on the unseeded generator state, i.e. not reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # A new model per fold so that nothing learned on one split leaks into the next.
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=len(label_dict),
                                                             output_attentions=False,
                                                             output_hidden_states=False)
    # Move the model to the target device before the optimizer captures its parameters.
    model.to(device)

    # Tag every row with its role in this fold. kf.split yields positions, so
    # they are translated to index labels for .loc.
    df['data_type'] = 'not_set'
    df.loc[df.index[train_index], 'data_type'] = 'train'
    df.loc[df.index[test_index], 'data_type'] = 'val'

    # Encode both splits and wrap them as tensor datasets.
    dataset_train = _encode_split(df[df.data_type == 'train'])
    dataset_val = _encode_split(df[df.data_type == 'val'])
    print(len(dataset_train), len(dataset_val))

    # Shuffle the training batches; keep the validation order fixed.
    dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
    dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
    #optimizer = AdamW(model.parameters(), lr=1e-3, eps=1e-8)

    # Linear decay of the learning rate over all optimisation steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=len(dataloader_train)*epochs)

    best_val_loss = float('inf')  # lowest validation loss seen in this fold
    best_epoch = 0                # epoch that achieved best_val_loss
    best_epoch_auc = 0.0          # AUC measured at best_epoch
    best_epoch_auc_1 = 0.0        # highest AUC of any epoch in this fold

    for epoch in tqdm(range(1, epochs+1)):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            batch = tuple(b.to(device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)

            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            # Show the batch loss as returned by the model. It used to be
            # divided by len(batch), but `batch` is the tuple of three tensors
            # (ids, mask, labels), so that divided by 3, not by a sample count.
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

        #torch.save(model.state_dict(), f'models/finetuned_BERT_epoch_{epoch}.model')

        tqdm.write(f'\nEpoch {epoch}')

        loss_train_avg = loss_train_total/len(dataloader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_vals = evaluate(model, dataloader_validation)
        val_f1 = f1_score_func(predictions, true_vals)
        val_auc = get_roc_auc_func(predictions, true_vals)

        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        tqdm.write(f'AUC: {val_auc}')

        # Remember the epoch with the lowest validation loss and its AUC.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            best_epoch_auc = val_auc
            # Save the model
            #torch.save(model.state_dict(), f'models/best_model_finetuned_BESTDistillBERT.model')
            #torch.save(model.state_dict(), f'models/best_model_finetuned_BESTRoberta.model')
            #torch.save(model.state_dict(), f'models/best_model_finetuned_BESTBERT.model')

        # Independently track the best AUC of any epoch.
        best_epoch_auc_1 = max(best_epoch_auc_1, val_auc)

    # Per-fold summary.
    print(f"Best model found at epoch {best_epoch} with validation loss: {best_val_loss}")
    auc_scores.append(best_epoch_auc)
    auc_scores_1.append(best_epoch_auc_1)
    print("AUC of best epoch:", best_epoch_auc)
    print("Best AUC across epochs:", best_epoch_auc_1)

In [None]:
# Final summary: mean over folds of each fold's best AUC, the wall-clock
# minutes elapsed since start_time was taken, and the resident memory of this
# process.
#print("Mean AUC:",mean(auc_scores))
print("Mean of best AUC across epochs:", mean(auc_scores_1))

elapsed_seconds = time.time() - start_time
print("Training time in mintues:", elapsed_seconds / 60)

# Resident set size (RSS) of the current process, converted from bytes to MiB.
pid = os.getpid()
memory_usage_in_bytes = psutil.Process(pid).memory_info().rss
memory_usage_in_megabytes = memory_usage_in_bytes / 1024**2

print("RAM in MBS:", memory_usage_in_megabytes)

In [None]:
# Report the GPU memory held by PyTorch after training. The CUDA queries raise
# on a machine without a GPU, so they only run when a CUDA device is available.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
    print('Memory usage of GPU:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**2), 'MBs')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**2), 'MBs')
else:
    print('No CUDA device available; GPU memory statistics skipped.')