In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import LabelEncoder
from lib import bertoutput
from datasets import load_dataset

In [None]:
# Select the compute device once; later cells read the global `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Create the BertClassifier class
class BertClassifier(nn.Module):
    """BERT encoder followed by a single linear layer producing 4 class logits.

    The encoder output is reduced to one feature vector per example by one of
    the helpers in ``bertoutput`` (chosen with ``finetuningsetting``); that
    vector is then projected to the logits by ``self.classifier``.
    """

    # finetuningsetting -> (helper name in `bertoutput`, helper also takes the attention mask)
    _FEATURE_EXTRACTORS = {
        'cls from last layer': ('getCLSfromLastLayer', False),
        'concatenate Last four CLS': ('concatenateLast4CLS', False),
        'concatenate Last 4 Layers': ('concatenateLast4Layers', True),
        'mean pooling': ('mean_pooling', True),
        'mean sqrt pooling': ('mean_sqrt_len_pooling', True),
        'max pooling': ('max_pooling', True),
        'cls from second last layer': ('getCLSfromSecondLastLayer', False),
        'get Pooling Output': ('getPoolingOutput', False),
    }

    def __init__(self, model, freeze_bert = False, finetuningsetting = 'cls from last layer'):
        """
        @param    model: checkpoint name or path handed to `AutoModel.from_pretrained`
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model,
                      `True` to train only the linear classifier
        @param    finetuningsetting (str): which feature extractor to use; one of
                      the keys of `_FEATURE_EXTRACTORS`
        @raise    ValueError: if `finetuningsetting` is not a known setting
        """
        # Fail fast, before loading any weights, instead of at the first forward pass.
        if finetuningsetting not in self._FEATURE_EXTRACTORS:
            raise ValueError(
                f"Unknown finetuningsetting {finetuningsetting!r}; "
                f"expected one of {sorted(self._FEATURE_EXTRACTORS)}"
            )

        super(BertClassifier, self).__init__()

        # Instantiate BERT model (hidden states of every layer are requested
        # because some feature extractors read more than the last layer)
        self.bert = AutoModel.from_pretrained(model, output_hidden_states=True)
        self.finetuningsetting = finetuningsetting

        # Encoder width comes from the checkpoint config; 768 is kept as the fallback.
        # NOTE(review): the 'concatenate ...' settings presumably yield features wider
        # than one hidden state, which would not match this layer - confirm in bertoutput.
        D_in = getattr(self.bert.config, 'hidden_size', 768)
        D_out = 4

        # Instantiate a one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, D_out)
        )

        # Freeze the BERT model so only the classifier receives gradient updates
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        @raise    ValueError: if `self.finetuningsetting` was changed to an unknown value
        """
        try:
            extractor_name, needs_mask = self._FEATURE_EXTRACTORS[self.finetuningsetting]
        except KeyError:
            raise ValueError(
                f"Unknown finetuningsetting {self.finetuningsetting!r}"
            ) from None

        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Reduce the encoder output to one feature vector per example
        extractor = getattr(bertoutput, extractor_name)
        if needs_mask:
            feature = extractor(outputs, attention_mask)
        else:
            feature = extractor(outputs)

        # Feed input to classifier to compute logits
        logits = self.classifier(feature)
        return logits

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(model, lengthTrainDataLoader, epochs=4, outputLayer = 'get Pooling Output'):
    """Build the classifier together with its optimizer and learning-rate scheduler.

    @param    model: checkpoint name or path forwarded to `BertClassifier`
    @param    lengthTrainDataLoader (int): number of batches in one training epoch
    @param    epochs (int): number of epochs the schedule should span
    @param    outputLayer (str): `finetuningsetting` forwarded to `BertClassifier`
    @return   (classifier, optimizer, scheduler)
    """
    # Build the classifier and move it to the globally selected device
    classifier = BertClassifier(model, finetuningsetting=outputLayer)
    classifier.to(device)

    # AdamW over every parameter of the classifier (encoder included)
    adamw = AdamW(classifier.parameters(), lr=5e-5, eps=1e-8)

    # The scheduler is stepped once per batch, so it spans batches * epochs steps
    num_training_steps = lengthTrainDataLoader * epochs
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    return classifier, adamw, lr_schedule

In [None]:
import random
import time
import copy
from tqdm.auto import tqdm

def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (incl. CUDA) with the same value.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    Reads the module-level globals `loss_fn`, `optimizer`, `scheduler` and
    `device`; they must be defined before this function is called.

    @param    model: the classifier to train (updated in place)
    @param    train_dataloader: yields dict batches with the keys 'input_ids',
                  'attention_mask' and 'labels'
    @param    val_dataloader: same batch format; required when `evaluation` is set
    @param    epochs (int): number of passes over `train_dataloader`
    @param    evaluation (bool): evaluate on `val_dataloader` after every epoch
    @return   with `evaluation`: a deep copy of the model from the epoch with the
                  highest validation accuracy; otherwise (or if no epoch produced
                  a comparable accuracy) the trained `model` itself
    @raise    ValueError: if `evaluation` is requested without a `val_dataloader`
    """
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Start training loop
    print("Start training...\n")
    # -inf so that the first evaluated epoch always becomes the initial best model
    best_val_accuracy = float('-inf')
    best_model = None
    num_batches = len(train_dataloader)
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1
            # Load batch to GPU
            batch = {k: v.to(device) for k, v in batch.items()}
            b_input_ids, b_attn_mask, b_labels = batch['input_ids'], batch['attention_mask'], batch['labels']

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels.long())
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / num_batches

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            if val_accuracy > best_val_accuracy:
                print('good model ', val_accuracy)
                best_val_accuracy = val_accuracy
                # Snapshot the weights: `model` keeps changing in later epochs
                best_model = copy.deepcopy(model)
            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")

    # Without a validated snapshot, hand back the trained model rather than None
    # so callers can always run predictions on the result.
    return best_model if best_model is not None else model

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

    Reads the module-level globals `loss_fn` and `device`.

    @param    model: classifier called as `model(input_ids, attention_mask)`
    @param    val_dataloader: yields dict batches with the keys 'input_ids',
                  'attention_mask' and 'labels'
    @return   (val_loss, val_accuracy): mean of the per-batch losses, and the
                  accuracy in percent over all examples; both are NaN when the
                  dataloader yields no batches
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    batch_losses = []
    num_correct, num_examples = 0, 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        batch = {k: v.to(device) for k, v in batch.items()}
        b_input_ids, b_attn_mask = batch['input_ids'], batch['attention_mask']
        # Same integer cast as in train() so both losses are computed identically
        b_labels = batch['labels'].long()

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
        batch_losses.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Count hits per example: averaging per-batch accuracies would over-weight
        # a smaller final batch.
        num_correct += (preds == b_labels).sum().item()
        num_examples += b_labels.numel()

    # Compute the average loss and the overall accuracy over the validation set.
    val_loss = np.mean(batch_losses) if batch_losses else float('nan')
    val_accuracy = 100.0 * num_correct / num_examples if num_examples else float('nan')

    return val_loss, val_accuracy

In [None]:
import torch.nn.functional as F

def bert_predict(model, test_dataloader):
    """Run the trained model over the test set and return class probabilities.

    @param    model: classifier called as `model(input_ids, attention_mask)`
    @param    test_dataloader: yields dict batches with at least the keys
                  'input_ids' and 'attention_mask'
    @return   NumPy array of softmax probabilities, one row per example
    """
    # Evaluation mode disables dropout for deterministic predictions
    model.eval()

    per_batch_logits = []
    with torch.no_grad():
        for batch in test_dataloader:
            # Move every tensor of the batch to the globally selected device
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            per_batch_logits.append(
                model(on_device['input_ids'], on_device['attention_mask'])
            )

    # Stitch the batches back together along the example axis
    stacked_logits = torch.cat(per_batch_logits, dim=0)

    # Softmax over the class axis turns logits into probabilities
    return F.softmax(stacked_logits, dim=1).cpu().numpy()

In [None]:
def tokenize_data(data, MAX_LEN = 80):
    """Tokenize the "UserStory" field, padding/truncating each entry to `MAX_LEN`.

    Uses the module-level `tokenizer`. The tokenizer is called directly rather
    than through `encode_plus`: both give the same result, but `encode_plus`
    does not work with batched `map` calls.
    """
    user_stories = data["UserStory"]
    encoding_options = dict(padding="max_length", truncation=True, max_length=MAX_LEN)
    return tokenizer(user_stories, **encoding_options)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

def computeClassWeight(trainingLabels):
    """Compute 'balanced' class weights for the training labels.

    @param    trainingLabels: label of every training example
    @return   float tensor of per-class weights, placed on the global `device`
    """
    # One weight per distinct label value found in the training data
    distinct_classes = np.unique(trainingLabels)
    balanced_weights = compute_class_weight(
        class_weight='balanced',
        classes=distinct_classes,
        y=trainingLabels,
    )
    print("Class Weights:",balanced_weights)

    # Convert to a float tensor and move it to the device in one go
    return torch.tensor(balanced_weights, dtype=torch.float).to(device)

In [None]:
def evaluation(labels, preds, target_names = ['Capability', 'Hard-goal', 'Soft-goal', 'Task']):
    """Flatten sklearn's classification report into one dict of named metrics.

    @param    labels: true class ids
    @param    preds: predicted class ids
    @param    target_names: display names of the classes; the first four are used
    @return   dict with 'Accuracy' plus precision / recall / F1 per class under
                  the keys '<prefix>P', '<prefix>R' and '<prefix>F1', where the
                  prefixes are 'Cap', 'HG', 'SG' and 'T' for classes 0..3
    """
    report = classification_report(labels, preds, target_names=target_names, zero_division=0, output_dict=True)

    key_prefixes = ('Cap', 'HG', 'SG', 'T')
    stat_fields = (('P', 'precision'), ('R', 'recall'), ('F1', 'f1-score'))

    summary = {'Accuracy': report['accuracy']}
    for class_idx, prefix in enumerate(key_prefixes):
        for suffix, field in stat_fields:
            summary[prefix + suffix] = report[target_names[class_idx]][field]
    return summary

In [None]:
# Checkpoint used for both the tokenizer and the encoder.
# Alternatives: the hub checkpoints 'bert-base-uncased' / 'roberta-base', or the
# local "roberta-base_us" checkpoint under `pretrain_model_dir`.
pretrain_model_dir = "model/"
model = f"{pretrain_model_dir}bert-base-uncased_us"

# Load the tokenizer that belongs to the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(model, do_lower_case=True)

In [None]:
!nvidia-smi

In [None]:
# `google.colab` only exists inside Colab and `drive` is not used below, so a
# missing module must not stop the notebook from running elsewhere.
try:
    from google.colab import drive
except ImportError:
    drive = None
import gc
import sys
sys.path.insert(0,'/content/drive/My Drive/AI and Tutorial/NLP4RE/code/')

data_dir = "../../data/us/newDataset/separate_5_folds_2/"

from torch.utils.data import DataLoader
from sklearn.metrics import classification_report

numOfEpochs = 10
set_seed(42)    # Set seed for reproducibility
results = []    # one dict of metrics per fold, filled by evaluation()
for iteratorDataset in range(1,6):
    print('iterator: ', str(iteratorDataset))

    # Load the train / test CSV pair of this fold
    dataset = load_dataset('csv', data_files={'train': data_dir + 'train_' + str(iteratorDataset) + '.csv', 'test': data_dir + 'test_' + str(iteratorDataset) + '.csv'}, encoding = "utf-8")
    # NOTE(review): the class ids assigned here presumably follow the order of the
    # default `target_names` in evaluation() - verify against the CSV labels.
    dataset = dataset.class_encode_column("Label")
    dataset = dataset.map(tokenize_data, batched=True)

    dataset = dataset.remove_columns(['Unnamed: 0', 'UserStory'])
    dataset = dataset.rename_column("Label", "labels")

    # Class-weighted loss; `loss_fn` is read as a global by train() and evaluate()
    classWeight = computeClassWeight(dataset['train']['labels'])
    loss_fn = nn.CrossEntropyLoss(weight=classWeight)
    dataset.set_format("torch")
    train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=32)
    # NOTE(review): the 'test' split doubles as the validation set used by train()
    # to pick the returned model, so the scores below are optimistic.
    val_dataloader = DataLoader(dataset['test'], batch_size=32)
    print(len(train_dataloader))
    print(len(val_dataloader))

    # `optimizer` and `scheduler` are read as globals by train()
    bert_classifier, optimizer, scheduler = initialize_model(model = model, lengthTrainDataLoader = len(train_dataloader), epochs=numOfEpochs, outputLayer='cls from last layer')
    t = train(bert_classifier , train_dataloader, val_dataloader, epochs=numOfEpochs, evaluation=True)
    if t is None:
        # No snapshot was returned: fall back to the model in its final state
        t = bert_classifier

    # Compute predicted probabilities on the test set and take the most likely class
    probs = bert_predict(t, val_dataloader)
    pred_labels = np.argmax(probs, axis = 1)
    print(classification_report(dataset['test']['labels'], pred_labels))
    results.append(evaluation(dataset['test']['labels'], pred_labels))

    # Drop every reference to this fold's parameters first (the optimizer and the
    # scheduler hold them too); only then can the cached GPU memory be released.
    del bert_classifier, t, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()

# Average every per-fold metric (Accuracy included) over the folds and print
# each mean next to its name, in the key order produced by evaluation().
num_folds = len(results)
if num_folds == 0:
    print("No fold results to aggregate.")
else:
    mean_metrics = {
        metric_name: sum(fold[metric_name] for fold in results) / num_folds
        for metric_name in results[0]
    }
    for metric_name, mean_value in mean_metrics.items():
        print(f"{metric_name}: {mean_value}")

In [None]:
   # https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
    # loss_fn = nn.CrossEntropyLoss(weight=weights)
    # loss_fn = nn.CrossEntropyLoss()

    # https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html
#     loss_fn  = nn.NLLLoss()
    # loss_fn  = nn.NLLLoss(weight=classWeight)