#**Issue Report Classification using Fine-tuned BERT-based classifier**

- This notebook contains the code for using finetuning DistilBERT for multi-class classification.

## Implementation details

* Model used: distil-bert-uncased
* MAX_LEN: 512
* TRAIN_BATCH_SIZE: 4
* VALID_BATCH_SIZE: 2
* EPOCHS: 1
* LEARNING_RATE: 1e-05
* Tokenizer: DistilBertTokenizer
* Loss function: Cross-entropy
* Optimizer: Adam


### Installing required libraries

In [1]:
 !pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


### Importing necessary libraries

In [2]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset as HuggingFaceDataset
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.metrics import classification_report

In [3]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Loading the datasets

In [4]:
# # Reading files from mounted Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

# data_dir = "drive/MyDrive/ml_for_mde_project/data"
# train_datapath = data_dir + "/preprocessed_issues_train.csv"
# test_datapath = data_dir + "/preprocessed_issues_test.csv"

# train_set = pd.read_csv(train_datapath)
# test_set = pd.read_csv(test_datapath)

In [5]:
train_set = pd.read_csv("https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_train.csv")
test_set = pd.read_csv("https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_test.csv")

Loading the preprocessed train and test datasets

In [6]:
train_set = train_set.rename(columns={"issue_text": "text"})
test_set = test_set.rename(columns={"issue_text": "text"})
train_set.columns

Index(['repo', 'text', 'label'], dtype='object')

In [7]:
repos = list(set(train_set["repo"].unique()))
print(repos)

['tensorflow/tensorflow', 'facebook/react', 'bitcoin/bitcoin', 'microsoft/vscode', 'opencv/opencv']


In [8]:
train_set.groupby(["repo", "label"]).size().unstack(fill_value=0)

label,0,1,2
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [9]:
group_by_repo = lambda dataset: {
    repo: dataset[dataset["repo"] == repo]
    for repo in dataset["repo"].unique()
}

train_sets = group_by_repo(train_set)
test_sets = group_by_repo(test_set)

In [10]:
datasets = {
    repo: {'train': train_sets[repo], 'test': test_sets[repo]} for repo in train_sets.keys()
}

### Defining key training variables

In [11]:
# Defining key variables for training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

#### Triage Dataset Class <br/>

* The Triage class is defined to accept the Dataframe as input and generate tokenized output that is used by the DistilBERT model for training.
* The DistilBERT tokenizer is used to tokenize the data in the issue_text column of the dataframe.
* The tokenizer uses the encode_plus method to perform tokenization and generate the necessary outputs, namely: ids and attention_mask.

In [12]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len

### Definining the network model

* Added a drop out layer and a dense layer on top of distil bert to get the final output for the model.


In [13]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [14]:
model = DistillBERTClass()
model.to(device)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

### Defining the loss function and optimizer

In [15]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [17]:
def loss_fn(outputs, targets):
    return loss_function(outputs, targets)

### Fine-tuning

In [18]:
# Defining the training function on the dataset for tuning the distilbert model

def train(train_loader):
    model.train()
    final_targets = []
    final_outputs = []
    for _,data in enumerate(train_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_fn(outputs, targets)
        if _%500==0:
            print(f'Epoch: {1}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

    return model

In [19]:
def valid(model, testing_loader):
    model.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            final_targets.extend(targets.cpu().detach().numpy().tolist())
            final_outputs.extend(big_idx.cpu().detach().numpy().tolist())

    return final_targets, final_outputs

In [20]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

In [21]:
from collections import defaultdict

results = defaultdict(dict)
for repo in datasets.keys():
    train_set, test_set = datasets[repo]['train'], datasets[repo]['test']
    train_set.index = range(0, len(train_set))
    test_set.index = range(0, len(test_set))
    training_set = Triage(train_set, tokenizer, MAX_LEN)
    testing_set = Triage(test_set, tokenizer, MAX_LEN)
    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    model = train(training_loader)
    labels, predictions = valid(model, testing_loader)
    results[repo]['metrics'] = classification_report(labels, predictions, digits=4, output_dict=True)
    results[repo]['predictions'] = predictions
    # results['label_mapping']= {train_set.features["label"].int2str(x): x for x in range(train_set.features["label"].num_classes)}

    repo_name = repo.split('/')[1]
    output_model_file = f"distilbert_{repo_name}.bin"
    torch.save(model, output_model_file)



Epoch: 1, Loss:  1.4022746086120605


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1, Loss:  0.5847713351249695




Epoch: 1, Loss:  0.9736608266830444


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1, Loss:  1.268402099609375


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1, Loss:  1.3362524509429932


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Displaying the output

In [22]:
import json

for repo in repos:
    print(repo)
    print(json.dumps(results[repo]['metrics'], indent=4))

tensorflow/tensorflow
{
    "0": {
        "precision": 0.41201716738197425,
        "recall": 0.96,
        "f1-score": 0.5765765765765767,
        "support": 100
    },
    "1": {
        "precision": 0.5555555555555556,
        "recall": 0.05,
        "f1-score": 0.09174311926605504,
        "support": 100
    },
    "2": {
        "precision": 0.27586206896551724,
        "recall": 0.16,
        "f1-score": 0.20253164556962025,
        "support": 100
    },
    "accuracy": 0.39,
    "macro avg": {
        "precision": 0.41447826396768234,
        "recall": 0.38999999999999996,
        "f1-score": 0.29028378047075065,
        "support": 300
    },
    "weighted avg": {
        "precision": 0.4144782639676823,
        "recall": 0.39,
        "f1-score": 0.2902837804707506,
        "support": 300
    }
}
facebook/react
{
    "0": {
        "precision": 0.7121212121212122,
        "recall": 0.94,
        "f1-score": 0.810344827586207,
        "support": 100
    },
    "1": {
        "p

In [23]:
class_metrics_sum = defaultdict(defaultdict)
labels = [key for key in results[repos[0]]['metrics'].keys() if key.isnumeric()]

for repo in repos:
    for label in labels:
        for metric in results[repo]['metrics'][label]:
            class_metrics_sum[label][metric] = class_metrics_sum[label].get(metric, 0) + results[repo]['metrics'][label][metric]

class_metrics_avg = {
    label: {
        metric: class_metrics_sum[label][metric] / len(repos)
        for metric in class_metrics_sum[label]
    }
    for label in labels
}

# add the average of the metric over all classes
class_metrics_avg['average'] = {
    metric: sum(class_metrics_avg[label][metric] for label in labels)
    / len(labels)
    for metric in class_metrics_avg[labels[0]]
}

# add to the results
results['overall'] = {
    'metrics': class_metrics_avg
}


### Saving the output

In [24]:
import os

output_file_name = 'distilbert_results.json'
with open(output_file_name, 'w') as fp:
    json.dump(results, fp)

### Comparing with the State-of-the-Art

In [25]:
import urllib.request

your_url = 'https://github.com/nlbse2024/issue-report-classification/raw/main/output/results.json'
with urllib.request.urlopen(your_url) as url:
    sota_data = json.loads(url.read().decode())

In [26]:
comparison_data = []

comparison_data.append(results["overall"]["metrics"]["average"])
comparison_data[-1]["process"] = "Fine-tuned DistilBERT"
comparison_data.append(sota_data["overall"]["metrics"]["average"])
comparison_data[-1]["process"] = "SOTA"

comparison_df = pd.DataFrame(comparison_data)
comparison_df

Unnamed: 0,precision,recall,f1-score,support,process
0,0.231561,0.398667,0.255363,100.0,Fine-tuned DistilBERT
1,0.830455,0.826667,0.827046,100.0,SOTA
