#**Issue Report Classification using Traditional Machine Learning methods**

- This notebook contains the code for using traditional machine learning methods for multi-class issue report classification.

## Implementation details

### Algorithms used:
* XGBoost multi-class classification
* Random Forest multi-class classification


### Installing required libraries

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


## XGBoost Classifier

### Importing necessary libraries

In [2]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from datasets import Dataset

In [3]:
param_grid_xgb = {
    "max_depth": [4,5,6],
    "n_estimators": [200,300],
    "learning_rate": [0.1, 0.2, 0.25],
    "gamma": [10,50,100]
}

### Loading the datasets

In [4]:
train_file_path = r"https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_train.csv"
test_file_path = r"https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_test.csv"

In [5]:
train_set = pd.read_csv(train_file_path)
test_set = pd.read_csv(test_file_path)

In [6]:
train_set = train_set.rename(columns={"issue_text": "text"})
test_set = test_set.rename(columns={"issue_text": "text"})
train_set.columns


Index(['repo', 'text', 'label'], dtype='object')

In [7]:
unique_repo_values = list(set(train_set["repo"]))
unique_repo_values

['microsoft/vscode',
 'facebook/react',
 'opencv/opencv',
 'tensorflow/tensorflow',
 'bitcoin/bitcoin']

### Training and Inference

In [8]:
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

xgboost_results = defaultdict(dict)
for repo in unique_repo_values:
    repo_train_set = train_set[train_set['repo']==repo]
    repo_train_set["text"] = repo_train_set["text"].values.astype(str)
    repo_test_set = test_set[test_set['repo']==repo]
    repo_test_set["text"] = repo_test_set["text"].values.astype(str)
    vectorizer = TfidfVectorizer()
    X_train = repo_train_set["text"]
    X_train = vectorizer.fit_transform(X_train)
    y_train = repo_train_set["label"]
    X_test = repo_test_set["text"]
    X_test = vectorizer.transform(X_test)
    y_test = repo_test_set["label"]
    model = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.25, gamma=10, reg_alpha=0.1,
                          reg_lambda=0.5, eval_metric="mlogloss", num_class=3,
                          objective="multi:softmax", random_state=42)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    xgboost_results[repo]['metrics'] = classification_report(repo_test_set['label'], predictions, digits=4, output_dict=True)
    xgboost_results[repo]['predictions'] = predictions.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A valu

### Displaying the results

In [9]:
import json

for repo in unique_repo_values:
    print(repo)
    print(json.dumps(xgboost_results[repo]['metrics'], indent=4))

microsoft/vscode
{
    "0": {
        "precision": 0.572463768115942,
        "recall": 0.79,
        "f1-score": 0.6638655462184875,
        "support": 100
    },
    "1": {
        "precision": 0.6666666666666666,
        "recall": 0.44,
        "f1-score": 0.5301204819277109,
        "support": 100
    },
    "2": {
        "precision": 0.7291666666666666,
        "recall": 0.7,
        "f1-score": 0.7142857142857142,
        "support": 100
    },
    "accuracy": 0.6433333333333333,
    "macro avg": {
        "precision": 0.6560990338164251,
        "recall": 0.6433333333333333,
        "f1-score": 0.6360905808106375,
        "support": 300
    },
    "weighted avg": {
        "precision": 0.656099033816425,
        "recall": 0.6433333333333333,
        "f1-score": 0.6360905808106375,
        "support": 300
    }
}
facebook/react
{
    "0": {
        "precision": 0.8846153846153846,
        "recall": 0.92,
        "f1-score": 0.9019607843137256,
        "support": 100
    },
    "1"

In [10]:
class_metrics_sum = defaultdict(defaultdict)
labels = [key for key in xgboost_results[unique_repo_values[0]]['metrics'].keys() if key.isnumeric()]

for repo in unique_repo_values:
    for label in labels:
        for metric in xgboost_results[repo]['metrics'][label]:
            class_metrics_sum[label][metric] = class_metrics_sum[label].get(metric, 0) + xgboost_results[repo]['metrics'][label][metric]

class_metrics_avg = {
    label: {
        metric: class_metrics_sum[label][metric] / len(unique_repo_values)
        for metric in class_metrics_sum[label]
    }
    for label in labels
}

# add the average of the metric over all classes
class_metrics_avg['average'] = {
    metric: sum(class_metrics_avg[label][metric] for label in labels)
    / len(labels)
    for metric in class_metrics_avg[labels[0]]
}

# add to the xgboost_results
xgboost_results['overall'] = {
    'metrics': class_metrics_avg
}


### Saving the results

In [11]:
import os

output_file_name = 'xgboost_results.json'
with open(output_file_name, 'w') as fp:
    json.dump(xgboost_results, fp)

## Random Forest Classifier

### Importing necessary libraries

In [12]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

### Loading the datasets

In [13]:
train_file_path = r"https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_train.csv"
test_file_path = r"https://github.com/lhamu/issue-report-classification/raw/main/preprocessed_data/preprocessed_issues_test.csv"

In [14]:
train_set = pd.read_csv(train_file_path)
test_set = pd.read_csv(test_file_path)

In [15]:
train_set = train_set.rename(columns={"issue_text": "text"})
test_set = test_set.rename(columns={"issue_text": "text"})
train_set.columns

Index(['repo', 'text', 'label'], dtype='object')

In [16]:
unique_repo_values = list(set(train_set["repo"]))
unique_repo_values

['microsoft/vscode',
 'facebook/react',
 'opencv/opencv',
 'tensorflow/tensorflow',
 'bitcoin/bitcoin']

### Training and Inference

In [17]:
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

results = defaultdict(dict)
for repo in unique_repo_values:
    repo_train_set = train_set[train_set['repo']==repo]
    repo_train_set["text"] = repo_train_set["text"].values.astype(str)
    print(repo_train_set.dtypes)
    repo_test_set = test_set[test_set['repo']==repo]
    repo_test_set["text"] = repo_test_set["text"].values.astype(str)
    vectorizer = TfidfVectorizer()
    X_train = repo_train_set["text"]
    X_train = vectorizer.fit_transform(X_train)
    y_train = repo_train_set["label"]
    X_test = repo_test_set["text"]
    X_test = vectorizer.transform(X_test)
    y_test = repo_test_set["label"]
    rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
    rf_classifier.fit(X_train, y_train)
    predictions = rf_classifier.predict(X_test)
    results[repo]['metrics'] = classification_report(repo_test_set['label'], predictions, digits=4, output_dict=True)
    results[repo]['predictions'] = predictions.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


### Displaying the results

In [18]:
import json

for repo in unique_repo_values:
    print(repo)
    print(json.dumps(results[repo]['metrics'], indent=4))

microsoft/vscode
{
    "0": {
        "precision": 0.5737704918032787,
        "recall": 0.7,
        "f1-score": 0.6306306306306306,
        "support": 100
    },
    "1": {
        "precision": 0.625,
        "recall": 0.5,
        "f1-score": 0.5555555555555556,
        "support": 100
    },
    "2": {
        "precision": 0.6938775510204082,
        "recall": 0.68,
        "f1-score": 0.686868686868687,
        "support": 100
    },
    "accuracy": 0.6266666666666667,
    "macro avg": {
        "precision": 0.6308826809412289,
        "recall": 0.6266666666666666,
        "f1-score": 0.6243516243516244,
        "support": 300
    },
    "weighted avg": {
        "precision": 0.6308826809412289,
        "recall": 0.6266666666666667,
        "f1-score": 0.6243516243516244,
        "support": 300
    }
}
facebook/react
{
    "0": {
        "precision": 0.9134615384615384,
        "recall": 0.95,
        "f1-score": 0.9313725490196078,
        "support": 100
    },
    "1": {
        "

In [19]:
class_metrics_sum = defaultdict(defaultdict)
labels = [key for key in results[unique_repo_values[0]]['metrics'].keys() if key.isnumeric()]

for repo in unique_repo_values:
    for label in labels:
        for metric in results[repo]['metrics'][label]:
            class_metrics_sum[label][metric] = class_metrics_sum[label].get(metric, 0) + results[repo]['metrics'][label][metric]

class_metrics_avg = {
    label: {
        metric: class_metrics_sum[label][metric] / len(unique_repo_values)
        for metric in class_metrics_sum[label]
    }
    for label in labels
}

# add the average of the metric over all classes
class_metrics_avg['average'] = {
    metric: sum(class_metrics_avg[label][metric] for label in labels)
    / len(labels)
    for metric in class_metrics_avg[labels[0]]
}

# add to the results
results['overall'] = {
    'metrics': class_metrics_avg
}

### Saving the results

In [20]:
import os

output_file_name = 'random_forest_results.json'
with open(output_file_name, 'w') as fp:
    json.dump(results, fp)

## Comparing with the State-of-the-Art

In [21]:
import urllib.request

your_url = 'https://github.com/nlbse2024/issue-report-classification/raw/main/output/results.json'
with urllib.request.urlopen(your_url) as url:
    sota_data = json.loads(url.read().decode())

In [22]:
comparison_data = []

comparison_data.append(xgboost_results["overall"]["metrics"]["average"])
comparison_data[-1]["process"] = "XGBoost"
comparison_data.append(results["overall"]["metrics"]["average"])
comparison_data[-1]["process"] = "Random Forest"
comparison_data.append(sota_data["overall"]["metrics"]["average"])
comparison_data[-1]["process"] = "SOTA"

comparison_df = pd.DataFrame(comparison_data)
comparison_df

Unnamed: 0,precision,recall,f1-score,support,process
0,0.669943,0.665333,0.66125,100.0,XGBoost
1,0.656971,0.657333,0.653641,100.0,Random Forest
2,0.830455,0.826667,0.827046,100.0,SOTA
