#**Issue Report Classification**

- Using simple Machine Learning algorithms for GitHub Issue Report Classification
- Multi-class classification

##Project Description
- Dataset used: Issue Report Classification competition 2024 dataset
<br/>
Link to dataset: https://github.com/nlbse2024/issue-report-classification
- Classes in dataset:
    - bug
    - feature
    - question

- The dataset was collected from:
    - bitcoin/bitcoin
    - facebook/react
    - microsoft/vscode
    - opencv/opencv
    - tensorflow/tensorflow


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


### XGBoost Classifier

In [None]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from datasets import Dataset

In [None]:
param_grid_xgb = {
    "max_depth": [4,5,6],
    "n_estimators": [200,300],
    "learning_rate": [0.1, 0.2, 0.25],
    "gamma": [10,50,100]
}

In [None]:
train_set = pd.read_csv("preprocessed_issues_train.csv")
test_set = pd.read_csv("preprocessed_issues_test.csv")

In [None]:
train_set = train_set.rename(columns={"issue_text": "text"})
test_set = test_set.rename(columns={"issue_text": "text"})
train_set.columns


Index(['repo', 'text', 'label'], dtype='object')

In [None]:
unique_repo_values = list(set(train_set["repo"]))
unique_repo_values

['facebook/react',
 'tensorflow/tensorflow',
 'bitcoin/bitcoin',
 'microsoft/vscode',
 'opencv/opencv']

In [None]:
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

results = defaultdict(dict)
for repo in unique_repo_values:
    repo_train_set = train_set[train_set['repo']==repo]
    repo_train_set["text"] = repo_train_set["text"].values.astype(str)
    print(repo_train_set.dtypes)
    repo_test_set = test_set[test_set['repo']==repo]
    repo_test_set["text"] = repo_test_set["text"].values.astype(str)
    vectorizer = TfidfVectorizer()
    X_train = repo_train_set["text"]
    X_train = vectorizer.fit_transform(X_train)
    y_train = repo_train_set["label"]
    X_test = repo_test_set["text"]
    X_test = vectorizer.transform(X_test)
    y_test = repo_test_set["label"]
    model = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.25, gamma=10, reg_alpha=0.1,
                          reg_lambda=0.5, eval_metric="mlogloss", num_class=3,
                          objective="multi:softmax", random_state=42)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    results[repo]['metrics'] = classification_report(repo_test_set['label'], predictions, digits=4, output_dict=True)
    results[repo]['predictions'] = predictions.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


In [None]:
import json

for repo in unique_repo_values:
    print(repo)
    print(json.dumps(results[repo]['metrics'], indent=4))

facebook/react
{
    "0": {
        "precision": 0.8846153846153846,
        "recall": 0.92,
        "f1-score": 0.9019607843137256,
        "support": 100
    },
    "1": {
        "precision": 0.6403508771929824,
        "recall": 0.73,
        "f1-score": 0.6822429906542056,
        "support": 100
    },
    "2": {
        "precision": 0.6463414634146342,
        "recall": 0.53,
        "f1-score": 0.5824175824175825,
        "support": 100
    },
    "accuracy": 0.7266666666666667,
    "macro avg": {
        "precision": 0.7237692417410004,
        "recall": 0.7266666666666666,
        "f1-score": 0.7222071191285045,
        "support": 300
    },
    "weighted avg": {
        "precision": 0.7237692417410003,
        "recall": 0.7266666666666667,
        "f1-score": 0.7222071191285044,
        "support": 300
    }
}
tensorflow/tensorflow
{
    "0": {
        "precision": 0.8421052631578947,
        "recall": 0.8,
        "f1-score": 0.8205128205128205,
        "support": 100
    },


In [None]:
class_metrics_sum = defaultdict(defaultdict)
labels = [key for key in results[unique_repo_values[0]]['metrics'].keys() if key.isnumeric()]

for repo in unique_repo_values:
    for label in labels:
        for metric in results[repo]['metrics'][label]:
            class_metrics_sum[label][metric] = class_metrics_sum[label].get(metric, 0) + results[repo]['metrics'][label][metric]

class_metrics_avg = {
    label: {
        metric: class_metrics_sum[label][metric] / len(unique_repo_values)
        for metric in class_metrics_sum[label]
    }
    for label in labels
}

# add the average of the metric over all classes
class_metrics_avg['average'] = {
    metric: sum(class_metrics_avg[label][metric] for label in labels)
    / len(labels)
    for metric in class_metrics_avg[labels[0]]
}

# add to the results
results['overall'] = {
    'metrics': class_metrics_avg
}


In [None]:
import os

output_file_name = 'xgboost_results.json'
with open(output_file_name, 'w') as fp:
    json.dump(results, fp)

### Random Forest Classifier

In [None]:
# importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
train_set = pd.read_csv("preprocessed_issues_train.csv")
test_set = pd.read_csv("preprocessed_issues_test.csv")

In [None]:
train_set = train_set.rename(columns={"issue_text": "text"})
test_set = test_set.rename(columns={"issue_text": "text"})
train_set.columns

Index(['repo', 'text', 'label'], dtype='object')

In [None]:
unique_repo_values = list(set(train_set["repo"]))
unique_repo_values

['facebook/react',
 'tensorflow/tensorflow',
 'bitcoin/bitcoin',
 'microsoft/vscode',
 'opencv/opencv']

In [43]:
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

results = defaultdict(dict)
for repo in unique_repo_values:
    repo_train_set = train_set[train_set['repo']==repo]
    repo_train_set["text"] = repo_train_set["text"].values.astype(str)
    print(repo_train_set.dtypes)
    repo_test_set = test_set[test_set['repo']==repo]
    repo_test_set["text"] = repo_test_set["text"].values.astype(str)
    vectorizer = TfidfVectorizer()
    X_train = repo_train_set["text"]
    X_train = vectorizer.fit_transform(X_train)
    y_train = repo_train_set["label"]
    X_test = repo_test_set["text"]
    X_test = vectorizer.transform(X_test)
    y_test = repo_test_set["label"]
    rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=42)
    rf_classifier.fit(X_train, y_train)
    predictions = rf_classifier.predict(X_test)
    results[repo]['metrics'] = classification_report(repo_test_set['label'], predictions, digits=4, output_dict=True)
    results[repo]['predictions'] = predictions.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_train_set["text"] = repo_train_set["text"].values.astype(str)


repo     object
text     object
label     int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  repo_test_set["text"] = repo_test_set["text"].values.astype(str)


In [44]:
import json

for repo in unique_repo_values:
    print(repo)
    print(json.dumps(results[repo]['metrics'], indent=4))

facebook/react
{
    "0": {
        "precision": 0.9134615384615384,
        "recall": 0.95,
        "f1-score": 0.9313725490196078,
        "support": 100
    },
    "1": {
        "precision": 0.6434782608695652,
        "recall": 0.74,
        "f1-score": 0.6883720930232557,
        "support": 100
    },
    "2": {
        "precision": 0.6419753086419753,
        "recall": 0.52,
        "f1-score": 0.574585635359116,
        "support": 100
    },
    "accuracy": 0.7366666666666667,
    "macro avg": {
        "precision": 0.732971702657693,
        "recall": 0.7366666666666667,
        "f1-score": 0.7314434258006598,
        "support": 300
    },
    "weighted avg": {
        "precision": 0.732971702657693,
        "recall": 0.7366666666666667,
        "f1-score": 0.7314434258006598,
        "support": 300
    }
}
tensorflow/tensorflow
{
    "0": {
        "precision": 0.75,
        "recall": 0.81,
        "f1-score": 0.7788461538461539,
        "support": 100
    },
    "1": {
     

In [45]:
class_metrics_sum = defaultdict(defaultdict)
labels = [key for key in results[unique_repo_values[0]]['metrics'].keys() if key.isnumeric()]

for repo in unique_repo_values:
    for label in labels:
        for metric in results[repo]['metrics'][label]:
            class_metrics_sum[label][metric] = class_metrics_sum[label].get(metric, 0) + results[repo]['metrics'][label][metric]

class_metrics_avg = {
    label: {
        metric: class_metrics_sum[label][metric] / len(unique_repo_values)
        for metric in class_metrics_sum[label]
    }
    for label in labels
}

# add the average of the metric over all classes
class_metrics_avg['average'] = {
    metric: sum(class_metrics_avg[label][metric] for label in labels)
    / len(labels)
    for metric in class_metrics_avg[labels[0]]
}

# add to the results
results['overall'] = {
    'metrics': class_metrics_avg
}

In [46]:
import os

output_file_name = 'random_forest_results.json'
with open(output_file_name, 'w') as fp:
    json.dump(results, fp)