In [None]:
#required packages
#pip install sklearn
#pip install gensim

In [2]:
#Imports required packages
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,f1_score
from sklearn.metrics import roc_curve,auc,roc_auc_score
import time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

In [None]:
#Data links
#Training Data: https://tickettagger.blob.core.windows.net/datasets/nlbse23-issue-classification-train.csv.tar.gz
#Testing Data: https://tickettagger.blob.core.windows.net/datasets/nlbse23-issue-classification-test.csv.tar.gz

#Please download the required training and testing data and ensure they are present and in the same directory as the code

In [3]:
# load training data
traindf = pd.read_csv('nlbse23-issue-classification-train.csv')

# load testing data
testdf = pd.read_csv('nlbse23-issue-classification-test.csv') 

In [4]:
# checking the shape of the data
traindf.shape,testdf.shape

((1275881, 5), (142320, 5))

In [5]:
#merge title and body of training data
traindf['Summary'] = traindf["title"]+" "+traindf["body"]
# removing null
traindf = traindf[pd.notnull(traindf['Summary'])]

#merge title and body of testing data
testdf['Summary'] = testdf["title"]+" "+testdf["body"]

# removing null
testdf = testdf[pd.notnull(testdf['Summary'])]

In [6]:
# defining a function to clean data
def preprocess(text):  
    text = str(text)
    
    # lowercase
    text = text.lower()
    
    # remove non_alphanum
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    
    # remove html tags
    text = gensim.parsing.preprocessing.strip_tags(text)
  
    # remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
  
    # remove numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
  
    # remove consecutive whitespace characters and convert tabs to spaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
    #text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
  
    #remove stop-words
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    
    # make stems
    text = gensim.parsing.preprocessing.stem_text(text)
    
    return text
    

In [7]:
# clean training data
for index, row in traindf.iterrows():
    text = row['Summary']
    text = preprocess(text)
    traindf.at[index, 'Summary'] = text

In [8]:
# clean testing data
for index, row in testdf.iterrows():
    text = row['Summary']
    text = preprocess(text)
    testdf.at[index, 'Summary'] = text    

In [9]:
#convert training data labels to numbers
traindf['labels'].replace('bug', 0,inplace=True)
traindf['labels'].replace('feature', 1,inplace=True)
traindf['labels'].replace('question', 2,inplace=True)
traindf['labels'].replace('documentation', 3,inplace=True)

In [10]:
#convert testing data lables to numbers
testdf['labels'].replace('bug', 0,inplace=True)
testdf['labels'].replace('feature', 1,inplace=True)
testdf['labels'].replace('question', 2,inplace=True)
testdf['labels'].replace('documentation', 3,inplace=True)

In [11]:
# select only summary and labels
traindf = traindf[['Summary','labels']]
#pd.options.display.max_colwidth = None
traindf.head(1)

Unnamed: 0,Summary,labels
0,set log handler bpo http bug python org issu n...,3


In [12]:
# select only summary and labels
testdf = testdf[['Summary','labels']]
testdf.head(1)

Unnamed: 0,Summary,labels
0,possibl mislead express virtual environ tutori...,3


In [13]:
#get actual labels of training data
y_train = traindf['labels'].values

# get summary of training data
traindf = traindf['Summary']

#get actual labels of testing data
y_test = testdf['labels'].values

# get summary of testing data
testdf =  testdf['Summary']

# define TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# Extracting features from the training data using a sparse vectorizer
X_train = vectorizer.fit_transform(traindf)

# Extracting features from the test data using the same vectorizer
X_test  = vectorizer.transform(testdf)

In [14]:
# timer to check training time
start_time = time.time()

#define classifier
classifier = SGDClassifier(loss='hinge', penalty='l2',alpha=0.000001, random_state=42,max_iter=20, tol=0.001)

#classifier = RidgeClassifier(tol=1e-2, solver="sparse_cg")

#fit classifier on training data
classifier.fit(X_train, y_train)

# print training time
print("Training time:", time.time() - start_time)

Training time: 152.24521780014038


In [15]:
# timer to check training time
start_time = time.time()

# prediction on test data for evaluation
predicted = classifier.predict(X_test)

# print testing time
print("Testing time:", time.time() - start_time)

Testing time: 0.8996913433074951


In [16]:
#print classification report
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.87      0.92      0.90     74691
           1       0.84      0.86      0.85     52466
           2       0.77      0.38      0.51      8479
           3       0.78      0.57      0.66      6218

    accuracy                           0.85    141854
   macro avg       0.82      0.68      0.73    141854
weighted avg       0.85      0.85      0.85    141854



In [17]:
# calculate micro precision score
P = sklearn.metrics.precision_score(y_test, predicted, average='micro')

# calculate micro recall score
R = sklearn.metrics.recall_score(y_test, predicted, average='micro')

# calculate micro f1 score
F1 = sklearn.metrics.f1_score(y_test, predicted, average='micro')

#print micro scores
print("=*= micro averages =*=")
print(f"precision:\t{P:.4f}")
print(f"recall:\t\t{R:.4f}")
print(f"F1 score:\t{F1:.4f}")

=*= micro averages =*=
precision:	0.8523
recall:		0.8523
F1 score:	0.8523


In [21]:
#print(confusion_matrix(y_test, predicted))