In [1]:
#Imports required packages
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score,f1_score
from sklearn.metrics import roc_curve,auc,roc_auc_score
import time
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier



In [2]:
# load training data
traindf = pd.read_csv('nlbse23-issue-classification-train.csv')

# load testing data
testdf = pd.read_csv('nlbse23-issue-classification-test.csv') 

In [3]:
traindf.shape,testdf.shape

((1275881, 5), (142320, 5))

In [4]:
#merge title and body of training data
traindf['Summary'] = traindf["title"]+" "+traindf["body"]
# removing null
traindf = traindf[pd.notnull(traindf['Summary'])]

#merge title and body of testing data
testdf['Summary'] = testdf["title"]+" "+testdf["body"]

# removing null
testdf = testdf[pd.notnull(testdf['Summary'])]

In [5]:
def preprocess(text):  
    text = str(text)
    
    # lowercase
    text = text.lower()
    
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    
    # remove html tags
    text = gensim.parsing.preprocessing.strip_tags(text)
  
    # remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
  
    # remove numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
  
    # remove consecutive whitespace characters and convert tabs to spaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
    #text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
  
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    # make stems
    text = gensim.parsing.preprocessing.stem_text(text)
    
    return text
    

In [6]:
# clean training data
for index, row in traindf.iterrows():
    text = row['Summary']
    text = preprocess(text)
    traindf.at[index, 'Summary'] = text

In [7]:
# clean testing data
for index, row in testdf.iterrows():
    text = row['Summary']
    text = preprocess(text)
    testdf.at[index, 'Summary'] = text    

In [8]:
#convert training data labels to numbers
traindf['labels'].replace('bug', 0,inplace=True)
traindf['labels'].replace('feature', 1,inplace=True)
traindf['labels'].replace('question', 2,inplace=True)
traindf['labels'].replace('documentation', 3,inplace=True)

In [9]:
#convert testing data lables to numbers
testdf['labels'].replace('bug', 0,inplace=True)
testdf['labels'].replace('feature', 1,inplace=True)
testdf['labels'].replace('question', 2,inplace=True)
testdf['labels'].replace('documentation', 3,inplace=True)

In [10]:
traindf = traindf[['Summary','labels']]
#pd.options.display.max_colwidth = None
traindf.head(1)

Unnamed: 0,Summary,labels
0,set log handler bpo http bug python org issu n...,3


In [11]:
testdf = testdf[['Summary','labels']]
testdf.head(1)

Unnamed: 0,Summary,labels
0,possibl mislead express virtual environ tutori...,3


In [12]:
y_train = traindf['labels'].values
traindf = traindf['Summary']

y_test = testdf['labels'].values
testdf =  testdf['Summary']

# define TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# Extracting features from the training data using a sparse vectorizer
X_train = vectorizer.fit_transform(traindf)

# Extracting features from the test data using the same vectorizer
X_test  = vectorizer.transform(testdf)

In [13]:
traindf.head(1)

0    set log handler bpo http bug python org issu n...
Name: Summary, dtype: object

In [14]:
# timer
start_time = time.time()

#classifier = LogisticRegression(max_iter=1000)
#classifier = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)

classifier = SGDClassifier(loss='hinge', penalty='l2',alpha=0.000001, random_state=42,max_iter=20, tol=0.001)

#classifier = SGDClassifier(loss='hinge', penalty='l2',alpha=0.000001, random_state=42,max_iter=5, tol=None)

#classifier = RidgeClassifier(tol=1e-2, solver="sparse_cg")
#alpha=0.000001 log_loss

#classifier = SGDClassifier(random_state=30)

classifier.fit(X_train, y_train)

print("Training time:", time.time() - start_time)

Training time: 122.51208090782166


In [15]:
start_time = time.time()
predicted = classifier.predict(X_test)
score = classifier.score(X_test, y_test)                  
print("Testing time:", time.time() - start_time)

Testing time: 1.3617045879364014


In [16]:
predicted = classifier.predict(X_test)
score = classifier.score(X_test, y_test)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.87      0.92      0.90     74691
           1       0.84      0.86      0.85     52466
           2       0.77      0.38      0.51      8479
           3       0.78      0.57      0.66      6218

    accuracy                           0.85    141854
   macro avg       0.82      0.68      0.73    141854
weighted avg       0.85      0.85      0.85    141854



In [17]:
P = sklearn.metrics.precision_score(y_test, predicted, average='micro')
R = sklearn.metrics.recall_score(y_test, predicted, average='micro')
F1 = sklearn.metrics.f1_score(y_test, predicted, average='micro')

print("=*= micro averages =*=")
print(f"precision:\t{P:.4f}")
print(f"recall:\t\t{R:.4f}")
print(f"F1 score:\t{F1:.4f}")

=*= micro averages =*=
precision:	0.8523
recall:		0.8523
F1 score:	0.8523


In [None]:
#print(confusion_matrix(y_test, predicted))

In [None]:
#classifier = RidgeClassifier(tol=1e-2, solver="sparse_cg")
#=*= micro averages =*=
#precision:	0.8486
#recall:		0.8486
#F1 score:	0.8486

In [None]:
#Tunning SGD
# Define the pipeline with TfidfVectorizer and SGDClassifier
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier())
])

# Define the hyperparameter grid to search
param_grid = {
    'clf__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__alpha': [0.0001, 0.001, 0.01, 0.1],
    'clf__max_iter': [5, 10, 50, 100, 1000]
}

# Create the grid search object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)

In [None]:
# clean training data
for index, row in df.iterrows():
    text = row['Summary']
    
    text = str(text)
    
    # lowercase
    text = text.lower()
    
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    
    # remove html tags
    text = gensim.parsing.preprocessing.strip_tags(text)
  
    # remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
  
    # remove numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
  
    # remove consecutive whitespace characters and convert tabs to spaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
    #text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
  
    text = gensim.parsing.preprocessing.remove_stopwords(text)
  
    text = gensim.parsing.preprocessing.stem_text(text)

    df.at[index, 'Summary'] = tex