In [1]:
#Imports required packages
import pandas as pd
import numpy as np
from numpy import mean
import sklearn
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import gensim
from sklearn.metrics import precision_score, recall_score,f1_score
from sklearn.metrics import roc_curve,auc,roc_auc_score, make_scorer
import time
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score, precision_score
import time
import psutil

In [2]:
# load training data - description and title files
IR_titles = pd.read_csv('Herzig dataset/title.csv')
IR_desc = pd.read_csv('Herzig dataset/desc.csv')

In [4]:
IR_desc.iloc[:, 2].head(2)

0    there coupl place your catch interruptedioexce...
1    the execut method ha follow simplifi flow 1 ge...
Name: the httpstate class ha clearcooki method not synchron but should consid modifi arraylist which unsynchron all other method which modifi read arraylist synchron except clearcooki method I stumbl upon fact becaus webapp I am work use httpclient threw illegalargumentexcept indic one cooki array return methodnam null which shouldnt possibl upon further inspect and test onli possibl option threadsafeti hole left unsynchron clearcooki method caus issu, dtype: object

In [5]:
traindf = pd.DataFrame()
traindf['labels'] = IR_desc.iloc[:, 1] #labels: 1: Bug and 0; Other
traindf['Summary'] = IR_titles.iloc[:, 2]+" "+IR_desc.iloc[:, 2]

In [6]:
traindf.head(10)

Unnamed: 0,labels,Summary
0,1,catch sockettimeoutexcept not interruptedioexc...
1,1,except dure writerequest leav connect unreleas...
2,0,incorrect debug messag httpmethodbas methodnam...
3,1,host request header doe not contain port the h...
4,1,httpclient fail reconnect after keepal connect...
5,0,http client give sme messag proxyhttp endpoint...
6,0,there no way specifi differ auth scheme priori...
7,0,httpclient per default relentlessli spam stder...
8,0,implement ignorecooki cookiespec It would use ...
9,0,javadoc getconnect method connect manag the ja...


In [7]:
# checking the shape of the data
traindf.shape #,testdf.shape

(5590, 2)

In [8]:
# defining a function to clean data
def preprocess(text):  
    text = str(text)
    
    # lowercase
    text = text.lower()
    
    # remove non_alphanum
    #text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    
    # remove html tags
    #text = gensim.parsing.preprocessing.strip_tags(text)
  
    # remove punctuation
    text = gensim.parsing.preprocessing.strip_punctuation(text)
  
    # remove numerics
    #text = gensim.parsing.preprocessing.strip_numeric(text)
  
    # remove consecutive whitespace characters and convert tabs to spaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
  
    #text = gensim.parsing.preprocessing.strip_short(text, minsize=3)
  
    #remove stop-words
    text = gensim.parsing.preprocessing.remove_stopwords(text)
    
    # make stems
    text = gensim.parsing.preprocessing.stem_text(text)
    
    return text
    

In [9]:
# clean training data
for index, row in traindf.iterrows():
    text = row['Summary']
    text = preprocess(text)
    traindf.at[index, 'Summary'] = text

In [10]:
traindf.head(2)

Unnamed: 0,labels,Summary
0,1,catch sockettimeoutexcept interruptedioexcept ...
1,1,dure writerequest leav connect unrelea execut ...


In [11]:
#get actual labels of training data
y = traindf['labels'].values

# get summary of training data
X = traindf['Summary']

tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', encoding='latin-1', ngram_range=(1, 2),
                                   stop_words='english')

X_tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
# timer to check girdsearch time
start_time = time.time()
# Define the parameter grid for grid search for SGD classifier
sgd_param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l1', 'l2'],
    'max_iter': [1000, 2000],
    'loss': ['hinge', 'log']
}

sgdclassifier = SGDClassifier()
sgd_grid_search = GridSearchCV(sgdclassifier, sgd_param_grid,  cv=10, n_jobs=-1)
sgd_grid_search.fit(X_tfidf, y)

# Print the best parameters and best score
print("Best Parameters:", sgd_grid_search.best_params_)
print("Best Score:", sgd_grid_search.best_score_)

In [14]:
# start time for training
start_time = time.time()

# Define the algorithms to be evaluated
algorithms = {
    #'Logistic Regression (LR)': LogisticRegression(C = 100, penalty = 'l1', solver= 'saga')
    #'Naive Bayes (NB)': MultinomialNB(alpha= 0.1)
    #'Support Vector Machine (SVM)': SVC(C=10, gamma= 'scale', kernel= 'linear')
    #'K-Nearest Neighbors (KNN)': KNeighborsClassifier(n_neighbors= 7, p= 2, weights= 'distance')
    #'Random Forest (RF)': RandomForestClassifier(max_depth = None, min_samples_split = 5, n_estimators = 100),
    #'Decision Tree (DT)': DecisionTreeClassifier(max_depth = 20, min_samples_split = 5) #,
    'SGDClassifier (hinge)': SGDClassifier(alpha= 0.0001, loss= 'hinge', max_iter= 2000, penalty= 'l2')
}

# Define performance metric functions
performance_metrics = {
    'Accuracy': accuracy_score,
    'Recall': recall_score,
    'Precision': precision_score,
    'F1': f1_score,
    'MCC': matthews_corrcoef,
    'AUC': roc_auc_score
}

In [15]:
# Perform 10-fold cross-validation
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

for algorithm_name, algorithm in algorithms.items():
    print(f"Algorithm: {algorithm_name}")
    
    bug_recall_scores = []
    bug_precision_scores = []
    bug_f1_scores = []

    nonbug_recall_scores = []
    nonbug_precision_scores = []
    nonbug_f1_scores = []

    accuracy_scores = []
    auc_scores = []
    mcc_scores = []
    
    for train_index, test_index in kf.split(X_tfidf):
        X_train, X_test = X_tfidf[train_index], X_tfidf[test_index]
        y_train, y_test = y[train_index], y[test_index]

        algorithm.fit(X_train, y_train)
        #print training time
        #print("Training time:", time.time() - start_time)
        y_pred = algorithm.predict(X_test)

        bug_recall_scores.append(recall_score(y_test, y_pred, pos_label=1))
        bug_precision_scores.append(precision_score(y_test, y_pred, pos_label=1))
        bug_f1_scores.append(f1_score(y_test, y_pred, pos_label=1))

        nonbug_recall_scores.append(recall_score(y_test, y_pred, pos_label=0))
        nonbug_precision_scores.append(precision_score(y_test, y_pred, pos_label=0))
        nonbug_f1_scores.append(f1_score(y_test, y_pred, pos_label=0))

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))
        mcc_scores.append(matthews_corrcoef(y_test, y_pred))

    #Compute average scores
    # print training time
    print("Training time:", time.time() - start_time)
    print('accuracy:', mean(accuracy_scores))
    print('auc:', mean(auc_scores))
    print('mcc:', mean(mcc_scores))
    print('nMCC:', ((1+mean(mcc_scores))/2))

    print('bug_recall_scores:', mean(bug_recall_scores))
    print('bug_precision_scores:', mean(bug_precision_scores))
    print('bug_f1_scores:', mean(bug_f1_scores))

    print('nonbug_recall_scores:', mean(nonbug_recall_scores))
    print('nonbug_precision_scores:', mean(nonbug_precision_scores))
    print('nonbug_f1_scores:', mean(nonbug_f1_scores))

Algorithm: SGDClassifier (hinge)
Training time: 2.064100503921509
accuracy: 0.8334525939177102
auc: 0.801352602490657
mcc: 0.6246041589615816
nMCC: 0.8123020794807908
bug_recall_scores: 0.6962537444946225
bug_precision_scores: 0.798373450142631
bug_f1_scores: 0.7432364603825572
nonbug_recall_scores: 0.9064514604866915
nonbug_precision_scores: 0.8490729329326572
nonbug_f1_scores: 0.8766707332842276


In [15]:
pid = psutil.Process().pid
memory_usage_in_bytes = psutil.Process(pid).memory_info().rss
memory_usage_in_megabytes = memory_usage_in_bytes / 1024**2

print(memory_usage_in_megabytes)

208.85546875


In [None]:
# Define the parameter grids for grid search for each classifier
knn_param_grid = {
    'n_neighbors': [1, 3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Create the GridSearchCV instances for each classifier
knn_grid_search = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5, n_jobs=-1)
rf_grid_search = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=5, n_jobs=-1)
dt_grid_search = GridSearchCV(DecisionTreeClassifier(), dt_param_grid, cv=5, n_jobs=-1)

# Fit the grid searches to the data for each classifier
knn_grid_search.fit(X_tfidf, y)
rf_grid_search.fit(X_tfidf, y)
dt_grid_search.fit(X_tfidf, y)

# Print the best parameters and best scores for each classifier
print("KNeighborsClassifier Best Parameters:", knn_grid_search.best_params_)
print("KNeighborsClassifier Best Score:", knn_grid_search.best_score_)

print("RandomForestClassifier Best Parameters:", rf_grid_search.best_params_)
print("RandomForestClassifier Best Score:", rf_grid_search.best_score_)

print("DecisionTreeClassifier Best Parameters:", dt_grid_search.best_params_)
print("DecisionTreeClassifier Best Score:", dt_grid_search.best_score_)

In [None]:
# Define the parameter grid for grid search for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'] + [0.01, 0.1, 1, 10]
}

svcclassifier = SVC()
svm_grid_search = GridSearchCV(svcclassifier, svm_param_grid, cv=5, n_jobs=-1)  # 5-fold cross-validation
svm_grid_search.fit(X_tfidf, y)

# Print the best parameters and best score
print("Best Parameters:", svm_grid_search.best_params_)
print("Best Score:", svm_grid_search.best_score_)

In [None]:
# Define the parameter grid for grid search for SGD classifier
sgd_param_grid = {
    'alpha': [0.0001, 0.001, 0.01],
    'penalty': ['l1', 'l2'],
    'max_iter': [1000, 2000],
    'loss': ['hinge', 'log']
}

sgdclassifier = SGDClassifier()
sgd_grid_search = GridSearchCV(sgdclassifier, sgd_param_grid, cv=5)  # 5-fold cross-validation
sgd_grid_search.fit(X_tfidf, y)

# Print the best parameters and best score
print("Best Parameters:", sgd_grid_search.best_params_)
print("Best Score:", sgd_grid_search.best_score_)

In [None]:
# Set up a pipeline with a CountVectorizer and MultinomialNB
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])


# Define the parameter grid for grid search
#mnb_param_grid = { 'vectorizer__max_features': [1000, 5000, 10000],'vectorizer__ngram_range': [(1, 1), (1, 2)],
   # 'classifier__alpha': [0.1, 1, 10]
#}

mnb_param_grid = { 'classifier__alpha': [0.1, 1, 10] }

mnbclf = MultinomialNB()

# Create the GridSearchCV instance
mnb_grid_search = GridSearchCV(mnbclf, mnb_param_grid, cv=5, n_jobs=-1)

# Fit the grid search to the data
mnb_grid_search.fit(X_tfidf, y)

# Print the best parameters and best score
print("Best Parameters:", mnb_grid_search.best_params_)
print("Best Score:", mnb_grid_search.best_score_)

In [None]:
Best Parameters: {'classifier__alpha': 1, 'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 2)}
Best Score: 0.8003577817531304

In [None]:
# Define the parameter grid for grid search for LR

lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],       # Regularization type (L1 or L2)
    'solver': ['liblinear', 'saga']  # Solver algorithms for logistic regression
}

logreg = LogisticRegression(max_iter=1000)  # Create a logistic regression classifier

lr_grid_search = GridSearchCV(logreg, lr_param_grid, cv=5, n_jobs=-1)  # 5-fold cross-validation
lr_grid_search.fit(X_tfidf, y)

best_params = lr_grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# timer to check training time
start_time = time.time()

# prediction on test data for evaluation
predicted = classifier.predict(X_test)

# print testing time
print("Testing time:", time.time() - start_time)

In [None]:
#print classification report
print(classification_report(y_test, predicted))

In [None]:
# calculate micro precision score
P = sklearn.metrics.precision_score(y_test, predicted, average='micro')

# calculate micro recall score
R = sklearn.metrics.recall_score(y_test, predicted, average='micro')

# calculate micro f1 score
F1 = sklearn.metrics.f1_score(y_test, predicted, average='micro')

#print micro scores
print("=*= micro averages =*=")
print(f"precision:\t{P:.4f}")
print(f"recall:\t\t{R:.4f}")
print(f"F1 score:\t{F1:.4f}")

In [None]:
#print(confusion_matrix(y_test, predicted))