In [1]:
import nltk
#nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import csv
import nltk
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix#, ConfusionMatrixDisplay
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import pickle
from scipy.sparse import save_npz, load_npz

## Load data

In [3]:
df = pd.read_csv("issues.csv")
type_list = df.issue_type.to_list()
priority_list = df.priority.to_list()
mat = load_npz('matrix.npz')

### Merge other features to matrix

## Train models on different data sets or with different labels

In [10]:
relevant_classes = ['Improvement', 'Bug', 'New Feature', 'Task', 'Sub-task']
models = ["MultinomialNB", "GaussianNB", "SVM"]

# use_given_types: uses labels from type_list instead of only the relevant_classes, use this
# if you want to check how the models perfoms on all classes (worse) or when the loaded data comes from
def build_model_from_files(name_extension, use_given_types = False, use_all_features = True, model = "MultinomialNB"):
    df = pd.read_csv('issues'+name_extension+'.csv')
    mat = load_npz('matrix'+name_extension+'.npz')
    type_list = df.issue_type.to_list()
    if  not use_given_types:
        type_list = [t if t in relevant_classes else 'Other' for t in type_list]
    priority_list = df.priority.to_list()
        
    X_train, X_test, y_train, y_test = train_test_split(mat, type_list, test_size=0.1, random_state=1234)
    
    if model == "MultinomialNB":
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
    elif model == "GaussianNB":
        clf = GaussianNB()
        clf.fit(X_train.toarray(), y_train)
    elif model == "SVM":
        clf = SVC(gamma='auto')
        clf.fit(X_train, y_train)
    train_predicted= clf.predict(X_train.toarray())
    test_predicted = clf.predict(X_test.toarray())
    
    print(model + " Training accuracy:", metrics.accuracy_score(y_train, train_predicted))
    print(model + " Test accuracy:",metrics.accuracy_score(y_test, test_predicted))
    confusion = confusion_matrix(y_test, test_predicted)
    #ConfusionMatrixDisplay(confusion, binary_types)
    print(confusion)
    
    y_binary_train  = [y if y == 'Bug' else 'Other' for y in y_train]
    y_binary_test = [y if y == 'Bug' else 'Other' for y in y_test]
    if model == "MultinomialNB":
        binary_model = MultinomialNB().fit(X_train, y_binary_train)
    elif model == "GaussianNB":
        binary_model = GaussianNB().fit(X_train.toarray(), y_binary_train)
    elif model == "SVM":
        binary_model = SVC(gamma='auto').fit(X_train, y_binary_train)
        
        
    train_binary_predicted = binary_model.predict(X_train.toarray())
    test_binary_predicted = binary_model.predict(X_test.toarray())
    
    print(model + " Training accuracy (on binary data):",metrics.accuracy_score(y_binary_train, train_binary_predicted))
    print(model + " Test accuracy (on binary data):",metrics.accuracy_score(y_binary_test, test_binary_predicted))
    binary_confusion = confusion_matrix(y_binary_test, test_binary_predicted)
    #ConfusionMatrixDisplay(confusion, binary_types)
    print(binary_confusion)
    
    


In [12]:
# old model
build_model_from_files('', model="GaussianNB")

GaussianNB Training accuracy: 0.16510644918704456
GaussianNB Test accuracy: 0.14469078179696615
[[282  73 356 461 741 228]
 [ 24  22 247 203 616 125]
 [  3   0  85  44 159  20]
 [  0   3  14  32  52  12]
 [  1   1  17  17 133  12]
 [  3   3  19  23 188  66]]
GaussianNB Training accuracy (on binary data): 0.6560692892150507
GaussianNB Test accuracy (on binary data): 0.6387397899649941
[[ 778 1363]
 [ 185 1959]]


In [63]:
def cross_validation(name_extension, use_given_types = False, use_all_features = True, random_state = 1234, splits=10):
    df = pd.read_csv('issues'+name_extension+'.csv')
    mat = load_npz('matrix'+name_extension+'.npz')
    type_list = df.issue_type.to_list()
    if not use_given_types:
        type_list = [t if t in relevant_classes else 'Other' for t in type_list]
    priority_list = df.priority.to_list()
    
    kf = KFold(n_splits = splits)
    kf.random_state = 1234
    kf.shuffle = True
    kf.get_n_splits(mat)
    
    whole_data_accuracy = []
    binary_data_accuracy = []
    
    for train, test in kf.split(mat):
        X_train, X_test, y_train, y_test = mat[train], mat[test], [type_list[x] for x in train], [type_list[x] for x in test]
        clf = MultinomialNB().fit(X_train, y_train)
        predicted= clf.predict(X_test)
        whole_data_accuracy.append(metrics.accuracy_score(y_test, predicted))
        y_binary_train  = [y if y == 'Bug' else 'Other' for y in y_train]
        y_binary_test = [y if y == 'Bug' else 'Other' for y in y_test]
        binary_model = MultinomialNB().fit(X_train, y_binary_train)
        binary_predicted = binary_model.predict(X_test)
        binary_data_accuracy.append(metrics.accuracy_score(y_binary_test, binary_predicted))
    return np.array(whole_data_accuracy), np.array(binary_data_accuracy)


In [68]:
for i in range(5, 21):
    whole, binary = cross_validation('', use_given_types = False, use_all_features = True, random_state = 12345, splits=i)
    print("Split = "+ str(i) +"; mean accuracy (all data): " + str(np.mean(whole)) + "; mean accuracy (binary data): " + str(np.mean(binary)))

Split = 5; mean accuracy (all data): 0.6036220628127794; mean accuracy (binary data): 0.7635362633181696
Split = 6; mean accuracy (all data): 0.6039722138640019; mean accuracy (binary data): 0.7639565111721356
Split = 7; mean accuracy (all data): 0.6038788272780259; mean accuracy (binary data): 0.7638397915340672
Split = 8; mean accuracy (all data): 0.603575429424944; mean accuracy (binary data): 0.7635828976848393
Split = 9; mean accuracy (all data): 0.604252141240169; mean accuracy (binary data): 0.7643296834241643
Split = 10; mean accuracy (all data): 0.6041820967982681; mean accuracy (binary data): 0.7645632714384859
Split = 11; mean accuracy (all data): 0.6036688766129183; mean accuracy (binary data): 0.7642601320887598
Split = 12; mean accuracy (all data): 0.6042756176492813; mean accuracy (binary data): 0.7641431416214391
Split = 13; mean accuracy (all data): 0.6033187079910382; mean accuracy (binary data): 0.7640730022404779
Split = 14; mean accuracy (all data): 0.6037386096766

In [6]:
# 'Other' issues have been discarded
build_model_from_files('_reduced', use_given_types = True)

MultinomialNB Accuracy: 0.6098729930505632
[[1852  210    0    9    2]
 [ 587  657    2    1   12]
 [ 137  206    7    0    1]
 [  95   70    1   12    4]
 [ 154  132    0    5   17]]
MultinomialNB Accuracy (on binary data): 0.7620416966211359
[[1515  558]
 [ 435 1665]]


In [7]:
# 'Other' issues have been discarded but uses class 'Wish' in addition to relevant_classes
build_model_from_files('_reduced_plus_wish', use_given_types = True)

MultinomialNB Accuracy: 0.6111636707663197
[[1831  204    0    6    5    0]
 [ 587  715    3    2   13    0]
 [ 122  196    2    1    1    0]
 [  90   89    0   19    6    0]
 [ 142  114    0    9   17    0]
 [  25   29    0    0    0    0]]
MultinomialNB Accuracy (on binary data): 0.772469252601703
[[1510  536]
 [ 426 1756]]


In [8]:
# builds method based on new text processing method
build_model_from_files('_2')

MultinomialNB Accuracy: 0.5873978996499416
[[1940  183    0    1    7   10]
 [ 686  545    0    0    3    3]
 [ 133  170    2    0    5    1]
 [  62   48    0    0    0    3]
 [ 101   59    0    0   19    2]
 [ 170  108    1    0   12   11]]
MultinomialNB Accuracy (on binary data): 0.7409568261376897
[[1547  594]
 [ 516 1628]]


In [None]:
# TODO model where code has not been removed

## Model aggregation