In [1]:
import codecs
import json
import os
import warnings
import time
from joblib import dump, load
#from nltk import SnowballStemmer

warnings.filterwarnings('ignore')

import numpy
#!{sys.executable} -m pip install numpy
from sklearn.feature_extraction.text import TfidfVectorizer
#!{sys.executable} -m pip install sklearn
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelBinarizer

In [2]:
boost_summary = 3
project_keys = ["HTTPCLIENT", "LUCENE", "JCR"]

Load data

In [3]:
def load_data():
    raw_data = []
    data_directory = ".." + os.path.sep + "data"
    for filename in os.listdir(data_directory):
        with codecs.open(data_directory + os.path.sep + filename, "r", "utf-8") as fin:
            raw_data += json.load(fin)
    return raw_data

In [4]:
def get_corpus_labels(raw_data, stemmer=None):
    # Corpus building.
    corpus = []
    labels = []
    n_bug = 0
    for n_file in raw_data:

        txt = ""
        for i in range(boost_summary):
            txt += n_file["summary"] + " "
            if stemmer is not None:
                txt = stemming_textual_data(stemmer,txt)

        if stemmer is not None:
            n_file["description"] = stemming_textual_data(stemmer,n_file["description"])

        corpus.append(txt + " " + n_file["description"])
        labels.append(n_file["label"])
        if n_file["label"] == "BUG":
            n_bug += 1
    print(f"{n_bug} BUG / {len(labels)} \n")
    return corpus, labels

In [5]:
def stemming_textual_data(stemmer, textual_data):
    return ' '.join([stemmer.stem(word) for word in textual_data.split(' ')])

In [6]:
def feature_computing(corpus, labels, vectorizer, feature_selection = True, k_best=30000):
    # TF-IDF.
    start_time = time.time()
    print("Feature computing.")
    X = vectorizer.fit_transform(corpus)
    print(f"\t{X.shape[1]} features.")

    if feature_selection:
        print("Extracting %d best features by a chi-squared test" % k_best)
        ch2 = SelectKBest(chi2, k=k_best)
        
        X = ch2.fit_transform(X, labels)
        
        #feature_names = vectorizer.get_feature_names()
        #if feature_names:  # keep selected feature names.
        #    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
            
        #print(feature_names)
        print("--- %s seconds for feature computing ---" % (time.time() - start_time))
        return X, vectorizer, ch2
    
    print("--- %s seconds for feature computing ---" % (time.time() - start_time))
    return X, vectorizer

Split data by project

In [7]:
def split_data_by_project(raw_data, stemmer=None):
    # Create dicts of tickets for each project
    dict_data_split = {}
    print("Split data for each project")
    for project_key in project_keys:
        dict_data_split[project_key] = {}
        dict_data_split[project_key]["tickets"] = []
        dict_data_split[project_key]["corpus"] = []
        dict_data_split[project_key]["labels"] = []

    for ticket in raw_data:
        for project_key in project_keys:
            if project_key in ticket["key"]:
                dict_data_split[project_key]["tickets"].append(ticket)

    for project_key in project_keys:
        print("Get corpus and labels for project: ", project_key)
        tickets = dict_data_split[project_key]["tickets"]
        # Get corpus and labels for specific project tickets
        if stemmer is not None:
            corpus, labels = get_corpus_labels(tickets,stemmer)
        else:
            corpus, labels = get_corpus_labels(tickets)
        dict_data_split[project_key]["corpus"] = corpus
        dict_data_split[project_key]["labels"] = labels

    return dict_data_split

In [8]:
def labels_binarizing(labels):
    lb = LabelBinarizer()
    # Binarize labels with BUG = 0 and NBUG = 1
    labels = numpy.array([number[0] for number in lb.fit_transform(labels)])
    # Inverse 0 and 1 to have good labels, i.e BUG = 1 and NBUG = 0
    return numpy.logical_not(labels).astype(int)

In [9]:
def make_scoring(X, binarized_labels, clf, cv=10):
    start_time = time.time()
    #scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='f1')
    scores = cross_validate(clf, X, binarized_labels, cv=10, scoring=('f1', 'precision', 'recall', 'accuracy'))
    
    print("F1score: %0.3f" % scores['test_f1'].mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores['test_f1'].std() * 2))
    print("Standard deviation: %0.3f\n" % scores['test_f1'].std())

    #scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='precision')
    print("Precision: %0.3f" % scores['test_precision'].mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores['test_precision'].std() * 2))
    print("Standard deviation: %0.3f\n" % scores['test_precision'].std())

    #scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='recall')
    print("Recall: %0.3f" % scores['test_recall'].mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores['test_recall'].std() * 2))
    print("Standard deviation: %0.3f\n" % scores['test_recall'].std())

    #scores = cross_val_score(clf, X, binarized_labels, cv=cv, scoring='accuracy')
    print("Accuracy: %0.3f" % scores['test_accuracy'].mean())
    print("95%% Confidence Interval +/- %0.3f" % (scores['test_accuracy'].std() * 2))
    print("Standard deviation: %0.3f\n" % scores['test_accuracy'].std())
    print("--- %s seconds for scoring ---" % (time.time() - start_time))

# Setting 1

Most raw setting.
- Use only uni-grams
- No feature selection
- Multi-Layer Perceptron set as default
- No optimizations

Parameters: 
* No feature selection
* TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=False)
* MLPClassifier()

In [10]:
def run_setting_1(raw_data):
    print("=> Run setting 1 <=")
    print("Get corpus and labels for all projects")
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=False)
    mlp = MLPClassifier(random_state=0)
    X, vectorizer = feature_computing(corpus, labels, vectorizer, feature_selection=False)
    binarized_labels = labels_binarizing(labels)
    dict_data_split = split_data_by_project(raw_data)

    print("=====> Scoring Cross project <=====")
    make_scoring(X,binarized_labels,mlp)

    for key in project_keys:
        corpus = dict_data_split[key]["corpus"]
        labels = dict_data_split[key]["labels"]

        binarized_labels = labels_binarizing(labels)

        X = vectorizer.transform(corpus)
        print("=====> Scoring " + key + " <=====")
        make_scoring(X,binarized_labels,mlp)
    print("=> End run setting 1 <=")

Run setting 1

In [None]:
raw_data = load_data()
run_setting_1(raw_data)

=> Run setting 1 <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	27033 features.
Split data for each project
Get corpus and labels for project:  HTTPCLIENT
305 BUG / 746 

Get corpus and labels for project:  LUCENE
697 BUG / 2443 

Get corpus and labels for project:  JCR
938 BUG / 2402 

=====> Scoring Cross project <=====
F1score: 0.748
95% Confidence Interval +/- 0.058
Standard deviation: 0.029



# Setting 2

Setting using only corpus processing methods i.e, TF-Idf and Chi-2 for feature selection.
- Uni-grams and Bi-grams + Rare/Stop words dropping + Log function on term-frequency
- No Feature selection
- Multi-Layer Perceptron set as default
- No optimizations

Parameters:
* TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"}, sublinear_tf=True)
* MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)

In [10]:
def run_setting_2(raw_data):
    print("=> Run setting 2 <=")
    print("Get corpus and labels for all projects")
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2), stop_words={"english"},  sublinear_tf=True)
    mlp = MLPClassifier(random_state=0)
    X, vectorizer = feature_computing(corpus, labels, vectorizer, feature_selection=False)
    binarized_labels = labels_binarizing(labels)
    dict_data_split = split_data_by_project(raw_data)

    print("=====> Scoring Cross project <=====")
    make_scoring(X,binarized_labels,mlp)

    for key in project_keys:
        corpus = dict_data_split[key]["corpus"]
        labels = dict_data_split[key]["labels"]

        binarized_labels = labels_binarizing(labels)

        X = vectorizer.transform(corpus)
        print("=====> Scoring " + key + " <=====")
        make_scoring(X,binarized_labels,mlp)
    print("=> End run setting 2 <=")

Run setting 2

In [11]:
raw_data = load_data()
run_setting_2(raw_data)

=> Run setting 2 <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	63924 features.
Split data for each project
Get corpus and labels for project:  HTTPCLIENT
305 BUG / 746 

Get corpus and labels for project:  LUCENE
697 BUG / 2443 

Get corpus and labels for project:  JCR
938 BUG / 2402 

=====> Scoring Cross project <=====
F1score: 0.774
95% Confidence Interval +/- 0.035
Standard deviation: 0.018

Precision: 0.816
95% Confidence Interval +/- 0.133
Standard deviation: 0.066

Recall: 0.743
95% Confidence Interval +/- 0.102
Standard deviation: 0.051

Accuracy: 0.849
95% Confidence Interval +/- 0.038
Standard deviation: 0.019

=====> Scoring HTTPCLIENT <=====
F1score: 0.713
95% Confidence Interval +/- 0.100
Standard deviation: 0.050

Precision: 0.738
95% Confidence Interval +/- 0.118
Standard deviation: 0.059

Recall: 0.692
95% Confidence Interval +/- 0.123
Standard deviation: 0.062

Accuracy: 0.772
95% Confidence Interval +/- 0.082
Standard deviation: 0.041

# Setting 3

Setting using only corpus processing methods i.e, TF-Idf and Chi-2 for feature selection.
- Uni-grams and Bi-grams + Rare/Stop words dropping + Log function on term-frequency
- Feature selection with Chi2 (20000)
- Multi-Layer Perceptron set as default
- No optimizations

Parameters: 
* Feature selection
* 30000 features
* TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"}, sublinear_tf=True)
* MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)

In [12]:
def run_setting_3(raw_data):
    print("=> Run setting 3 <=")
    print("Get corpus and labels for all projects")
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"}, sublinear_tf=True)
    mlp = MLPClassifier(random_state=0)
    X, vectorizer, ch2 = feature_computing(corpus, labels, vectorizer)
    binarized_labels = labels_binarizing(labels)
    dict_data_split = split_data_by_project(raw_data)

    print("=====> Scoring Cross project <=====")
    make_scoring(X,binarized_labels,mlp)

    for key in project_keys:
        corpus = dict_data_split[key]["corpus"]
        labels = dict_data_split[key]["labels"]

        binarized_labels = labels_binarizing(labels)

        X = vectorizer.transform(corpus)
        X = ch2.transform(X)
        print("=====> Scoring " + key + " <=====")
        make_scoring(X,binarized_labels,mlp)
    print("=> End run setting 3 <=")

Run setting 3

In [None]:
raw_data = load_data()
run_setting_3(raw_data)

=> Run setting 3 <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
Split data for each project
Get corpus and labels for project:  HTTPCLIENT
305 BUG / 746 

Get corpus and labels for project:  LUCENE
697 BUG / 2443 

Get corpus and labels for project:  JCR
938 BUG / 2402 

=====> Scoring Cross project <=====
F1score: 0.866
95% Confidence Interval +/- 0.031
Standard deviation: 0.015

Precision: 0.929
95% Confidence Interval +/- 0.082
Standard deviation: 0.041

Recall: 0.813
95% Confidence Interval +/- 0.073
Standard deviation: 0.036

Accuracy: 0.913
95% Confidence Interval +/- 0.021
Standard deviation: 0.010

=====> Scoring HTTPCLIENT <=====
F1score: 0.761
95% Confidence Interval +/- 0.081
Standard deviation: 0.041

Precision: 0.909
95% Confidence Interval +/- 0.142
Standard deviation: 0.071

Recall: 0.656
95% Confidence Interval +/- 0.068
Standard deviation: 0.034

Accuracy: 0.831
95% Co

# Setting 4
Intermediate setting optimized only with Grid-Search:
- Uni-grams and Bi-grams + Rare/Stop words dropping + Log function on term-frequency
- A feature number not optimized (20000)
- MLP parameters optimized with Grid-Search

Parameters
* Feature selection
* 30000 features
* TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"}, sublinear_tf=True)
* MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)

In [10]:
def run_setting_4(raw_data):
    print("=> Run setting 4 <=")
    print("Get corpus and labels for all projects")
    corpus, labels = get_corpus_labels(raw_data)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), stop_words={"english"}, sublinear_tf=True)
    mlp = MLPClassifier(activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)
    X, vectorizer, ch2 = feature_computing(corpus, labels, vectorizer)
    binarized_labels = labels_binarizing(labels)
    dict_data_split = split_data_by_project(raw_data)

    print("=====> Scoring Cross project <=====")
    make_scoring(X,binarized_labels,mlp)

    for key in project_keys:
        corpus = dict_data_split[key]["corpus"]
        labels = dict_data_split[key]["labels"]

        binarized_labels = labels_binarizing(labels)

        X = vectorizer.transform(corpus)
        X = ch2.transform(X)
        print("=====> Scoring " + key + " <=====")
        make_scoring(X,binarized_labels,mlp)
    print("=> End run setting 4 <=")

Run setting 4

In [None]:
raw_data = load_data()
run_setting_4(raw_data)

=> Run setting 4 <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 30000 best features by a chi-squared test
Split data for each project
Get corpus and labels for project:  HTTPCLIENT
305 BUG / 746 

Get corpus and labels for project:  LUCENE
697 BUG / 2443 

Get corpus and labels for project:  JCR
938 BUG / 2402 

=====> Scoring Cross project <=====


# Setting 5

Setting most optimized (Grid-Search + Genetic Algorithm) using:
- Uni-grams and Bi-grams + Rare/Stop words dropping + Log function on term-frequency
- A feature number optimized with GA (37362)
- MLP parameters optimized with Grid-Search and GA

Parameters:
* 37362 features
* TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), sublinear_tf=True)
* MLPClassifier(hidden_layer_sizes=(15, 9, 10, 11, 9, 15, 11), activation='tanh', learning_rate='adaptive', max_iter=100, random_state=0)

In [10]:
def run_setting_5(raw_data):
    print("=> Run setting 5 <=")
    print("Get corpus and labels for all projects")
    #stemmer = SnowballStemmer("english")
    stemmer = None
    corpus, labels = get_corpus_labels(raw_data,stemmer)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), sublinear_tf=True, stop_words={'english'})

    mlp = MLPClassifier(hidden_layer_sizes=(15, 9, 10, 11, 9, 15, 11), activation='tanh', learning_rate='adaptive',
                        max_iter=100, random_state=0)

    X, vectorizer, ch2 = feature_computing(corpus, labels, vectorizer, k_best=37362)
    binarized_labels = labels_binarizing(labels)
    dict_data_split = split_data_by_project(raw_data, stemmer)

    print("=====> Scoring Cross project <=====")
    make_scoring(X,binarized_labels,mlp)

    for key in project_keys:
        corpus = dict_data_split[key]["corpus"]
        labels = dict_data_split[key]["labels"]

        binarized_labels = labels_binarizing(labels)

        X = vectorizer.transform(corpus)
        X = ch2.transform(X)
        print("=====> Scoring " + key + " <=====")
        make_scoring(X,binarized_labels,mlp)
    print("=> End run setting 5 <=")


Run setting 5

In [None]:
raw_data = load_data()
run_setting_5(raw_data)

=> Run setting 5 <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 37362 best features by a chi-squared test
--- 3.042996883392334 seconds for feature computing ---
Split data for each project
Get corpus and labels for project:  HTTPCLIENT
305 BUG / 746 

Get corpus and labels for project:  LUCENE
697 BUG / 2443 

Get corpus and labels for project:  JCR
938 BUG / 2402 

=====> Scoring Cross project <=====


# Save fitted models

In [15]:
def save_fitted_models(raw_data):
    print("=> Run save fitted models <=")
    print("Get corpus and labels for all projects")
    #stemmer = SnowballStemmer("english")
    stemmer = None
    corpus, labels = get_corpus_labels(raw_data,stemmer)
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 3), sublinear_tf=True, stop_words={'english'})

    mlp = MLPClassifier(hidden_layer_sizes=(15, 9, 10, 11, 9, 15, 11), activation='tanh', learning_rate='adaptive',
                        max_iter=100, random_state=0)

    X, vectorizer, ch2 = feature_computing(corpus, labels, vectorizer, k_best=37362)
    binarized_labels = labels_binarizing(labels)
    
    
    #mlp.fit(X,binarized_labels)
    #dump(mlp, "mlp_model.pkl")
    #dump(vectorizer, "vectorizer_model.pkl")
    #dump(ch2,"ch2_model.pkl")

    print("=> End run setting 5 <=")

In [32]:
raw_data = load_data()
save_fitted_models(raw_data)

=> Run save fitted models <=
Get corpus and labels for all projects
1940 BUG / 5591 

Feature computing.
	99349 features.
Extracting 37362 best features by a chi-squared test
37362
=> End run setting 5 <=
