In [1]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from math import floor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import recall_score

In [2]:
# Directory that holds one '<project>_samples.csv' file per project
# (the path is assembled in load_sample_df).
DATASET_DIR = 'DATASET-SNAPSHOT-01'

# Every project that is loaded when the combined dataset is built.
PROJECTS = ['abdera', 'activemq', 'airflow', 'arrow', 'calcite', 'flink', 'geode',
            'hadoop', 'hbase', 'hudi', 'jena', 'kafka', 'math', 'maven', 'rat']

def load_sample_df(project_name, dataset_dir=None):
    '''Load one project's sample CSV from disk into a pandas dataframe.

    Parameters
    ----------
    project_name : str
        Project name; the file read is '<project_name>_samples.csv'.
    dataset_dir : str or path-like, optional
        Directory containing the sample files. When omitted, the module-level
        DATASET_DIR is used, so existing one-argument calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line, with the columns hash, msg, n_files, loc_added,
        loc_removed, issue_id and is_bug.
    '''
    cols = ['hash', 'msg', 'n_files', 'loc_added', 'loc_removed', 'issue_id', 'is_bug']
    if dataset_dir is None:
        # Resolved at call time so a later change to DATASET_DIR is honoured.
        dataset_dir = DATASET_DIR
    filename = os.path.join(dataset_dir, project_name + '_samples.csv')
    # Passing `names=` without `header=` makes pandas treat the first line as data.
    # NOTE(review): this assumes the sample files have no header row -- confirm.
    return pd.read_csv(filename, names=cols)
    
def print_cv_results(pl, X, y, cv=5):
    '''Cross-validate `pl` on (X, y) and print macro precision, recall and F1 per fold.'''
    metric_names = ['precision_macro', 'recall_macro', 'f1_macro']
    results = cross_validate(pl, X, y, scoring=metric_names, cv=cv)
    # cross_validate stores each metric's per-fold scores under 'test_<metric>'.
    per_fold = zip(*(results['test_' + name] for name in metric_names))
    for precision, recall, f1_score in per_fold:
        print('p: {}\tr: {}\tf1: {}'.format(precision, recall, f1_score))

def report_results(y_test, predicted):
    '''Print the classification report and confusion matrix for one train / test split.'''
    report = metrics.classification_report(y_test, predicted)
    print(report, 'Confusion Matrix:', sep='\n')
    print(metrics.confusion_matrix(y_test, predicted))

def print_predictions(X_test, y_test, predicted, n=5):
    '''Print example messages for each of TP, TN, FP and FN of a classifier.

    Parameters
    ----------
    X_test : mapping or DataFrame
        Must support X_test['msg'] and yield one commit message per sample.
    y_test : iterable
        True labels, aligned with X_test['msg'].
    predicted : iterable
        Predicted labels, aligned with y_test.
    n : int, default 5
        Maximum number of examples printed per category.

    Only the first line of each message is printed. Empty messages print as
    a blank line and non-string values are printed via str() instead of
    raising.
    '''
    def first_line(msg):
        # ''.splitlines() is [], so indexing [0] directly would raise
        # IndexError; str() keeps non-string cells from raising AttributeError.
        lines = str(msg).splitlines()
        return lines[0] if lines else ''

    tps, fps, tns, fns = [], [], [], []
    for x, y, pred in zip(list(X_test['msg']), y_test, predicted):
        # Compare with `==` rather than `is`: labels coming from numpy/pandas
        # are not the Python singletons True/False.
        if y == True and pred == True:
            tps.append(x)
        elif y == False and pred == True:
            fps.append(x)
        elif y == False and pred == False:
            tns.append(x)
        elif y == True and pred == False:
            fns.append(x)

    # Optionally shuffle the lists here
    sections = [
        ('True Positives (Real Bug):', tps),
        ('\nTrue Negatives (Real Non-bug):', tns),
        ('\nFalse Positive (Wrong bug):', fps),
        ('\nFalse Negative (Wrong Non-bug):', fns),
    ]
    for title, msgs in sections:
        print(title)
        for msg in msgs[:n]:
            print(first_line(msg))

In [3]:
class StrExtractor(BaseEstimator, TransformerMixin):
    'Pulls one dataframe column out as a flat python list of str values.'

    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self, df, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, df, y=None):
        column = df[self.col_name]
        # str() is applied to every cell, so non-string values are stringified.
        return list(map(str, column))

In [4]:
class IntExtractor(BaseEstimator, TransformerMixin):
    'Pulls one dataframe column out as an integer ndarray of shape (n_samples, 1).'

    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self, df, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, df, y=None):
        as_ints = list(map(int, df[self.col_name]))
        # Reshape to a single feature column so it can be stacked with other features.
        return np.array(as_ints).reshape(-1, 1)

In [5]:
class LineLimiter(BaseEstimator, TransformerMixin):
    'Keeps only the first max_lines lines of each string; max_lines=-1 disables truncation.'

    def __init__(self, max_lines=1):
        self.max_lines = max_lines

    def fit(self, df, y=None):
        # Stateless transformer: nothing is learned from the data.
        return self

    def transform(self, str_arr, y=None):
        limit = self.max_lines
        if limit == -1:
            # Sentinel value: pass the input through untouched.
            return str_arr
        truncated = []
        for text in str_arr:
            kept_lines = text.splitlines()[:limit]
            truncated.append('\n'.join(kept_lines))
        return truncated

In [6]:
# Load all sample data (one dataframe per entry in PROJECTS)
dfs = [load_sample_df(name) for name in PROJECTS]

# Combine into a single dataframe
df = pd.concat(dfs)

# Shuffle this dataframe.
# The seed must be set immediately before sample(): no random_state is passed,
# so the shuffle presumably draws from numpy's global RNG -- keep this order.
np.random.seed(0)
df = df.sample(frac=1).reset_index(drop=True)

# Split into train and test (one third held out).
# X keeps every column, including 'is_bug'; the pipelines below select the
# columns they use through StrExtractor / IntExtractor.
X = df
y = df['is_bug']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

## Bag of Words Features

In [7]:
# This is our Bag of Words pipeline:
#   'msg' column -> first line only -> unigram + bigram counts -> tf-idf weights.
# NOTE: this single object is reused (by reference, not copied) in every
# classifier pipeline below, so fitting any of them refits these steps.
bow_pl = Pipeline([
    ('msg', StrExtractor(col_name='msg')),
    ('line_limit', LineLimiter(max_lines=1)),
    ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
])

In [8]:
# Baseline classifier: Bernoulli Naive Bayes on the bag-of-words features
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', BernoulliNB(alpha=0.5)),
])

# Fit on the training split, then score the held-out split.
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
report_results(y_test, pred)
# Alternative evaluations, left disabled:
# print_cv_results(pl, X, y)
# print_predictions(X_test, y_test, pred)

              precision    recall  f1-score   support

       False       0.86      0.84      0.85     22423
        True       0.88      0.89      0.89     29143

    accuracy                           0.87     51566
   macro avg       0.87      0.87      0.87     51566
weighted avg       0.87      0.87      0.87     51566

Confusion Matrix:
[[18946  3477]
 [ 3099 26044]]


In [9]:
# Linear SVM classifier (LinearSVC) on the bag-of-words features
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', LinearSVC()),
])

# Fit on the training split, then score the held-out split.
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
report_results(y_test, pred)
# Alternative evaluations, left disabled:
# print_cv_results(pl, X, y)
# print_predictions(X_test, y_test, pred)

              precision    recall  f1-score   support

       False       0.87      0.86      0.87     22423
        True       0.89      0.90      0.90     29143

    accuracy                           0.89     51566
   macro avg       0.88      0.88      0.88     51566
weighted avg       0.89      0.89      0.89     51566

Confusion Matrix:
[[19305  3118]
 [ 2790 26353]]


In [10]:
# Two hand-written commit messages used as a quick sanity check.
messages = [
    'Fix issue with Dog.java',
    'Update Dog.java'
]

# `pl` is whichever pipeline was fitted last (the LinearSVC one when the cells
# run in order). Only a 'msg' column is supplied, which is all the
# bag-of-words pipeline reads.
pl.predict(pd.DataFrame(data={'msg': messages}))

array([ True, False])

## Non-Text Features

In [11]:
# Binary feature: 1 when the commit added at least one line, else 0
# (Binarizer's documented default threshold is 0.0).
has_adds_pl = Pipeline([
    ('loc_added_ext', IntExtractor(col_name='loc_added')),
    ('loc_added_thr', Binarizer())
])

# Binary feature: 1 when the commit removed at least one line, else 0.
has_dels_pl = Pipeline([
    ('loc_removed_ext', IntExtractor(col_name='loc_removed')),
    ('loc_removed_thr', Binarizer())
])

# Combine the text features with the three non-text features.
# 'n_files' is passed through as a raw count (no scaling or thresholding).
# 'bow' is the shared bow_pl object, not a copy.
nontext_pl = FeatureUnion([
    ('bow', bow_pl),
    ('n_files', IntExtractor(col_name='n_files')),
    ('has_adds', has_adds_pl),
    ('has_dels', has_dels_pl)
])

In [12]:
# Bernoulli Naive Bayes on the combined text + non-text features
pl = Pipeline(steps=[
    ('features', nontext_pl),
    ('clf', BernoulliNB(alpha=0.05)),
])

# Fit on the training split, then score the held-out split.
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
report_results(y_test, pred)
# Alternative evaluations, left disabled:
# print_cv_results(pl, X, y)
# print_predictions(X_test, y_test, pred)

              precision    recall  f1-score   support

       False       0.85      0.87      0.86     22423
        True       0.90      0.88      0.89     29143

    accuracy                           0.88     51566
   macro avg       0.87      0.88      0.87     51566
weighted avg       0.88      0.88      0.88     51566

Confusion Matrix:
[[19510  2913]
 [ 3440 25703]]


## Test Split Between Projects

In [13]:
# Earlier configuration, kept for reference (train on most projects, test on hadoop):
# TRAIN_PROJECTS = ['abdera', 'activemq', 'airflow', 'arrow', 'calcite', 'flink', 'geode', 'hudi', 'jena', 'kafka', 'math', 'maven', 'rat']
# TEST_PROJECTS  = ['hadoop']
# Current configuration: test on whole projects that are absent from training.
# Note that 'hadoop' and 'hbase' (both in PROJECTS) appear in neither list.
TRAIN_PROJECTS = ['abdera', 'activemq', 'airflow', 'calcite', 'geode', 'hudi', 'jena', 'kafka', 'math', 'maven', 'rat']
TEST_PROJECTS = ['flink', 'arrow']

# Load all sample data
train_df = pd.concat([load_sample_df(name) for name in TRAIN_PROJECTS])
test_df  = pd.concat([load_sample_df(name) for name in TEST_PROJECTS])

# Shuffle these dataframes.
# No random_state is passed to sample(), so both shuffles presumably draw from
# the global numpy RNG seeded here -- keep the seed and the two calls in this order.
np.random.seed(0)
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

# Truncate test (disabled)
# test_df = test_df.truncate(after=14999)

# Assign the train and test sets.
# This overwrites X_train / X_test / y_train / y_test from the random split above.
X_train = train_df
X_test = test_df
y_train = train_df['is_bug']
y_test = test_df['is_bug']

In [14]:
# Display the per-project training frame.
X_train

Unnamed: 0,hash,msg,n_files,loc_added,loc_removed,issue_id,is_bug
0,4386cd4145c15f03c61911d1c24c698d00db25b1,[AIRFLOW-1677] Fix typo in example_qubole_oper...,1,2,1,AIRFLOW-1677,True
1,367e0a1557307ab6b8ba30c5a8b8c15324f6b552,GEODE-8144 another attempt to fix a failing te...,1,2,0,GEODE-8144,True
2,efa5ba81275f806fbeaf85993007ead3aa8d7381,[AIRFLOW-3677] Improve CheckOperator test cove...,2,88,54,AIRFLOW-3677,False
3,e377ed7d3d76a244a1494025231851fac8b37350,GEODE-6850: change int stats to long (#3713)\n...,33,648,673,GEODE-6850,False
4,7d36773fb79bb84609a58033b5dd0c8974dd18eb,"KAFKA-8340, KAFKA-8819: Use PluginClassLoader ...",19,1402,86,KAFKA-8340,True
...,...,...,...,...,...,...,...
30166,b55215dcb64c86554d531b0b475e85f013e76fc6,GEODE-3563: use a timeout for newly created so...,7,52,30,GEODE-3563,True
30167,db232c9b499403f7bca7eea75d21ae0b195ad2cd,JENA-151 - fix maven warnings.\n\ngit-svn-id: ...,1,14,14,JENA-151,True
30168,5e00b10e76023eb847052eb72d8e8ce5a15cb38e,GEODE-8134: convert String commands to use Fun...,43,1073,533,GEODE-8134,False
30169,343a7d226f12e6a6b690ba98168665b88c16e97c,GEODE-7194: simplify CMS 'get' return type (#4...,27,375,156,GEODE-7194,False


In [15]:
# Display the per-project test frame.
X_test

Unnamed: 0,hash,msg,n_files,loc_added,loc_removed,issue_id,is_bug
0,b422fe27879ab2d5ad54f782b7321b1b5808c245,[FLINK-990] Added constant fields and combinab...,3,218,2,FLINK-990,True
1,564f468e3fffd24d258c46edd6d368b6acdc787d,ARROW-6813: [Ruby] Arrow::Table.load with head...,2,85,3,ARROW-6813,True
2,2d431104c10a500973711d3151d6896704a535fa,ARROW-7264: [Java] RangeEqualsVisitor type che...,8,94,40,ARROW-7264,True
3,2cdec3fb9ed2d2502ce6acdbcf0322e535efab47,[FLINK-10222] [table] Fix parsing of keywords....,2,70,1,FLINK-10222,True
4,7f048a4b8bdc6a20cd8f6eeca928ecbb6db7dd96,ARROW-356: Add documentation about reading Par...,6,355,108,ARROW-356,False
...,...,...,...,...,...,...,...
23615,c30c86b8935caa3c43a2e8c83c863bce70d98724,[FLINK-5529] [docs] Improve / extends windowin...,1,272,381,FLINK-5529,False
23616,0523ef6451a93da450c6bdf5dd4757c3702f3962,[FLINK-15090][api] Reverse the dependency from...,17,243,298,FLINK-15090,True
23617,e2eb6d41a0abcc206966249d83ceb0f450b2cf6b,[FLINK-13384][runtime] Set context class loade...,1,5,3,FLINK-13384,True
23618,5f5f02b1272ceba5e72ac8bb29e3d260d66bd493,[FLINK-12726][table-common] Fix ANY type seria...,2,13,11,FLINK-12726,False


In [16]:
# Bernoulli Naive Bayes, trained and tested on disjoint sets of projects
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', BernoulliNB(alpha=0.5)),
])

# Fit on the training projects, then score the unseen test projects.
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
report_results(y_test, pred)

              precision    recall  f1-score   support

       False       0.69      0.62      0.65     13436
        True       0.56      0.63      0.59     10184

    accuracy                           0.62     23620
   macro avg       0.62      0.63      0.62     23620
weighted avg       0.63      0.62      0.63     23620

Confusion Matrix:
[[8325 5111]
 [3759 6425]]


In [17]:
# Linear SVM classifier (LinearSVC), trained and tested on disjoint sets of projects
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', LinearSVC()),
])

# Fit on the training projects, then score the unseen test projects.
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
report_results(y_test, pred)

              precision    recall  f1-score   support

       False       0.73      0.67      0.70     13436
        True       0.60      0.67      0.64     10184

    accuracy                           0.67     23620
   macro avg       0.67      0.67      0.67     23620
weighted avg       0.67      0.67      0.67     23620

Confusion Matrix:
[[8961 4475]
 [3349 6835]]
