In [3]:
import os
import pandas as pd
import numpy as np
import sklearn as sk
from math import floor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import recall_score
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
# Relative path of the directory holding one '<project>_samples.csv' file per project.
DATASET_DIR = 'DATASET-SNAPSHOT-01'

# Project names; each one is the filename prefix that load_sample_df looks up
# inside DATASET_DIR.
PROJECTS = [
    'abdera', 'activemq', 'airflow', 'arrow', 'calcite',
    'flink', 'geode', 'hadoop', 'hbase', 'hudi',
    'jena', 'kafka', 'math', 'maven', 'rat',
]

def load_sample_df(project_name):
    '''Read one project's sample file into a pandas dataframe.

    The file is expected at DATASET_DIR/<project_name>_samples.csv. The column
    names are supplied explicitly through `names=` rather than taken from the
    file.

    Args:
        project_name: Filename prefix of the project to load.

    Returns:
        The dataframe produced by pd.read_csv, with columns hash, msg,
        n_files, loc_added, loc_removed, issue_id and is_bug.
    '''
    column_names = ['hash', 'msg', 'n_files', 'loc_added', 'loc_removed', 'issue_id', 'is_bug']
    sample_file = project_name + '_samples.csv'
    sample_path = os.path.join(DATASET_DIR, sample_file)
    sample_df = pd.read_csv(sample_path, names=column_names)
    return sample_df
    
def print_cv_results(pl, X, y, cv=5):
    '''Cross-validate `pl` and print one line of scores per fold.

    Args:
        pl: Estimator / pipeline handed to cross_validate.
        X: Feature data handed to cross_validate.
        y: Labels handed to cross_validate.
        cv: Passed through as cross_validate's `cv` argument.

    Each printed line shows macro-averaged precision, recall and F1 for one
    fold, separated by tabs.
    '''
    metric_names = ['precision_macro', 'recall_macro', 'f1_macro']
    cv_scores = cross_validate(pl, X, y, scoring=metric_names, cv=cv)
    # cross_validate reports each metric under the key 'test_<metric>'.
    per_metric = [cv_scores['test_' + metric] for metric in metric_names]
    for precision, recall, f1 in zip(*per_metric):
        print('p: {}\tr: {}\tf1: {}'.format(precision, recall, f1))

def report_results(y_test, predicted):
    '''Print a classification report and confusion matrix for a train / test split.

    Args:
        y_test: True labels of the test samples.
        predicted: Labels predicted for the same samples.
    '''
    report_text = metrics.classification_report(y_test, predicted)
    print(report_text)
    print('Confusion Matrix:')
    matrix = metrics.confusion_matrix(y_test, predicted)
    print(matrix)

def print_predictions(X_test, y_test, predicted, n=5):
    '''Print out examples of TP, TN, FP, and FN for a classifier.

    Args:
        X_test: Object indexable by 'msg' (e.g. a dataframe) giving the
            messages of the test samples.
        y_test: True labels, aligned with the messages.
        predicted: Predicted labels, aligned with the messages.
        n: Maximum number of examples printed per category.

    Only the first line of each message is printed. Messages are converted
    with str() first (the same coercion StrExtractor applies), so a
    non-string cell such as NaN does not raise, and an empty message prints
    as an empty line instead of raising IndexError.
    '''
    def first_line(msg):
        # splitlines() returns [] for an empty string, so guard the [0].
        lines = str(msg).splitlines()
        return lines[0] if lines else ''

    tps, fps, tns, fns = [], [], [], []
    for x, y, pred in zip(list(X_test['msg']), y_test, predicted):
        # Explicit == comparisons: labels that are neither True nor False
        # fall into no bucket, as before.
        if y == True and pred == True:
            tps.append(x)
        elif y == False and pred == True:
            fps.append(x)
        elif y == False and pred == False:
            tns.append(x)
        elif y == True and pred == False:
            fns.append(x)

    # Optionally shuffle the lists here
    sections = [
        ('True Positives (Real Bug):', tps),
        ('\nTrue Negatives (Real Non-bug):', tns),
        ('\nFalse Positive (Wrong bug):', fps),
        ('\nFalse Negative (Wrong Non-bug):', fns),
    ]
    for header, msgs in sections:
        print(header)
        for msg in msgs[:n]:
            print(first_line(msg))

In [5]:
class StrExtractor(BaseEstimator, TransformerMixin):
    '''Pipeline step that pulls one dataframe column out as a 1D python list of strings.

    Every value in the column is converted with str().
    '''

    def __init__(self, col_name):
        # Name of the column to extract.
        self.col_name = col_name

    def fit(self, df, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, df, y=None):
        column = df[self.col_name]
        return list(map(str, column))

In [6]:
class IntExtractor(BaseEstimator, TransformerMixin):
    '''Pipeline step that pulls one dataframe column out as a (n_samples, 1) ndarray.

    Every value in the column is converted with int().
    '''

    def __init__(self, col_name):
        # Name of the column to extract.
        self.col_name = col_name

    def fit(self, df, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, df, y=None):
        int_values = list(map(int, df[self.col_name]))
        # Reshape the flat vector into a single-column 2D array.
        column_vector = np.array(int_values).reshape(-1, 1)
        return column_vector

In [7]:
class LineLimiter(BaseEstimator, TransformerMixin):
    '''Pipeline step that keeps only the first max_lines lines of each string.

    A max_lines of -1 disables truncation and returns the input unchanged.
    '''

    def __init__(self, max_lines=1):
        # Number of leading lines to keep per string (-1 keeps everything).
        self.max_lines = max_lines

    def fit(self, df, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, str_arr, y=None):
        limit = self.max_lines
        if limit == -1:
            return str_arr

        truncated = []
        for text in str_arr:
            kept_lines = text.splitlines()[:limit]
            truncated.append('\n'.join(kept_lines))
        return truncated

In [8]:
class MyTokenizer(BaseEstimator, TransformerMixin):
    '''Pipeline step that turns an array of strings into an array of token arrays.

    Each string is tokenized with gensim.utils.simple_preprocess.
    '''
    # NOTE(review): `gensim` is not imported in the import cell visible in this
    # notebook -- confirm it is imported before transform() is called.

    def __init__(self):
        pass

    def fit(self, df, y=None):
        # Nothing is learned from the data; return self so calls can be chained.
        return self

    def transform(self, arr, y=None):
        token_lists = []
        for text in arr:
            token_lists.append(gensim.utils.simple_preprocess(text))
        return token_lists

In [9]:
class AvgWord2Vec(BaseEstimator, TransformerMixin):
    '''Pipeline step that represents each document by the mean of its Word2Vec vectors.

    fit() trains a gensim Word2Vec model on the given token lists; transform()
    averages the vectors of every in-vocabulary token of a document. A document
    with no in-vocabulary tokens (including an empty one) is mapped to the
    all-zero vector.

    Args:
        size: Dimensionality passed to Word2Vec and width of the output array.
    '''
    # NOTE(review): `gensim` is not imported in the import cell visible in this
    # notebook -- confirm it is imported before fit() is called. The `size=`
    # keyword and `wv.vocab` are kept exactly as the original code used them.

    def __init__(self, size=100):
        self.size = size
        self.model = None   # trained Word2Vec model, set by fit()
        self.vocab = None   # list of words known to the model, set by fit()

    def transform(self, arr, y=None):
        '''Transforms a list of list of tokens to 2D numpy array of (samples, word_vecs)

        Raises:
            ValueError: If fit() has not been called yet.
        '''
        if self.model is None:
            raise ValueError('AvgWord2Vec is not fitted yet; call fit() before transform().')

        # Set membership is O(1) per token; self.vocab itself stays a list.
        known_words = set(self.vocab)

        # Start from zeros so documents without any known token get a defined
        # (all-zero) row instead of NaN / a TypeError.
        word_vecs = np.zeros([len(arr), self.size])
        for row, tokens in enumerate(arr):
            vecs = [self.model.wv[t] for t in tokens if t in known_words]
            if vecs:
                word_vecs[row] = np.mean(np.array(vecs), axis=0)
        return word_vecs

    def fit(self, arr, y=None):
        '''Train the Word2Vec model on `arr` (a list of token lists) and record its vocabulary.'''
        self.model = gensim.models.Word2Vec(sentences=arr, size=self.size)
        self.vocab = list(self.model.wv.vocab)
        return self

In [10]:
# Read every project's samples from disk
dfs = list(map(load_sample_df, PROJECTS))

# Stack them into one dataframe
df = pd.concat(dfs)

# Shuffle the rows and renumber the index; NumPy's global RNG is seeded first
# (presumably so the shuffle order is repeatable)
np.random.seed(0)
df = df.sample(frac=1).reset_index(drop=True)

# Hold out a third of the rows as the test set. X keeps every column (the
# extractor steps pick out the ones they need); y is the 'is_bug' column.
X, y = df, df['is_bug']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, random_state=0)

## Bag of Words Features

In [1]:
# Bag of Words pipeline:
# 'msg' column -> first line only -> 1-/2-gram counts (English stop words removed) -> tf-idf
bow_pl = Pipeline(steps=[
    ('msg', StrExtractor('msg')),
    ('line_limit', LineLimiter(max_lines=1)),
    ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
])

NameError: name 'Pipeline' is not defined

In [None]:
# Bernoulli Naive Bayes on the Bag of Words features
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', BernoulliNB(alpha=0.5)),
])

# Cross-validated scores over the full dataset
print_cv_results(pl, X, y)

# Fit on the training split, then show example predictions on the test split
pl.fit(X_train, y_train)
pred = pl.predict(X_test)
print_predictions(X_test, y_test, pred)

In [2]:
# Linear SVM classifier (LinearSVC) on the Bag of Words features
pl = Pipeline(steps=[
    ('bow', bow_pl),
    ('clf', LinearSVC()),
])

# Cross-validated scores over the full dataset
print_cv_results(pl, X, y)

NameError: name 'Pipeline' is not defined

## Non-Textual Features

In [18]:
# 'loc_added' as an integer column, then binarized with Binarizer's defaults
has_adds_pl = Pipeline(steps=[
    ('loc_added_ext', IntExtractor('loc_added')),
    ('loc_added_thr', Binarizer()),
])

# 'loc_removed' as an integer column, then binarized with Binarizer's defaults
has_dels_pl = Pipeline(steps=[
    ('loc_removed_ext', IntExtractor('loc_removed')),
    ('loc_removed_thr', Binarizer()),
])

# Bag of Words features side by side with the non-textual ones
nontext_pl = FeatureUnion(transformer_list=[
    ('bow', bow_pl),
    ('n_files', IntExtractor('n_files')),
    ('has_adds', has_adds_pl),
    ('has_dels', has_dels_pl),
])

In [19]:
# Bernoulli Naive Bayes on the combined textual + non-textual features
pl = Pipeline(steps=[
    ('features', nontext_pl),
    ('clf', BernoulliNB(alpha=0.05)),
])

# Cross-validated scores over the full dataset
print_cv_results(pl, X, y)

p: 0.8926518215516154	r: 0.8946309897171532	f1: 0.8935448991087573
p: 0.8904073574599995	r: 0.8924550259162589	f1: 0.8913262170923635
p: 0.8925794955865008	r: 0.8947238209228148	f1: 0.893536635629868
p: 0.8946343512794714	r: 0.8965311468153789	f1: 0.8954955638904609
p: 0.8914941345938737	r: 0.8939482797589222	f1: 0.8925652606938594
