In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from xgboost import XGBClassifier

We train Random Forest, Decision Tree, and Naive Bayes classifiers to predict defects, using the suspicious clusters we found in Notebook 4 as the positive class.

In [2]:
# Each entry is (dataset name, id of the cluster treated as "suspicious").
# The name selects the file 'results/<name>_clusters.csv'; samples in the
# suspicious cluster become the positive class during training.
datasets = [
    ('all_metrics', 3),
    ('all_no_outliers', 5),
    ('absolute', 5),
    ('absolute_no_outliers', 3),
    ('relative', 4),
    ('relative_no_outliers', 5),
    ('no_corr_metrics', 5),
    ('no_corr_metrics_no_outliers', 5),
]

# Classifiers evaluated on every dataset, keyed by the label used in the
# printed output and in `results`.
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': GaussianNB(),
}

# Metric columns used for the "important" feature subset of each dataset.
important_metrics = {
    'all_metrics': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'etp_count',
        'nnnv_count', 'nun_count', 'nts_count', 'ntun_count', 'nlp_count',
    ],
    'all_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ntun_count', 'nun_count',
        'ncd_count', 'etp_count', 'nmd_count', 'atss_count', 'ntvr_count',
    ],
    'absolute': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'etp_count',
        'nnnv_count', 'nun_count', 'nts_count', 'ntun_count', 'atss_count',
    ],
    # NOTE(review): this list originally named 'nun_count' twice (5th and 8th
    # entry). Selecting a column twice feeds the same feature to the model
    # twice, so the duplicate is dropped here. The metric that was meant to be
    # in the 8th position is unknown -- confirm against the importance ranking.
    'absolute_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ntun_count', 'nun_count',
        'ncd_count', 'etp_count', 'atss_count', 'ntvr_count',
    ],
    'relative': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'nts_count',
        'etp_count', 'ntvr_count', 'atss_count', 'nmd_count', 'nlp_count',
    ],
    'relative_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'etp_count', 'ncd_count',
        'atss_count', 'nts_count', 'ntvr_count', 'nmd_count', 'nsh_count',
    ],
    'no_corr_metrics': [
        'loc_count', 'atss_count', 'etp_count', 'nmd_count', 'nfl_count',
        'nemd_count_relative', 'nlo_count', 'ntnn_count', 'nlp_count',
        'nun_count_relative',
    ],
    'no_corr_metrics_no_outliers': [
        'loc_count', 'atss_count', 'etp_count', 'nfl_count', 'nlo_count',
        'nmd_count', 'nlp_count', 'ntnn_count', 'nemd_count_relative',
        'nun_count_relative',
    ],
}

# Test set: rows from both CSV files, with incomplete rows removed before the
# two frames are stacked.
test_prior = pd.read_csv('results/metrics_prior.csv').dropna()
test_fix = pd.read_csv('results/metrics_fix.csv').dropna()
test = pd.concat([test_prior, test_fix])

# results[dataset name][classifier name] -> dict of scores.
results = {}

# Seed NumPy's global RNG before any model is fitted.
np.random.seed(0)

def _fit_and_score(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features and binary labels.
    X_test, y_test : test features and binary labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, fscore)`` as returned by
        ``accuracy_score``, ``precision_score``, ``recall_score`` and
        ``f1_score``.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    return (accuracy_score(y_test, pred),
            precision_score(y_test, pred),
            recall_score(y_test, pred),
            f1_score(y_test, pred))


for dataset_name, suspicious_cluster in datasets:
    print(dataset_name)
    data = pd.read_csv('results/{}_clusters.csv'.format(dataset_name))
    data = data.drop(columns=['Unnamed: 0'])
    data['clusters'] = data['clusters'].astype('category')
    # Every column except the last one is used as a feature.
    # NOTE(review): this assumes 'clusters' is the last column -- confirm.
    columns = data.columns[:-1]
    metrics = important_metrics[dataset_name][:10]
    results[dataset_name] = {}

    # Labels do not depend on the classifier or the feature subset, so they
    # are built once per dataset: 1 for the suspicious cluster, 0 otherwise.
    y_train = [1 if v == suspicious_cluster else 0
               for v in np.array(data['clusters'])]
    y_test = test['y']

    # (prefix of the keys in `results`, label used in the printout, features).
    # The "important" subset is evaluated first, then all features.
    feature_sets = [('important', 'Imp', metrics),
                    ('all', 'all', columns)]

    for name, clf in classifiers.items():
        results[dataset_name][name] = {}

        for key_prefix, label, features in feature_sets:
            # Train and test features are both passed as plain arrays so the
            # estimator sees the same input type in fit() and predict().
            X_train = np.array(data[features])
            X_test = np.asarray(test[features])

            accuracy, precision, recall, fscore = _fit_and_score(
                clf, X_train, y_train, X_test, y_test)

            scores = results[dataset_name][name]
            scores[key_prefix + '_accuracy'] = round(accuracy, 2)
            scores[key_prefix + '_precision'] = round(precision, 2)
            scores[key_prefix + '_recall'] = round(recall, 2)
            scores[key_prefix + '_fscore'] = round(fscore, 2)

            print(name, label + " Accuracy: ", accuracy)
            print(name, label + " Precision: ", precision)
            print(name, label + " Recall: ", recall)
            print(name, label + " fscore: ", fscore)
            print()

    print()

all_metrics
RandomForestClassifier Imp Accuracy:  0.5081967213114754
RandomForestClassifier Imp Precision:  0.5
RandomForestClassifier Imp Recall:  0.06666666666666667
RandomForestClassifier Imp fscore:  0.11764705882352941

RandomForestClassifier all Accuracy:  0.5081967213114754
RandomForestClassifier all Precision:  0.5
RandomForestClassifier all Recall:  0.06666666666666667
RandomForestClassifier all fscore:  0.11764705882352941

Decision Tree Imp Accuracy:  0.5081967213114754
Decision Tree Imp Precision:  0.5
Decision Tree Imp Recall:  0.06666666666666667
Decision Tree Imp fscore:  0.11764705882352941

Decision Tree all Accuracy:  0.5081967213114754
Decision Tree all Precision:  0.5
Decision Tree all Recall:  0.06666666666666667
Decision Tree all fscore:  0.11764705882352941

Naive Bayes Imp Accuracy:  0.5081967213114754
Naive Bayes Imp Precision:  0.5
Naive Bayes Imp Recall:  0.1
Naive Bayes Imp fscore:  0.16666666666666669

Naive Bayes all Accuracy:  0.5081967213114754
Naive Bay

We use the same techniques to recognize bug-fixing commits.

In [4]:
# Each entry is (dataset name, id of the cluster treated as "suspicious").
# The name selects the file 'results/<name>_clusters.csv'.
datasets = [
    ('all_metrics', 3),
    ('all_no_outliers', 5),
    ('absolute', 5),
    ('absolute_no_outliers', 3),
    ('relative', 4),
    ('relative_no_outliers', 5),
    ('no_corr_metrics', 5),
    ('no_corr_metrics_no_outliers', 5),
]

# Classifiers evaluated on every dataset, keyed by the label used in the
# printed output and in `results`.
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': GaussianNB(),
}

# Metric columns used for the "important" feature subset of each dataset.
important_metrics = {
    'all_metrics': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'etp_count',
        'nnnv_count', 'nun_count', 'nts_count', 'ntun_count', 'nlp_count',
    ],
    'all_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ntun_count', 'nun_count',
        'ncd_count', 'etp_count', 'nmd_count', 'atss_count', 'ntvr_count',
    ],
    'absolute': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'etp_count',
        'nnnv_count', 'nun_count', 'nts_count', 'ntun_count', 'atss_count',
    ],
    # NOTE(review): this list originally named 'nun_count' twice (5th and 8th
    # entry). Selecting a column twice feeds the same feature to the model
    # twice, so the duplicate is dropped here. The metric that was meant to be
    # in the 8th position is unknown -- confirm against the importance ranking.
    'absolute_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ntun_count', 'nun_count',
        'ncd_count', 'etp_count', 'atss_count', 'ntvr_count',
    ],
    'relative': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'ncd_count', 'nts_count',
        'etp_count', 'ntvr_count', 'atss_count', 'nmd_count', 'nlp_count',
    ],
    'relative_no_outliers': [
        'ntkn_count', 'loc_count', 'nkeys_count', 'etp_count', 'ncd_count',
        'atss_count', 'nts_count', 'ntvr_count', 'nmd_count', 'nsh_count',
    ],
    'no_corr_metrics': [
        'loc_count', 'atss_count', 'etp_count', 'nmd_count', 'nfl_count',
        'nemd_count_relative', 'nlo_count', 'ntnn_count', 'nlp_count',
        'nun_count_relative',
    ],
    'no_corr_metrics_no_outliers': [
        'loc_count', 'atss_count', 'etp_count', 'nfl_count', 'nlo_count',
        'nmd_count', 'nlp_count', 'ntnn_count', 'nemd_count_relative',
        'nun_count_relative',
    ],
}

# Test data: incomplete rows are removed from each file before use.
test_prior = pd.read_csv('results/metrics_prior.csv').dropna()
test_fix = pd.read_csv('results/metrics_fix.csv').dropna()
test = pd.concat([test_prior, test_fix])

# A row is identified by the first two '__'-separated fields of its 'file'
# value; prior and fix rows sharing that pair are matched below.
names_prior = [f.split("__")[:2] for f in test_prior['file']]
names_fix = [f.split("__")[:2] for f in test_fix['file']]

# Position of the first occurrence of each identifier in names_fix. This gives
# the same match as list.index() without rescanning the list for every row.
first_fix_position = {}
for position, names in enumerate(names_fix):
    first_fix_position.setdefault(tuple(names), position)

# mapper holds [position in test_prior, position in test_fix] for every prior
# row that has a counterpart among the fix rows.
mapper = [[index, first_fix_position[tuple(names)]]
          for index, names in enumerate(names_prior)
          if tuple(names) in first_fix_position]

# results[dataset name][classifier name] -> dict of scores.
results = {}

# Seed NumPy's global RNG before any model is fitted.
np.random.seed(0)

def _prior_fix_scores(clf, X_train, y_train, X_prior, y_prior, X_fix, y_fix,
                      pairs):
    """Fit ``clf`` and evaluate it separately on the prior and the fix rows.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``.
    X_train, y_train : training features and binary labels.
    X_prior, y_prior : features and labels of the prior rows.
    X_fix, y_fix : features and labels of the fix rows.
    pairs : iterable of ``[prior position, fix position]`` linking a prior row
        to its fix row.

    Returns
    -------
    tuple
        ``(changes, score_prior, score_fix, precision, recall)`` where
        ``changes`` counts the pairs predicted 1 on the prior row and 0 on the
        fix row, ``score_*`` are accuracies, ``precision`` is the share of all
        positive predictions that fall on prior rows (NaN when nothing is
        predicted positive) and ``recall`` is the share of prior rows predicted
        positive.
    """
    clf.fit(X_train, y_train)
    predict_prior = clf.predict(X_prior)
    predict_fix = clf.predict(X_fix)

    score_prior = accuracy_score(y_prior, predict_prior)
    score_fix = accuracy_score(y_fix, predict_fix)

    # NOTE(review): both ratios treat every prior row as a true positive and
    # every fix row as a true negative -- presumably 'y' is 1 for all prior
    # rows and 0 for all fix rows; confirm against the CSV files.
    flagged_prior = predict_prior.sum()
    flagged_total = flagged_prior + predict_fix.sum()
    # Explicit NaN instead of a 0/0 division warning when nothing is flagged.
    precision = flagged_prior / flagged_total if flagged_total else np.nan
    recall = flagged_prior / (flagged_prior + (predict_prior == 0).sum())

    # A pair counts when the prediction flips from 1 (prior) to 0 (fix).
    changes = sum(1 for i, j in pairs
                  if predict_prior[i] == 1 and predict_fix[j] == 0)

    return changes, score_prior, score_fix, precision, recall


for df, suspicious_cluster in datasets:
    print(df)
    data = pd.read_csv('results/{}_clusters.csv'.format(df))
    data = data.drop(columns=['Unnamed: 0'])
    data['clusters'] = data['clusters'].astype('category')
    # Every column except the last one is used as a feature.
    # NOTE(review): this assumes 'clusters' is the last column -- confirm.
    columns = data.columns[:-1]
    metrics = important_metrics[df][:10]
    results[df] = {}

    # Labels are the same for every classifier and feature subset:
    # 1 for the suspicious cluster, 0 otherwise.
    y_train = [1 if v == suspicious_cluster else 0
               for v in np.array(data['clusters'])]
    y_test_prior = test_prior['y']
    y_test_fix = test_fix['y']

    # (features, label used in the printout, keys written to `results`).
    # Keys are kept exactly as before so downstream tables are unchanged.
    # All features are evaluated first, then the "important" subset.
    feature_runs = [
        (columns, "all changes: ",
         ('all changes', 'all_prior', 'all_fix',
          'all_precision', 'all_recall')),
        (metrics, "changes: ",
         ('important changes', 'important_prior', 'important_fix',
          'important precision', 'important recall')),
    ]

    for name, clf in classifiers.items():
        results[df][name] = {}

        for features, print_label, keys in feature_runs:
            # Train and test features are all passed as plain arrays so the
            # estimator sees the same input type in fit() and predict().
            scores = _prior_fix_scores(
                clf,
                np.array(data[features]), y_train,
                np.asarray(test_prior[features]), y_test_prior,
                np.asarray(test_fix[features]), y_test_fix,
                mapper)

            results[df][name].update(zip(keys, scores))
            print(name, print_label, scores[0])

    print()

all_metrics
RandomForestClassifier all changes:  0
RandomForestClassifier changes:  0
Decision Tree all changes:  0
Decision Tree changes:  0
Naive Bayes all changes:  1
Naive Bayes changes:  1

all_no_outliers
RandomForestClassifier all changes:  0
RandomForestClassifier changes:  1
Decision Tree all changes:  1
Decision Tree changes:  1
Naive Bayes all changes:  0
Naive Bayes changes:  3

absolute
RandomForestClassifier all changes:  0
RandomForestClassifier changes:  0
Decision Tree all changes:  0
Decision Tree changes:  0
Naive Bayes all changes:  7
Naive Bayes changes:  0

absolute_no_outliers
RandomForestClassifier all changes:  1
RandomForestClassifier changes:  1
Decision Tree all changes:  1
Decision Tree changes:  1
Naive Bayes all changes:  1
Naive Bayes changes:  3

relative
RandomForestClassifier all changes:  0
RandomForestClassifier changes:  0
Decision Tree all changes:  0
Decision Tree changes:  0
Naive Bayes all changes:  3
Naive Bayes changes:  3

relative_no_outlie

In [5]:
# Flatten results[dataset][classifier] into one row per (dataset, classifier).
flat_scores = {
    (dataset, clf_name): scores
    for dataset, per_classifier in results.items()
    for clf_name, scores in per_classifier.items()
}
res = pd.DataFrame.from_dict(flat_scores, orient='index')

# Keep only the rows whose (dataset, classifier) label contains 'Naive Bayes'.
nb_df = [row for _, row in res.iterrows() if 'Naive Bayes' in row.name]

In [6]:
# Render the Naive Bayes rows as a LaTeX table and print it.
nb_table = pd.DataFrame(nb_df)
print(nb_table.to_latex())

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  all changes &  all\_prior &   all\_fix &  all\_precision &  all\_recall &  important changes &  important\_prior &  important\_fix &  important precision &  important recall \\
\midrule
(all\_metrics, Naive Bayes)                 &          1.0 &   0.233333 &  0.774194 &       0.500000 &    0.233333 &                1.0 &         0.100000 &       0.903226 &                  0.5 &          0.100000 \\
(all\_no\_outliers, Naive Bayes)             &          0.0 &   0.233333 &  0.774194 &       0.500000 &    0.233333 &                3.0 &         0.266667 &       0.741935 &                  0.5 &          0.266667 \\
(absolute, Naive Bayes)                    &          7.0 &   0.266667 &  0.774194 &       0.533333 &    0.266667 &                0.0 &         0.066667 &       0.903226 &                  0.4 &          0.066667 \\
(absolute\_no\_outliers, Naive Bayes)        &          1.0 &   0.333333 &  0.677419 &       0.500000 &    0.333333 