In [13]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from xgboost import XGBClassifier

In [2]:
# Read the metric table and the outlier-annotated results from disk.
data = pd.read_csv('results/output.csv')
outlier_data = pd.read_csv('results/results_with_outliers.csv')

# Keep the file names aside before the column is removed from `data`.
# Note: this Series is taken before dropna(), so it still contains the rows
# that are dropped from `data` below.
filenames = data['file']

# Metric columns that are excluded from the analysis.
to_drop = ['apls_count', 'niu_count', 'npl_count', 'npnn_count',
           'npnn_count_relative', 'npun_count', 'npun_count_relative',
           'npvr_count', 'nrgv_count', 'nrl_count', 'nrvr_count',
           'nvr_count', 'nscm_count', 'nmo_count', 'nimpp_count']

# Remove the file name and the excluded metrics, then discard rows with any NaN.
data = data.drop(columns=['file', *to_drop]).dropna()

print("Data shape after dropping na columns and rows: ", data.shape)
data.head()

Data shape after dropping na columns and rows:  (9285, 50)


Unnamed: 0,atss_count,bloc_count,bloc_count_relative,cloc_count,cloc_count_relative,dpt_count,etp_count,loc_count,nbeh_count,nbeh_count_relative,...,nsh_count,ntkn_count,ntnn_count,ntnn_count_relative,nts_count,ntun_count,ntun_count_relative,ntvr_count,nun_count,nun_count_relative
0,3.0,0,0.0,0,0.0,2,-5.81,25,0.0,0.0,...,5.0,113,1.0,0.11,9.0,8.0,0.89,5.0,8.0,1.0
1,10.0,0,0.0,0,0.0,2,-4.01,10,0.0,0.0,...,0.0,31,0.0,0.0,1.0,1.0,1.0,5.0,2.0,1.0
2,9.0,0,0.0,2,0.02,5,-5.57,132,0.0,0.0,...,0.0,373,0.0,0.0,15.0,7.0,0.47,8.0,7.0,0.29
3,6.0,0,0.0,0,0.0,2,-5.34,37,0.0,0.0,...,1.0,133,0.0,0.0,6.0,6.0,1.0,4.0,6.0,1.0
4,13.0,0,0.0,0,0.0,3,-4.3,13,0.0,0.0,...,0.0,34,1.0,1.0,1.0,0.0,0.0,2.0,2.0,1.0


In [11]:
# Build the four feature-set variants that the classifiers are trained on.
# Every variant is derived from `outlier_data` with the bookkeeping columns removed.
non_feature_cols = ['outlier_scores', 'file']

# Variant 1: every metric column.
all_metrics = outlier_data.drop(columns=non_feature_cols)

# Variant 2: only absolute counts, i.e. without the "*_relative" columns.
# `data` must still be the frame produced by the load cell when this runs.
relative_cols = [col for col in data.columns if 'relative' in col]
absolute = outlier_data.drop(columns=relative_cols).drop(columns=non_feature_cols)

# Variant 3: relative metrics, i.e. without the absolute counterpart of each
# "*_relative" column.
# The counterpart name is obtained by cutting the exact "_relative" suffix.
# str.strip('relative') must not be used here: it removes any of the characters
# r/e/l/a/t/i/v from BOTH ends, which corrupts names such as 'loc_count_relative'
# or 'etp_count_relative'.
relative_suffix = '_relative'
absolute_cols = [col[:-len(relative_suffix)]
                 for col in relative_cols
                 if col.endswith(relative_suffix)]
relative = outlier_data.drop(columns=absolute_cols).drop(columns=non_feature_cols)

# Variant 4: all metrics except a hand-picked list of columns
# (presumably the strongly correlated ones, judging by the name -- TODO confirm).
no_corr_cols = ['ncd_count', 'nkeys_count', 'nnnv_count', 'ntkn_count', 'nun_count', 'nco_count',
               'nemd_count', 'ntun_count', 'ntnn_count_relative', 'nts_count', 'ntvr_count']
no_corr_metrics = outlier_data.drop(columns=no_corr_cols).drop(columns=non_feature_cols)

In [18]:
# Train every classifier on every feature-set variant and score it on the
# combined prior/fix test set, once with the hand-picked "important" metrics
# and once with all feature columns of the variant.

# Feature-set variants, keyed by the name used in the results table.
datasets = {'all_metrics': all_metrics, 
            'absolute': absolute, 
            'relative': relative, 
            'no_corr_metrics': no_corr_metrics}

classifiers = {'RandomForestClassifier': RandomForestClassifier(n_estimators=100),
               #'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100),
               #'XGBClassifier': XGBClassifier(),
               'Decision Tree': DecisionTreeClassifier(),
               'Naive Bayes': GaussianNB(),
               'MLP' :MLPClassifier(solver='lbfgs', alpha=1e-5, 
                                    hidden_layer_sizes=(5, 2), random_state=1)
              }

# Hand-picked metric subset for each feature-set variant.
important_metrics = {'all_metrics': ['ntkn_count', 'loc_count', 'nkeys_count','ncd_count','etp_count',
                                    'nnnv_count','nun_count','nts_count','ntun_count','nlp_count'],
                    'absolute': ['ntkn_count', 'loc_count', 'nkeys_count','ncd_count','etp_count',
                                    'nnnv_count','nun_count','nts_count','ntun_count','atss_count'],
                    'relative': ['ntkn_count','loc_count','nkeys_count','ncd_count','nts_count',
                                 'etp_count','ntvr_count','atss_count','nmd_count','nlp_count'],
                     'no_corr_metrics': ['loc_count','atss_count','etp_count','nmd_count','nfl_count',
                                         'nemd_count_relative','nlo_count','ntnn_count','nlp_count','nun_count_relative']
                      }

# Test data: metrics measured before ("prior") and after ("fix") a change,
# with rows containing NaN removed; the label is read from column 'y'.
test_prior = pd.read_csv('results/metrics_prior.csv')
test_fix = pd.read_csv('results/metrics_fix.csv')
test_prior = test_prior.dropna()
test_fix = test_fix.dropna()
test = pd.concat([test_prior, test_fix])

# Pair prior/fix rows whose file names share the first two "__"-separated parts.
# Each entry is [position in test_prior, position of first match in test_fix].
# Not used by the scoring below; kept because it is a notebook-level name.
names_prior = [f.split("__")[:2] for f in test_prior['file']]
names_fix = [f.split("__")[:2] for f in test_fix['file']]
mapper = []
for index, names in enumerate(names_prior):
    if names in names_fix:
        mapper.append([index, names_fix.index(names)])


# (result-key suffix, printed label, scoring function), in reporting order.
_SCORERS = (('accuracy', 'Accuracy', accuracy_score),
            ('precision', 'Precision', precision_score),
            ('recall', 'Recall', recall_score),
            ('fscore', 'fscore', f1_score))


def _score_classifier(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training data and score its test predictions.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : test features and labels

    Returns
    -------
    dict mapping 'accuracy', 'precision', 'recall' and 'fscore' to the value
    returned by the corresponding sklearn.metrics function.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    return {key: scorer(y_test, pred) for key, _, scorer in _SCORERS}


results = {}

np.random.seed(0)

y_test = test['y']

# The loop variable is deliberately NOT called `data`: that name is the frame
# built in the load cell and is read again by the feature-set cell.
for df_name, dataset in datasets.items():
    print(df_name)
    metrics = important_metrics[df_name][:10]
    # Select features by name instead of relying on 'outlier' being the last column.
    all_features = [col for col in dataset.columns if col != 'outlier']
    y_train = np.array(dataset['outlier'])
    results[df_name] = {}

    for clf_name, clf in classifiers.items():
        clf_results = results[df_name][clf_name] = {}

        # "important" subset first, then all features: the order matters because
        # the unseeded estimators draw from the global NumPy RNG seeded above.
        for key_prefix, label, features in (('important', 'Imp', metrics),
                                            ('all', 'all', all_features)):
            # Plain arrays for both fit and predict keeps the inputs consistent.
            scores = _score_classifier(clf,
                                       np.array(dataset[features]), y_train,
                                       np.array(test[features]), y_test)

            for key, _, _ in _SCORERS:
                clf_results[key_prefix + '_' + key] = round(scores[key], 2)

            for key, display_name, _ in _SCORERS:
                print(clf_name, label + " " + display_name + ": ", scores[key])

    print()

all_metrics
RandomForestClassifier Imp Accuracy:  0.5081967213114754
RandomForestClassifier Imp Precision:  0.5
RandomForestClassifier Imp Recall:  0.1
RandomForestClassifier Imp fscore:  0.16666666666666669
RandomForestClassifier all Accuracy:  0.5245901639344263
RandomForestClassifier all Precision:  0.5714285714285714
RandomForestClassifier all Recall:  0.13333333333333333
RandomForestClassifier all fscore:  0.21621621621621623
Decision Tree Imp Accuracy:  0.4918032786885246
Decision Tree Imp Precision:  0.4444444444444444
Decision Tree Imp Recall:  0.13333333333333333
Decision Tree Imp fscore:  0.20512820512820512
Decision Tree all Accuracy:  0.5081967213114754
Decision Tree all Precision:  0.5
Decision Tree all Recall:  0.2
Decision Tree all fscore:  0.28571428571428575
Naive Bayes Imp Accuracy:  0.4918032786885246
Naive Bayes Imp Precision:  0.4444444444444444
Naive Bayes Imp Recall:  0.13333333333333333
Naive Bayes Imp fscore:  0.20512820512820512
Naive Bayes all Accuracy:  0.49

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RandomForestClassifier Imp Accuracy:  0.5081967213114754
RandomForestClassifier Imp Precision:  0.5
RandomForestClassifier Imp Recall:  0.1
RandomForestClassifier Imp fscore:  0.16666666666666669
RandomForestClassifier all Accuracy:  0.4918032786885246
RandomForestClassifier all Precision:  0.4
RandomForestClassifier all Recall:  0.06666666666666667
RandomForestClassifier all fscore:  0.1142857142857143
Decision Tree Imp Accuracy:  0.45901639344262296
Decision Tree Imp Precision:  0.36363636363636365
Decision Tree Imp Recall:  0.13333333333333333
Decision Tree Imp fscore:  0.1951219512195122
Decision Tree all Accuracy:  0.5081967213114754
Decision Tree all Precision:  0.5
Decision Tree all Recall:  0.1
Decision Tree all fscore:  0.16666666666666669
Naive Bayes Imp Accuracy:  0.5081967213114754
Naive Bayes Imp Precision:  0.5
Naive Bayes Imp Recall:  0.2
Naive Bayes Imp fscore:  0.28571428571428575
Naive Bayes all Accuracy:  0.5081967213114754
Naive Bayes all Precision:  0.5
Naive Bayes

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


RandomForestClassifier Imp Accuracy:  0.5081967213114754
RandomForestClassifier Imp Precision:  0.5
RandomForestClassifier Imp Recall:  0.06666666666666667
RandomForestClassifier Imp fscore:  0.11764705882352941
RandomForestClassifier all Accuracy:  0.5245901639344263
RandomForestClassifier all Precision:  0.6
RandomForestClassifier all Recall:  0.1
RandomForestClassifier all fscore:  0.17142857142857143
Decision Tree Imp Accuracy:  0.47540983606557374
Decision Tree Imp Precision:  0.4
Decision Tree Imp Recall:  0.13333333333333333
Decision Tree Imp fscore:  0.2
Decision Tree all Accuracy:  0.5245901639344263
Decision Tree all Precision:  0.6
Decision Tree all Recall:  0.1
Decision Tree all fscore:  0.17142857142857143
Naive Bayes Imp Accuracy:  0.5081967213114754
Naive Bayes Imp Precision:  0.5
Naive Bayes Imp Recall:  0.3333333333333333
Naive Bayes Imp fscore:  0.4
Naive Bayes all Accuracy:  0.47540983606557374
Naive Bayes all Precision:  0.4722222222222222
Naive Bayes all Recall:  0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [19]:
#cols = ['all_prior', 'important_prior', 'all_fix','important_fix', 'all_recall', 'important recall', 'all_precision', 'important precision']

# Flatten the nested results dict into one row per (dataset, classifier) pair.
flat_results = {(df_name, clf_name): scores
                for df_name, per_classifier in results.items()
                for clf_name, scores in per_classifier.items()}
res = pd.DataFrame.from_dict(flat_results, orient='index')

# Collect the rows whose (dataset, classifier) index tuple contains 'Naive Bayes'.
nb_df = [row for _, row in res.iterrows() if 'Naive Bayes' in row.name]

In [22]:
# `res` is already a DataFrame, so it can be rendered to LaTeX directly.
latex_table = res.to_latex()
print(latex_table)

\begin{tabular}{llrrrrrrrr}
\toprule
                &     &  important\_accuracy &  important\_precision &  important\_recall &  important\_fscore &  all\_accuracy &  all\_precision &  all\_recall &  all\_fscore \\
\midrule
all\_metrics & RandomForestClassifier &                0.51 &                 0.50 &              0.10 &              0.17 &          0.52 &           0.57 &        0.13 &        0.22 \\
                & Decision Tree &                0.49 &                 0.44 &              0.13 &              0.21 &          0.51 &           0.50 &        0.20 &        0.29 \\
                & Naive Bayes &                0.49 &                 0.44 &              0.13 &              0.21 &          0.49 &           0.48 &        0.43 &        0.46 \\
                & MLP &                0.49 &                 0.49 &              1.00 &              0.66 &          0.49 &           0.49 &        1.00 &        0.66 \\
absolute & RandomForestClassifier &                0.49 &

In [26]:
# Train every classifier on every feature-set variant and score it separately
# on the "prior" and "fix" test sets, once with the hand-picked "important"
# metrics and once with all feature columns of the variant.

# Feature-set variants, keyed by the name used in the results table.
datasets = {'all_metrics': all_metrics, 
            'absolute': absolute, 
            'relative': relative, 
            'no_corr_metrics': no_corr_metrics}

classifiers = {'RandomForestClassifier': RandomForestClassifier(n_estimators=100),
               #'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=100),
               #'XGBClassifier': XGBClassifier(),
               'Decision Tree': DecisionTreeClassifier(),
               'Naive Bayes': GaussianNB(),
               'MLP' :MLPClassifier(solver='lbfgs', alpha=1e-5, 
                                    hidden_layer_sizes=(5, 2), random_state=1)
              }

# Hand-picked metric subset for each feature-set variant.
important_metrics = {'all_metrics': ['ntkn_count', 'loc_count', 'nkeys_count','ncd_count','etp_count',
                                    'nnnv_count','nun_count','nts_count','ntun_count','nlp_count'],
                    'absolute': ['ntkn_count', 'loc_count', 'nkeys_count','ncd_count','etp_count',
                                    'nnnv_count','nun_count','nts_count','ntun_count','atss_count'],
                    'relative': ['ntkn_count','loc_count','nkeys_count','ncd_count','nts_count',
                                 'etp_count','ntvr_count','atss_count','nmd_count','nlp_count'],
                     'no_corr_metrics': ['loc_count','atss_count','etp_count','nmd_count','nfl_count',
                                         'nemd_count_relative','nlo_count','ntnn_count','nlp_count','nun_count_relative']
                      }

# Test data: metrics measured before ("prior") and after ("fix") a change,
# with rows containing NaN removed; the label is read from column 'y'.
test_prior = pd.read_csv('results/metrics_prior.csv')
test_fix = pd.read_csv('results/metrics_fix.csv')
test_prior = test_prior.dropna()
test_fix = test_fix.dropna()
test = pd.concat([test_prior, test_fix])

# Pair prior/fix rows whose file names share the first two "__"-separated parts.
# Each entry is [position in test_prior, position of first match in test_fix];
# positions refer to the row order after dropna(), which is also the order of
# the prediction arrays.
names_prior = [f.split("__")[:2] for f in test_prior['file']]
names_fix = [f.split("__")[:2] for f in test_fix['file']]
mapper = []
for index, names in enumerate(names_prior):
    if names in names_fix:
        mapper.append([index, names_fix.index(names)])


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def _score_prior_fix(clf, X_train, y_train, X_prior, y_prior, X_fix, y_fix, pairs):
    """Fit ``clf`` and score its predictions on the prior and fix test sets.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_prior, y_prior : features and labels of the "prior" test rows
    X_fix, y_fix : features and labels of the "fix" test rows
    pairs : iterable of [prior position, fix position] for matching files

    Returns
    -------
    dict with keys
        'changes'   -- number of pairs predicted 1 on prior and 0 on fix
        'prior'     -- accuracy on the prior set
        'fix'       -- accuracy on the fix set
        'precision' -- positives on prior / positives on prior and fix
        'recall'    -- positives on prior / (positives + zeros on prior)

    The precision/recall formulas only match the usual definitions if every
    prior row is labelled 1 and every fix row is labelled 0 -- TODO confirm
    against the contents of the 'y' columns.
    """
    clf.fit(X_train, y_train)
    predict_prior = clf.predict(X_prior)
    predict_fix = clf.predict(X_fix)

    flagged_prior = predict_prior.sum()
    flagged_fix = predict_fix.sum()
    unflagged_prior = (predict_prior == 0).sum()

    changes = sum(1 for i, j in pairs
                  if predict_prior[i] == 1 and predict_fix[j] == 0)

    return {'changes': changes,
            'prior': accuracy_score(y_prior, predict_prior),
            'fix': accuracy_score(y_fix, predict_fix),
            # Guarded so that "no positive prediction at all" yields NaN
            # without a NumPy division warning.
            'precision': _safe_ratio(flagged_prior, flagged_prior + flagged_fix),
            'recall': _safe_ratio(flagged_prior, flagged_prior + unflagged_prior)}


# For each feature selection: the printed label and, in insertion order, the
# (score name, results key) pairs. Keys are kept exactly as downstream code
# expects them, including the space/underscore differences.
_REPORTS = {
    'important': ("changes: ", (('changes', 'important changes'),
                                ('prior', 'important_prior'),
                                ('fix', 'important_fix'),
                                ('precision', 'important precision'),
                                ('recall', 'important recall'))),
    'all': ("all changes: ", (('changes', 'all changes'),
                              ('prior', 'all_prior'),
                              ('fix', 'all_fix'),
                              ('precision', 'all_precision'),
                              ('recall', 'all_recall'))),
}

results = {}

np.random.seed(0)

y_test_prior = test_prior['y']
y_test_fix = test_fix['y']

# The loop variable is deliberately NOT called `data`: that name is the frame
# built in the load cell and is read again by the feature-set cell.
for df_name, dataset in datasets.items():
    print(df_name)
    metrics = important_metrics[df_name][:10]
    # Select features by name instead of relying on 'outlier' being the last column.
    all_features = [col for col in dataset.columns if col != 'outlier']
    y_train = np.array(dataset['outlier'])
    results[df_name] = {}

    for clf_name, clf in classifiers.items():
        clf_results = results[df_name][clf_name] = {}

        # "important" subset first, then all features: the order matters because
        # the unseeded estimators draw from the global NumPy RNG seeded above.
        for selection, features in (('important', metrics), ('all', all_features)):
            label, key_map = _REPORTS[selection]
            # Plain arrays for both fit and predict keeps the inputs consistent.
            scores = _score_prior_fix(clf,
                                      np.array(dataset[features]), y_train,
                                      np.array(test_prior[features]), y_test_prior,
                                      np.array(test_fix[features]), y_test_fix,
                                      mapper)

            for score_name, result_key in key_map:
                clf_results[result_key] = scores[score_name]

            print(clf_name, label, scores['changes'])

    print()

all_metrics
RandomForestClassifier changes:  1
RandomForestClassifier all changes:  2
Decision Tree changes:  2
Decision Tree all changes:  1
Naive Bayes changes:  1
Naive Bayes all changes:  2
MLP changes:  0
MLP all changes:  0

absolute
RandomForestClassifier changes:  0
RandomForestClassifier all changes:  2
Decision Tree changes:  2
Decision Tree all changes:  2
Naive Bayes changes:  1
Naive Bayes all changes:  2
MLP changes:  0
MLP all changes:  0

relative




RandomForestClassifier changes:  1
RandomForestClassifier all changes:  0
Decision Tree changes:  2
Decision Tree all changes:  1
Naive Bayes changes:  3
Naive Bayes all changes:  2
MLP changes:  0
MLP all changes:  0

no_corr_metrics




RandomForestClassifier changes:  0
RandomForestClassifier all changes:  1
Decision Tree changes:  0
Decision Tree all changes:  1
Naive Bayes changes:  3
Naive Bayes all changes:  2
MLP changes:  0
MLP all changes:  0



