In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import KFold

from sklearn.model_selection import KFold
from sklearn import metrics

In [3]:
data = pd.read_csv(r"data_no_corr_85.csv")

data_defects = data.loc[data['defects'] == True]

data_no_defects = data.loc[data['defects'] == False]
data_no_defects = data_no_defects.sample(n=len(data_defects))

print(len(data_no_defects))
print(len(data_defects))

balanced_data = data_defects.append(data_no_defects)
balanced_data = balanced_data.sample(frac = 1).reset_index(drop = True)
#balanced_data

2665
2665


In [4]:
features = list(balanced_data.columns.values)
features.remove('defects')

In [5]:
train = balanced_data.sample(frac = 0.8, random_state = 1)
test = balanced_data.loc[~balanced_data.index.isin(train.index)]

train_data_array=train.as_matrix(columns = features)
train_class_array= train['defects'].values
test_data_array=test.as_matrix(columns = features)
test_class_array= test['defects'].values

In [5]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()

kf=KFold(n_splits=10, random_state=None, shuffle=False)
validation_score=0
for train_index, test_index in kf.split(train_data_array):
    X_train, X_test = train_data_array[train_index], train_data_array[test_index]
    y_train, y_test = train_class_array[train_index], train_class_array[test_index]
    clf.fit(X_train, y_train)
    validation_pred=clf.predict(X_test)
    validation_score+=metrics.accuracy_score(y_test,validation_pred)

print("Validation Accuracy:   %0.3f" % (validation_score/kf.get_n_splits()))

pred = clf.predict(test_data_array)
score = metrics.accuracy_score(test_class_array, pred)
print("accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

Validation Accuracy:   0.532
accuracy:   0.522
Precision:   0.611
Recall:   0.063
F-measure:   0.115


In [9]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)

clf.fit(train_data_array, train_class_array)

pred = clf.predict(test_data_array)
score = metrics.accuracy_score(test_class_array, pred)
print("accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

accuracy:   0.694
Precision:   0.682
Recall:   0.703
F-measure:   0.692


In [10]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

Feature ranking:
1. lineCount (0.142700)
2. intelligence (0.063674)
3. hlineCount (0.062190)
4. volume (0.056934)
5. operatorsOperands (0.054312)
6. timeEstimator (0.052604)
7. effort (0.051243)
8. totalOperators (0.050544)
9. difficulty (0.049714)
10. hlineBlanks (0.048828)
11. totalOperands (0.045744)
12. uniqueOperands (0.043966)
13. designComplexity (0.042470)
14. branchCount (0.040173)
15. cyclomaticComplexity (0.039375)
16. uniqueOperators (0.034656)
17. h (0.028061)
18. hlineComments (0.026942)
19. essentialComplexity (0.026260)
20. programLength (0.025903)
21. hCodeAndComment (0.013707)


In [7]:
from sklearn import svm

clf = svm.SVC()

clf.fit(train_data_array, train_class_array)

pred = clf.predict(test_data_array)
score = metrics.accuracy_score(test_class_array, pred)
print("accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

accuracy:   0.611
Precision:   0.574
Recall:   0.915
F-measure:   0.706


## Hyperparameter Tuning

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5,
                       scoring='f1_macro')
clf.fit(train_data_array, train_class_array)