# Import Basics

In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import KFold

# Prepare Data

In [2]:
data = pd.read_csv(r"pc1.csv")
data = data.sample(frac = 1).reset_index(drop = True)

train = data.sample(frac = 0.7, random_state = 1)
test = data.loc[~data.index.isin(train.index)]

data_columns = ['lineCount', 'cyclomaticComplexity', 'essentialComplexity', 'designComplexity', 'operatorsOperands', 'volume', 
                'programLength', 'difficulty', 'intelligence', 'effort', 'h', 'timeEstimator', 'hlineCount', 'hlineComments', 
                'hlineBlanks', 'hCodeAndComment', 'uniqueOperators', 'uniqueOperands', 'totalOperators', 'totalOperands', 
                'branchCount']

train_data_array = train.as_matrix(columns = data_columns)
train_class_array = train['defects'].values

test_data_array = test.as_matrix(columns = data_columns)
test_class_array = test['defects'].values

# Classify Away!!

## Multinomial Naive Bayes

In [3]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

kf = KFold(n_splits = 10, random_state = None, shuffle = False)
validation_score = 0
for train_index, test_index in kf.split(train_data_array):
    X_train, X_test = train_data_array[train_index], train_data_array[test_index]
    y_train, y_test = train_class_array[train_index], train_class_array[test_index]
    mnb.fit(X_train, y_train)
    validation_pred = mnb.predict(X_test)
    validation_score += metrics.accuracy_score(y_test, validation_pred)

print("Validation Accuracy:   %0.3f" % (validation_score/kf.get_n_splits()))

pred = mnb.predict(test_data_array)

score = metrics.accuracy_score(test_class_array, pred)
print("Accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

Validation Accuracy:   0.907
Accuracy:   0.919
Precision:   0.000
Recall:   0.000
F-measure:   0.000


## Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100)

rf.fit(train_data_array, train_class_array)

pred = rf.predict(test_data_array)

score = metrics.accuracy_score(test_class_array, pred)
print("Accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

Accuracy:   0.949
Precision:   0.636
Recall:   0.350
F-measure:   0.452


## Support Vector Machine

In [5]:
from sklearn import svm
sv = svm.SVC()

sv.fit(train_data_array, train_class_array)

pred = sv.predict(test_data_array)

score = metrics.accuracy_score(test_class_array, pred)
print("Accuracy:   %0.3f" % score)

score = metrics.precision_score(test_class_array, pred)
print("Precision:   %0.3f" % score)

score = metrics.recall_score(test_class_array, pred)
print("Recall:   %0.3f" % score)

score = metrics.f1_score(test_class_array, pred)
print("F-measure:   %0.3f" % score)

Accuracy:   0.949
Precision:   0.800
Recall:   0.200
F-measure:   0.320
