In [1]:
import csv
import numpy as np

with open('HTRU_2.csv', 'r') as csvFile:
    reader = csv.reader(csvFile)
    is_first = True
    data_list = []
    for row in reader:
        if is_first:
            feature_name = row
            is_first = False
            continue
        data_list.append(row)
    data = np.array(data_list, dtype=float)
    X = data[:, :8]
    y = data[:, 8]

In [2]:
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

kf = KFold(n_splits=5, shuffle=True)

print("Linear SVM:")
for c in [0.1, 1, 10]:
    svm_score = []
    svm_precision = []
    svm_recall = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = SVC(kernel='linear', C=c)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        svm_score.append(clf.score(X_test, y_test))
        svm_precision.append(precision_score(y_test, y_pred))
        svm_recall.append(recall_score(y_test, y_pred))
    
    print("######")
    print("Regularization parameter C == " + str(c))
    print("Accuracy: ")
    print("Mean: " + str(np.mean(svm_score)))
    print("Std: " + str(np.std(svm_score)))
    print("Precision: ")
    print("Mean: " + str(np.mean(svm_precision)))
    print("Std: " + str(np.std(svm_precision)))
    print("Recall: ")
    print("Mean: " + str(np.mean(svm_recall)))
    print("Std: " + str(np.std(svm_recall)))

Linear SVM:
######
Regularization parameter C == 0.1
Accuracy: 
Mean: 0.9782655808791507
Std: 0.001441988571828264
Precision: 
Mean: 0.9466143079144448
Std: 0.02035684499400951
Recall: 
Mean: 0.808839269934458
Std: 0.02202655990033721
######
Regularization parameter C == 1
Accuracy: 
Mean: 0.9791596541588815
Std: 0.002841206749617268
Precision: 
Mean: 0.9408829251925482
Std: 0.008472785043512157
Recall: 
Mean: 0.8251695702647084
Std: 0.014419102865686609
######
Regularization parameter C == 10
Accuracy: 
Mean: 0.9793272207055121
Std: 0.001186521753330317
Precision: 
Mean: 0.9397131023228467
Std: 0.008627314443040459
Recall: 
Mean: 0.8273857363468036
Std: 0.00842260163703791


In [3]:
kf = KFold(n_splits=5, shuffle=True)

print("Decision Trees: ")
for d in [3, 4, 6]:
    dt_score = []
    dt_precision = []
    dt_recall = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Decision Tree: max_depth = 3
        clf = tree.DecisionTreeClassifier(max_depth=d)
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        dt_score.append(clf.score(X_test, y_test))
        dt_precision.append(precision_score(y_test, y_pred))
        dt_recall.append(recall_score(y_test, y_pred))
    
    print("######")
    print("Max Depth == " + str(d))
    print("Accuracy: ")
    print("Mean: " + str(np.mean(dt_score)))
    print("Std: " + str(np.std(dt_score)))
    print("Precision: ")
    print("Mean: " + str(np.mean(dt_precision)))
    print("Std: " + str(np.std(dt_precision)))
    print("Recall: ")
    print("Mean: " + str(np.mean(dt_recall)))
    print("Std: " + str(np.std(dt_recall)))

Decision Trees: 
######
Max Depth == 3
Accuracy: 
Mean: 0.9781539426917728
Std: 0.0021741933196146625
Precision: 
Mean: 0.910335303603518
Std: 0.013242800427548135
Recall: 
Mean: 0.845700801245747
Std: 0.02661256239285133
######
Max Depth == 4
Accuracy: 
Mean: 0.9792156917836978
Std: 0.004836350744421382
Precision: 
Mean: 0.9175668128289939
Std: 0.03755851954890161
Recall: 
Mean: 0.8467586295395513
Std: 0.03995045620966137
######
Max Depth == 6
Accuracy: 
Mean: 0.9782656901447144
Std: 0.0019869627381800567
Precision: 
Mean: 0.9146576946747741
Std: 0.013569262108133296
Recall: 
Mean: 0.8414511910434618
Std: 0.011558184734534493


In [4]:
kf = KFold(n_splits=5, shuffle=True)

print("Random Forests: ")
for e in [5, 11, 13]:
    rf_score = []
    rf_precision = []
    rf_recall = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Random Forests: number of trees = 5, max_depth = 5
        clf = RandomForestClassifier(n_estimators=e, max_depth=5)
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        rf_score.append(clf.score(X_test, y_test))
        rf_precision.append(precision_score(y_test, y_pred))
        rf_recall.append(recall_score(y_test, y_pred))
   
    print("######")
    print("Number of estimators == " + str(e))
    print("Accuracy: ")
    print("Mean: " + str(np.mean(rf_score)))
    print("Std: " + str(np.std(rf_score)))
    print("Precision: ")
    print("Mean: " + str(np.mean(rf_precision)))
    print("Std: " + str(np.std(rf_precision)))
    print("Recall: ")
    print("Mean: " + str(np.mean(rf_recall)))
    print("Std: " + str(np.std(rf_recall)))

Random Forests: 
######
Number of estimators == 5
Accuracy: 
Mean: 0.9781539426917728
Std: 0.0021453892830976366
Precision: 
Mean: 0.938018415594159
Std: 0.01154879033171521
Recall: 
Mean: 0.8148007670179632
Std: 0.016455382083391236
######
Number of estimators == 11
Accuracy: 
Mean: 0.9780981548168164
Std: 0.0017177755111102576
Precision: 
Mean: 0.9334944884287868
Std: 0.005054541978561878
Recall: 
Mean: 0.8190654927744727
Std: 0.02480314690447081
######
Number of estimators == 13
Accuracy: 
Mean: 0.9786567203784958
Std: 0.002176347294617576
Precision: 
Mean: 0.9364667634025912
Std: 0.014526748878917233
Recall: 
Mean: 0.8231358086756726
Std: 0.007845047025502701


In [5]:
from sklearn import tree

kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Decision Tree: max_depth = 3
    clf = tree.DecisionTreeClassifier(max_depth=3)
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    Y_Classes = ['0','1']
    tree.export_graphviz(clf, out_file = 'htru_tree.dot', feature_names = feature_name[:8], class_names = Y_Classes, filled=True, rounded=True)