In [50]:
%matplotlib inline
from load_data import load_abalone, load_heart_disease
from datetime import datetime
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt


seed = 1
np.random.seed(seed)
test_ratios = np.arange(10) / 20 + 0.05 # from 0.05 to .5 
test_ratio = .2

x, y = load_heart_disease()
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=test_ratio, random_state=seed)
split_presets = [train_test_split(x, y, test_size=ratio, random_state=seed) for ratio in test_ratios]
# Other visualization notes: confusion matrix
# 60 20 20 best split
# Consider oversampling/undersampling to balance bias: https://www.datascience.com/blog/imbalanced-data
def r_acc(y_te, y_pred):
    return round(accuracy_score(y_te, y_pred), 3)

In [3]:
# Hyperparameters
validation_splits = 3
kf = KFold(n_splits=validation_splits)
print("Train KFold k={}".format(validation_splits))

Train KFold k=3


In [79]:
# Decision Tree: https://scikit-learn.org/stable/modules/tree.html
# Pruning: https://stackoverflow.com/questions/49428469/pruning-decision-trees
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=seed, max_depth=7)
accuracy = cross_val_score(clf, x_tr, y_tr, scoring='accuracy', cv=kf)
print(accuracy.mean()) # Easy graph over number of classes here
time_start = datetime.now()
clf = clf.fit(x_tr, y_tr)
print("Decision tree trained in {}".format(datetime.now() - time_start))
y_pred = clf.predict(x_te)
print(accuracy_score(y_te, y_pred))

0.7509259259259259
Decision tree trained in 0:00:00.001575
0.6557377049180327


In [8]:
# TODO: vary DT over hyperparams: train_set size (dubious), pruning hyperparams, GINI/info gain,
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier


In [10]:
import graphviz 
dot_data = tree.export_graphviz(clf, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("abalone.pdf") 
graph.view()

AttributeError: 'str' object has no attribute 'render'

In [15]:
# Neural Net: https://scikit-learn.org/stable/modules/neural_networks_supervised.html
# Sample sanity run
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-3,
                     hidden_layer_sizes=(10, 10), random_state=seed)
accuracy = cross_val_score(clf, x_tr, y_tr, scoring='accuracy', cv=kf)
print(accuracy.mean()) # Easy graph over number of classes here
time_start = datetime.now()
clf = clf.fit(x_tr, y_tr)
print("Neural net trained in {}".format(datetime.now() - time_start))
y_pred = clf.predict(x_te)
print(accuracy_score(y_te, y_pred))

0.7635288065843623
Neural net trained in 0:00:00.057156
0.8360655737704918


In [42]:
# Neural net hyperparams
from sklearn.utils.extmath import cartesian
net_hyperparams = {"activation": ['identity','relu', 'logistic', 'tanh'],
               "epochs": np.arange(200) + 1, # iterations - when testing iterations set tol to 1e-8 or something
               "arch1": [()] + [(i) for i in (np.arange(100) + 1)],
               "arch2": cartesian((np.arange(1, 21), np.arange(1, 21))), # 2d plane search - reshape 20x20x2 for graph
               "arch3": [[10] * i for i in range(3, 10)], # Deeper architectures with 10 per layer
               "alpha": (np.arange(100) + 1) * 1e-5} # 1e-5 -> 1e-3
# Base params: relu, max_iter: 200 (==> convergence), lbfgs (good for smaller datasets), standard alpha
net_results = {}

In [53]:
net_results["tr_size"] = []
clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10, 10), random_state=seed)
print()
for splits in split_presets:
    split_x_tr, split_x_te, split_y_tr, split_y_te = splits
    clf = clf.fit(split_x_tr, split_y_tr)
    y_pred = clf.predict(split_x_te)
    net_results["tr_size"].append(r_acc(split_y_te, y_pred))
print(net_results["tr_size"])


[0.5, 0.548, 0.739, 0.787, 0.763, 0.802, 0.783, 0.785, 0.772, 0.781]


In [None]:
net_results["activation"] = []
for act in net_hyperparams["activation"]:
    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10, 10), activation=act, random_state=seed)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    net_results["activation"].append(r_acc(y_te, y_pred))
print(net_results["activation"])

In [27]:
results["epochs"] = [] # also include time
results["epochs_time"] = []
for ep in net_hyperparams["epochs"]:
    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10, 10), max_iter=ep, tol=1e-10, random_state=seed)
    time_start = datetime.now()
    clf = clf.fit(x_tr, y_tr)
    time_elapsed = (datetime.now() - time_start).microseconds
    y_pred = clf.predict(x_te)    
    net_results["epochs"].append(r_acc(y_te, y_pred))
    net_results["epochs_time"].append(round(time_elapsed, 3))
net_results["epochs_time"] = np.asarray(net_results["epochs_time"]) * 1e-3 # Microseconds -> ms
print(net_results["epochs"])
print(net_results["epochs_time"])

{'epochs': [0.59, 0.623, 0.754, 0.77, 0.77, 0.787, 0.82, 0.82, 0.82, 0.803, 0.803, 0.787, 0.77, 0.754, 0.787, 0.787, 0.787, 0.787, 0.787, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.82, 0.803, 0.803, 0.803, 0.787, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.787, 0.77, 0.803, 0.787, 0.787, 0.787, 0.787, 0.77, 0.754, 0.754, 0.77, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.77, 0.77, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.787, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 0.803, 

In [31]:
net_results["alpha"] = []
for a in net_hyperparams["alpha"]:
    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(10, 10), alpha=a, random_state=seed)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    net_results["alpha"].append(r_acc(y_te, y_pred))
print(net_results["alpha"])

{'alpha': [0.7868852459016393, 0.7868852459016393, 0.7704918032786885, 0.8032786885245902, 0.8032786885245902, 0.8032786885245902, 0.8032786885245902, 0.8032786885245902, 0.7868852459016393, 0.7868852459016393, 0.8032786885245902, 0.8032786885245902, 0.819672131147541, 0.819672131147541, 0.7868852459016393, 0.819672131147541, 0.7868852459016393, 0.7868852459016393, 0.7868852459016393, 0.7704918032786885, 0.7868852459016393, 0.7868852459016393, 0.7868852459016393, 0.7868852459016393, 0.7868852459016393, 0.7868852459016393, 0.8032786885245902, 0.7868852459016393, 0.8032786885245902, 0.8032786885245902, 0.819672131147541, 0.8032786885245902, 0.8032786885245902, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.8360655737704918, 0.819672131147541, 0.8360655737704918, 0.819672131147541, 0.8360655737704918, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.819672131147541, 0.8360655737704918, 0.819672131147541, 0.8196

In [48]:
arches = ["arch1", "arch2", "arch3"]
for arch_name in arches:
    net_results[arch_name] = []
    for arch in net_hyperparams[arch_name]:
        clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=arch, random_state=seed)
        clf = clf.fit(x_tr, y_tr)
        y_pred = clf.predict(x_te)
        net_results[arch_name].append(r_acc(y_te, y_pred))
print(net_results["arch1"], net_results["arch2"], net_results["arch3"])

{'arch2': [0.82, 0.82, 0.836, 0.869, 0.803, 0.82, 0.836, 0.852, 0.82, 0.836, 0.836, 0.836, 0.852, 0.836, 0.803, 0.869, 0.82, 0.836, 0.82, 0.852, 0.787, 0.787, 0.82, 0.803, 0.803, 0.852, 0.82, 0.803, 0.836, 0.803, 0.82, 0.787, 0.787, 0.754, 0.787, 0.803, 0.82, 0.82, 0.787, 0.82, 0.59, 0.82, 0.82, 0.82, 0.82, 0.787, 0.803, 0.803, 0.738, 0.77, 0.738, 0.738, 0.754, 0.77, 0.803, 0.721, 0.82, 0.787, 0.738, 0.754, 0.721, 0.77, 0.803, 0.77, 0.787, 0.787, 0.77, 0.787, 0.787, 0.689, 0.754, 0.672, 0.705, 0.787, 0.689, 0.803, 0.738, 0.836, 0.738, 0.738, 0.59, 0.77, 0.738, 0.738, 0.754, 0.77, 0.721, 0.721, 0.82, 0.803, 0.836, 0.721, 0.787, 0.77, 0.738, 0.82, 0.82, 0.77, 0.754, 0.738, 0.59, 0.836, 0.754, 0.754, 0.787, 0.754, 0.721, 0.754, 0.705, 0.721, 0.705, 0.672, 0.836, 0.754, 0.754, 0.82, 0.803, 0.77, 0.738, 0.77, 0.787, 0.754, 0.672, 0.787, 0.82, 0.787, 0.705, 0.803, 0.672, 0.754, 0.77, 0.82, 0.738, 0.672, 0.787, 0.77, 0.787, 0.738, 0.787, 0.738, 0.803, 0.754, 0.738, 0.787, 0.754, 0.738, 0.836,

In [7]:
# Boosting (Adaboost): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=1, random_state=seed), random_state=seed)

accuracy = cross_val_score(clf, x_tr, y_tr, scoring='accuracy', cv=kf)
print(accuracy.mean()) # Easy graph over number of classes here
time_start = datetime.now()
clf = clf.fit(x_tr, y_tr)
print("AdaBoosted classifier trained in {}".format(datetime.now() - time_start))
y_pred = clf.predict(x_te)
y_pred = clf.predict(x_te)
print(accuracy_score(y_te, y_pred))


  from numpy.core.umath_tests import inner1d


0.8135288065843621
AdaBoosted classifier trained in 0:00:00.061199
0.6721311475409836


In [None]:
from sklearn.linear_model import SGDClassifier
base_classifiers = [DecisionTreeClassifier(max_depth=1, random_state=seed), 
                    SGDClassifier(loss='log', random_state=seed),
                    SGDClassifier(loss='hinge', random_state=seed)]
ada_hyperparams = {"base": base_classifiers,
                   "n": np.arange(1, 51),
                   "max_depth": np.arange(1,5)} # for decision tree
ada_results = {}

In [8]:
# TODO: vary over training size, base estimator (in trees, max_depth), n_estimators (boosting iterations)
ada_results["tr_size"] = []
clf = AdaBoostClassifier(random_state=seed)
for splits in split_presets:
    split_x_tr, split_x_te, split_y_tr, split_y_te = splits
    clf = clf.fit(split_x_tr, split_y_tr)
    y_pred = clf.predict(split_x_te)
    ada_results["tr_size"].append(r_acc(split_y_te, y_pred))
print(ada_results["tr_size"])

In [67]:
# SVM: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import SVC
clf = SVC(C=1.0,kernel='rbf',random_state=seed)

accuracy = cross_val_score(clf, x_tr, y_tr, scoring='accuracy', cv=kf)
print(accuracy.mean()) # Easy graph over number of classes here
time_start = datetime.now()
clf = clf.fit(x_tr, y_tr)
print("SVC trained in {}".format(datetime.now() - time_start))
y_pred = clf.predict(x_te)
y_pred = clf.predict(x_te)
print(accuracy_score(y_te, y_pred))

0.8341563786008231
SVC trained in 0:00:00.002588
0.7377049180327869


In [72]:
svm_hyperparams = {"C": np.exp(np.arange(11)-5), # https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
                   "kernel": ["linear", "poly", "sigmoid", "rbf"],
                   "gamma": np.exp(np.arange(11)-5), # https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
                   "max_iter": np.arange(100) + 1,
                   }
svm_results = {}

In [73]:
svm_results["tr_size"] = []
clf = SVC()
for splits in split_presets:
    split_x_tr, split_x_te, split_y_tr, split_y_te = splits
    clf = clf.fit(split_x_tr, split_y_tr)
    y_pred = clf.predict(split_x_te)
    svm_results["tr_size"].append(r_acc(split_y_te, y_pred))
print(svm_results["tr_size"])

[0.562, 0.677, 0.717, 0.738, 0.737, 0.78, 0.811, 0.826, 0.794, 0.788]


In [74]:
svm_results["C"] = []
for C in svm_hyperparams["C"]:
    clf = SVC(C=C)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    svm_results["C"].append(r_acc(y_te, y_pred))
print(svm_results["C"])

[0.59, 0.59, 0.59, 0.59, 0.754, 0.738, 0.721, 0.705, 0.705, 0.705, 0.705]


In [75]:
svm_results["kernel"] = []
for ker in svm_hyperparams["kernel"]:
    clf = SVC(kernel=ker)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    svm_results["kernel"].append(r_acc(y_te, y_pred))
print(svm_results["kernel"])

[0.803, 0.689, 0.787, 0.738]


In [77]:
svm_results["gamma"] = []
for gamma in svm_hyperparams["gamma"]:
    clf = SVC(gamma=gamma)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    svm_results["gamma"].append(r_acc(y_te, y_pred))
print(svm_results["gamma"])

[0.77, 0.77, 0.77, 0.721, 0.623, 0.59, 0.59, 0.59, 0.59, 0.59, 0.59]


In [78]:
"""This is pretty hard to interpet --> ignore"""
svm_results["max_iter"] = []
for max_iter in svm_hyperparams["max_iter"]:
    clf = SVC(max_iter=max_iter)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    svm_results["max_iter"].append(r_acc(y_te, y_pred))
print(svm_results["max_iter"])



[0.689, 0.59, 0.59, 0.59, 0.672, 0.689, 0.672, 0.689, 0.689, 0.689, 0.721, 0.689, 0.689, 0.672, 0.721, 0.803, 0.787, 0.721, 0.787, 0.77, 0.803, 0.738, 0.738, 0.738, 0.721, 0.787, 0.77, 0.803, 0.77, 0.77, 0.77, 0.738, 0.787, 0.77, 0.754, 0.754, 0.787, 0.803, 0.836, 0.787, 0.754, 0.754, 0.787, 0.787, 0.77, 0.77, 0.77, 0.77, 0.754, 0.754, 0.787, 0.787, 0.77, 0.787, 0.754, 0.754, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.754, 0.754, 0.754, 0.754, 0.754, 0.754, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738, 0.738]




In [55]:
# KNN: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()

accuracy = cross_val_score(clf, x_tr, y_tr, scoring='accuracy', cv=kf)
print(accuracy.mean()) # Easy graph over number of classes here
time_start = datetime.now()
clf = clf.fit(x_tr, y_tr)
print("KNN trained in {}".format(datetime.now() - time_start))
y_pred = clf.predict(x_te)
y_pred = clf.predict(x_te)
print(accuracy_score(y_te, y_pred))

0.8133230452674898
SVC trained in 0:00:00.001325
0.7704918032786885


In [58]:
# KNN Hyperparams
knn_hyperparams = {"k": np.arange(10) + 1,
                   "weighting": ['uniform', 'distance'],
                   "p": np.arange(10) + 1}
# Base params: relu, max_iter: 200 (==> convergence), lbfgs (good for smaller datasets), standard alpha
knn_results = {}

In [59]:
knn_results["tr_size"] = []
clf = KNeighborsClassifier()
for splits in split_presets:
    split_x_tr, split_x_te, split_y_tr, split_y_te = splits
    clf = clf.fit(split_x_tr, split_y_tr)
    y_pred = clf.predict(split_x_te)
    knn_results["tr_size"].append(r_acc(split_y_te, y_pred))
print(knn_results["tr_size"])

[0.75, 0.774, 0.761, 0.77, 0.776, 0.769, 0.802, 0.826, 0.794, 0.788]


In [61]:
knn_results["k"] = []
for k in knn_hyperparams["k"]:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    knn_results["k"].append(r_acc(y_te, y_pred))
print(knn_results["k"])

[0.705, 0.689, 0.738, 0.672, 0.77, 0.672, 0.721, 0.689, 0.738, 0.705]


In [63]:
knn_results["weighting"] = []
for weighting in knn_hyperparams["weighting"]:
    clf = KNeighborsClassifier(weights=weighting)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    knn_results["weighting"].append(r_acc(y_te, y_pred))
print(knn_results["weighting"]) # Presumably due to dimensionality -> space too sparse

[0.77, 0.77]


In [62]:
knn_results["p"] = []
for p in knn_hyperparams["p"]:
    clf = KNeighborsClassifier(p=p)
    clf = clf.fit(x_tr, y_tr)
    y_pred = clf.predict(x_te)
    knn_results["p"].append(r_acc(y_te, y_pred))
print(knn_results["p"])

[0.721, 0.77, 0.77, 0.754, 0.754, 0.77, 0.77, 0.787, 0.787, 0.787]
