In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, getopt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
"""
Helper Functions
"""

# def perf(test_y, pred_y):
#     """
#     params:
#         pred_y : predicted y labels for data
#         test_y : actual y labels for data
#     return:
#         Fraction of correctly classified samples
#     """
#     res = pred_y - test_y
#     return res[res == 0].count() / res.count()


def split_xy(df):
    """
    params:
        df : dataframe
    return:
        tuple
            0 - dataframe of data w
            1 - ser
    """
    y = df['Class']
    x = df.drop('Class',axis=1)
    return x,y

In [3]:
"""
Data Collection
"""
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']        
classes = {1 : 'benign', -1 : 'malignant'}
# All columns have values b/w 1-10 except for first (id) and last (class)
data = pd.read_csv('breast-cancer-wisconsin.data', names=colnames)
data.loc[(data['Class'] == 2),'Class'] = 1
data.loc[(data['Class'] == 4),'Class'] = -1
# ADJUST FOR MISISNG DATA '?'
data = data.replace('?', np.nan)

test_data = data.sample(frac=.1)
pre_train_data = data.drop(test_data.index).reset_index(drop=True)

In [4]:
"""
Cross Validation
https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
"""

def knnMethod(train_x, train_y, test_x, test_y, imputer, n_neighbors=1, p=1):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p) # p-l1 vs l2 distance
    knn.fit(xTri, train_y)
    preds_knn = knn.predict(xTei)
    score = knn.score(xTei, test_y)
    print("1-kNN %f" % score)
    perf = classification_report(test_y, preds_knn,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score, perf
    
def decisionTree(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(xTri, train_y)
    preds_dt = dt.predict(xTei)
    score = dt.score(xTei,test_y)
    print("Decision Tree %f" % score)
    perf = classification_report(test_y, preds_dt,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score, perf

def randomForest(train_x, train_y, test_x, test_y, imputer, n_estimators=100):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    rf = RandomForestClassifier(criterion="gini", n_estimators=100)
    rf.fit(xTri, train_y)
    preds_rf = rf.predict(xTei)
    score = rf.score(xTei,test_y)
    print("Random Forest %f" % score)
    perf = classification_report(test_y, preds_rf,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf

def polynomialSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
    svm_3.fit(xTri, train_y)
    preds_svm3 = svm_3.predict(xTei)
    score = svm_3.score(xTei,test_y)
    print("[Polynomial^3] SVM %f" % score)
    perf = classification_report(test_y, preds_svm3,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
def gaussianSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
    svm_g.fit(xTri, train_y)
    preds_svmg = svm_g.predict(xTei)
    score = svm_g.score(xTei,test_y)
    print("[Gaussian] SVM %f" % score)
    perf = classification_report(test_y, preds_svmg,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
def sigmoidNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_sig = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_sig.fit(xTri, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_sig = mlp_sig.predict(xTei)
    score = mlp_sig.score(xTei,test_y)
    print("Sigmoid NN %f" % score)
    perf = classification_report(test_y, preds_nn_sig,output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
def reluNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_relu = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_relu.fit(xTri, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_relu = mlp_relu.predict(xTei)
    score = mlp_relu.score(xTei,test_y)
    print("Relu NN %f" % score)
    perf = classification_report(test_y, preds_nn_relu, output_dict=True)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
cv_datasets = []
imputers = [KNNImputer(n_neighbors=1)]
perf_data = {}
for train_idx,valid_idx in KFold(n_splits=10, random_state=1, shuffle=True).split(pre_train_data):
    train_data = pre_train_data.iloc[train_idx]
    valid_data = pre_train_data.iloc[valid_idx]
    cv_datasets.append((train_data,valid_data))
    train_x, train_y = split_xy(train_data)
    valid_x, valid_y = split_xy(valid_data)
    test_x, test_y = split_xy(test_data)
    
    
    
    perf_arr['knn'] = knnMethod(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['dt'] = decisionTree(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['rf'] = randomForest(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['svmp'] = polynomialSVC(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['svmg'] = gaussianSVC(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['nnsig'] = sigmoidNN(train_x, train_y, test_x, test_y, imputers[0])
    perf_arr['nnrelu'] = reluNN(train_x, train_y, test_x, test_y, imputers[0])

print("FINISHED")

train = cv_datasets[0][0]
train_x, train_y = split_xy(train)
valid = cv_datasets[0][1]
valid_x, valid_y = split_xy(valid)
test_x, test_y = split_xy(test_data)

1-kNN 0.642857
{'-1': {'precision': 0.5652173913043478, 'recall': 0.4642857142857143, 'f1-score': 0.5098039215686274, 'support': 28}, '1': {'precision': 0.6808510638297872, 'recall': 0.7619047619047619, 'f1-score': 0.7191011235955055, 'support': 42}, 'accuracy': 0.6428571428571429, 'macro avg': {'precision': 0.6230342275670675, 'recall': 0.6130952380952381, 'f1-score': 0.6144525225820665, 'support': 70}, 'weighted avg': {'precision': 0.6345975948196115, 'recall': 0.6428571428571429, 'f1-score': 0.6353822427847543, 'support': 70}}
-----------------------------------------

Decision Tree 1.000000
{'-1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 28}, '1': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 42}, 'accuracy': 1.0, 'macro avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 70}, 'weighted avg': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 70}}
-----------------------------------------

Random Forest 0.985714
{'-1':

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest 0.985714
{'-1': {'precision': 0.9655172413793104, 'recall': 1.0, 'f1-score': 0.9824561403508771, 'support': 28}, '1': {'precision': 1.0, 'recall': 0.9761904761904762, 'f1-score': 0.9879518072289156, 'support': 42}, 'accuracy': 0.9857142857142858, 'macro avg': {'precision': 0.9827586206896552, 'recall': 0.9880952380952381, 'f1-score': 0.9852039737898963, 'support': 70}, 'weighted avg': {'precision': 0.9862068965517242, 'recall': 0.9857142857142858, 'f1-score': 0.9857535404777003, 'support': 70}}
-----------------------------------------

[Polynomial^3] SVM 0.571429
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.5882352941176471, 'recall': 0.9523809523809523, 'f1-score': 0.7272727272727274, 'support': 42}, 'accuracy': 0.5714285714285714, 'macro avg': {'precision': 0.29411764705882354, 'recall': 0.47619047619047616, 'f1-score': 0.3636363636363637, 'support': 70}, 'weighted avg': {'precision': 0.35294117647058826, 'recall': 0.57

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

Relu NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

1-kNN 0.642857
{'-1': {'precision': 0.56, 'recall': 0.5, 'f1-score': 0.5283018867924528, 'support': 28}, '1': {'precision': 0.688

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Relu NN 0.400000
{'-1': {'precision': 0.4, 'recall': 1.0, 'f1-score': 0.5714285714285715, 'support': 28}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 42}, 'accuracy': 0.4, 'macro avg': {'precision': 0.2, 'recall': 0.5, 'f1-score': 0.28571428571428575, 'support': 70}, 'weighted avg': {'precision': 0.16, 'recall': 0.4, 'f1-score': 0.22857142857142862, 'support': 70}}
-----------------------------------------

1-kNN 0.671429
{'-1': {'precision': 0.6086956521739131, 'recall': 0.5, 'f1-score': 0.5490196078431373, 'support': 28}, '1': {'precision': 0.7021276595744681, 'recall': 0.7857142857142857, 'f1-score': 0.7415730337078651, 'support': 42}, 'accuracy': 0.6714285714285714, 'macro avg': {'precision': 0.6554116558741906, 'recall': 0.6428571428571428, 'f1-score': 0.6452963207755011, 'support': 70}, 'weighted avg': {'precision': 0.6647548566142462, 'recall': 0.6714285714285714, 'f1-score': 0.6645516633619739, 'support': 70}}
-----------------------------------------

De

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

Relu NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

1-kNN 0.614286
{'-1': {'precision': 0.5217391304347826, 'recall': 0.42857142857142855, 'f1-score': 0.47058823529411764, 'support'

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

Relu NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

1-kNN 0.642857
{'-1': {'precision': 0.5652173913043478, 'recall': 0.4642857142857143, 'f1-score': 0.5098039215686274, 'support': 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

Relu NN 0.400000
{'-1': {'precision': 0.4, 'recall': 1.0, 'f1-score': 0.5714285714285715, 'support': 28}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 42}, 'accuracy': 0.4, 'macro avg': {'precision': 0.2, 'recall': 0.5, 'f1-score': 0.28571428571428575, 'support': 70}, 'weighted avg': {'precision': 0.16, 'recall': 0.4, 'f1-score': 0.22857142857142862, 'support': 70}}
-----------------------------------------

1-kNN 0.671429
{'-1': {'precision': 0.6086956521739131, 'recall': 0.5, 'f1-score': 0.5490196078431373, 'support': 28}, '1': {'pre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest 0.985714
{'-1': {'precision': 0.9655172413793104, 'recall': 1.0, 'f1-score': 0.9824561403508771, 'support': 28}, '1': {'precision': 1.0, 'recall': 0.9761904761904762, 'f1-score': 0.9879518072289156, 'support': 42}, 'accuracy': 0.9857142857142858, 'macro avg': {'precision': 0.9827586206896552, 'recall': 0.9880952380952381, 'f1-score': 0.9852039737898963, 'support': 70}, 'weighted avg': {'precision': 0.9862068965517242, 'recall': 0.9857142857142858, 'f1-score': 0.9857535404777003, 'support': 70}}
-----------------------------------------

[Polynomial^3] SVM 0.571429
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.5882352941176471, 'recall': 0.9523809523809523, 'f1-score': 0.7272727272727274, 'support': 42}, 'accuracy': 0.5714285714285714, 'macro avg': {'precision': 0.29411764705882354, 'recall': 0.47619047619047616, 'f1-score': 0.3636363636363637, 'support': 70}, 'weighted avg': {'precision': 0.35294117647058826, 'recall': 0.57

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

Relu NN 0.600000
{'-1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 28}, '1': {'precision': 0.6, 'recall': 1.0, 'f1-score': 0.7499999999999999, 'support': 42}, 'accuracy': 0.6, 'macro avg': {'precision': 0.3, 'recall': 0.5, 'f1-score': 0.37499999999999994, 'support': 70}, 'weighted avg': {'precision': 0.36, 'recall': 0.6, 'f1-score': 0.44999999999999996, 'support': 70}}
-----------------------------------------

1-kNN 0.642857
{'-1': {'precision': 0.56, 'recall': 0.5, 'f1-score': 0.5283018867924528, 'support': 28}, '1': {'precision': 0.688

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'-1': {'precision': 0.4, 'recall': 1.0, 'f1-score': 0.5714285714285715, 'support': 28}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 42}, 'accuracy': 0.4, 'macro avg': {'precision': 0.2, 'recall': 0.5, 'f1-score': 0.28571428571428575, 'support': 70}, 'weighted avg': {'precision': 0.16, 'recall': 0.4, 'f1-score': 0.22857142857142862, 'support': 70}}
-----------------------------------------

1-kNN 0.585714
{'-1': {'precision': 0.4827586206896552, 'recall': 0.5, 'f1-score': 0.49122807017543857, 'support': 28}, '1': {'precision': 0.6585365853658537, 'recall': 0.6428571428571429, 'f1-score': 0.6506024096385543, 'support': 42}, 'accuracy': 0.5857142857142857, 'macro avg': {'precision': 0.5706476030277544, 'recall': 0.5714285714285714, 'f1-score': 0.5709152399069964, 'support': 70}, 'weighted avg': {'precision': 0.5882253994953743, 'recall': 0.5857142857142857, 'f1-score': 0.5868526738533081, 'support': 70}}
-----------------------------------------

Decision Tree 0.98

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1-kNN 0.671429
{'-1': {'precision': 0.6, 'recall': 0.5357142857142857, 'f1-score': 0.5660377358490566, 'support': 28}, '1': {'precision': 0.7111111111111111, 'recall': 0.7619047619047619, 'f1-score': 0.735632183908046, 'support': 42}, 'accuracy': 0.6714285714285714, 'macro avg': {'precision': 0.6555555555555556, 'recall': 0.6488095238095237, 'f1-score': 0.6508349598785512, 'support': 70}, 'weighted avg': {'precision': 0.6666666666666667, 'recall': 0.6714285714285714, 'f1-score': 0.6677944046844502, 'support': 70}}
-----------------------------------------

Decision Tree 0.971429
{'-1': {'precision': 0.9642857142857143, 'recall': 0.9642857142857143, 'f1-score': 0.9642857142857143, 'support': 28}, '1': {'precision': 0.9761904761904762, 'recall': 0.9761904761904762, 'f1-score': 0.9761904761904762, 'support': 42}, 'accuracy': 0.9714285714285714, 'macro avg': {'precision': 0.9702380952380952, 'recall': 0.9702380952380952, 'f1-score': 0.9702380952380952, 'support': 70}, 'weighted avg': {'pre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
"""
K=1 Nearest Neighbors
"""
n_neighbors=1
knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=1) # p-l1 vs l2 distance
imputer = KNNImputer(n_neighbors=n_neighbors)
train_x_knn = imputer.fit_transform(train_x)
test_x_knn = imputer.fit_transform(test_x)
knn.fit(train_x_knn, train_y)
preds_knn = knn.predict(test_x_knn)
print("1-kNN %f" % knn.score(test_x_knn, test_y))
print(classification_report(test_y, preds_knn))

1-kNN 0.642857
              precision    recall  f1-score   support

          -1       0.57      0.46      0.51        28
           1       0.68      0.76      0.72        42

    accuracy                           0.64        70
   macro avg       0.62      0.61      0.61        70
weighted avg       0.63      0.64      0.64        70



In [6]:
"""
Decision Tree vs Random Forest
"""
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(train_x_knn, train_y)
preds_dt = dt.predict(test_x_knn)
print("Decision Tree %f" % dt.score(test_x_knn,test_y))
print(classification_report(test_y, preds_dt))

rf = RandomForestClassifier(criterion="gini", n_estimators=100)
rf.fit(train_x_knn, train_y)
preds_rf = rf.predict(test_x_knn)
print("Random Forest %f" % rf.score(test_x_knn,test_y))
print(classification_report(test_y, preds_rf))

Decision Tree 1.000000
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        42

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70

Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.97      1.00      0.98        28
           1       1.00      0.98      0.99        42

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.99        70
weighted avg       0.99      0.99      0.99        70



In [None]:
"""
Grid Search
"""
param_grid_knn = {'p':[1,2], 'n_neighbors':[1,2,3,4,5]}
param_grid_rf = {'criterion': ['gini'], 'n_estimators': [1, 5, 10, 50, 100, 500, 1000]}

param_grid_svm =[
    {'coef0':[.01,.1,1,5,10,20,30,50,75,100,125,150,175,200], 'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]},
    {'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]}
]

param_grid_nn =[
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["logistic"],'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]},
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["relu"],    'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]}
]


# max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001


# clf_k = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, n_jobs=-1)
# clf_k.fit(train_x_knn, train_y)

# NOT CONSTANT 
# clf_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, n_jobs=-1)
# clf_rf.fit(train_x_knn, train_y)

clf_nn = GridSearchCV(estimator=MLPClassifier(), param_grid=param_grid_nn, n_jobs=-1)
clf_nn.fit(train_x_knn, train_y)

#     mlp_sig = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)



In [None]:
# print(clf_k.best_params_)
# print(clf_rf.best_params_)
print(clf_nn.best_params_)

In [None]:
# Try computing confusion matrix
confusion_matrix(y_pred, y_val)
accuracy(cm)

In [None]:
param_grid_nn =[
    {'hidden_layer_sizes':[(5,5),(10,10),(3,3,3),(5,5,5),(10,10,10),(10,5,5),(5,5,10)],'activation':["logistic"],'max_iter':[100,500,1000,1500,2000,3000,5000,7500,10000],'alpha'=[.1,.01,.001,.0001,.00001,.000001],'tol'=[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1'=[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2'=[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon'=[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]},
    {'hidden_layer_sizes':[(5,5),(10,10),(3,3,3),(5,5,5),(10,10,10),(10,5,5),(5,5,10)],'activation':["relu"],'max_iter':[100,500,1000,1500,2000,3000,5000,7500,10000],'alpha'=[.1,.01,.001,.0001,.00001,.000001],'tol'=[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1'=[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2'=[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon'=[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]}
]

In [None]:
""" 2 SVM:
Custom Polynomial SVM vs Gaussian SVM
"""
svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
svm_3.fit(train_x_knn, train_y)
preds_svm3 = svm_3.predict(test_x_knn)
print("[Polynomial^3] SVM %f" % svm_3.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svm3))

svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
svm_g.fit(train_x_knn, train_y)
preds_svmg = svm_g.predict(test_x_knn)
print("[Gaussian] SVM %f" % svm_g.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svmg))

print(len(set(test_y)))
print(len(set(preds_svm3)))

In [None]:
""" 2 NN:
Sigmoid NN vs ReLu NN
"""
scaler = StandardScaler()
scaler.fit(train_x_knn)
train_x_nn = scaler.transform(train_x_knn)
test_x_nn = scaler.transform(test_x_knn)
d = train_x_nn.shape[1]


mlp_sig = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_sig.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_sig = mlp_sig.predict(test_x_nn)
print("Relu NN %f" % mlp_sig.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_sig))

mlp_relu = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_relu.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_relu = mlp_relu.predict(test_x_nn)
print("Relu NN %f" % mlp_relu.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_relu))

In [None]:
"""
TODO:
    - Cross Validation --> over all 10 data sets --> and validation
    - Grid search for correct parameters
        --> SVM Help?
"""

In [None]:
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:],"d:a:f")
    degree = 1
    for a, b in opts:
        if a == '-d':
            degree = b
        elif a == '-a':
            degree = -1
        else:
            print("Usage: %s <-d degree#> <-a>" % sys.argv[0])
    
    print(degree)

# 