In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, getopt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
"""
Helper Functions
"""

def getResults(perf, keys):
    results = {}
    for key in keys:
        data = []
        for p in perf[key]:
            sensitivity = p['1']['recall']
            specificity = p['-1']['recall']
            accuracy = p['accuracy']
            precision = p['macro avg']['precision']
            recall = p['macro avg']['recall']
            f1 = p['macro avg']['f1-score']
            json = {'Sensitivity':sensitivity, 'Specificity':specificity, 'Accuracy':accuracy, 'Precision':precision, 'Recall':recall, 'F1':f1}
            data.append(json)
        results[key] = pd.DataFrame(data).mean()    
    return pd.DataFrame(results).T

def split_xy(df):
    """
    params:
        df : dataframe
    return:
        tuple
            0 - dataframe of data w
            1 - ser
    """
    y = df['Class']
    x = df.drop('Class',axis=1)
    return x,y

In [3]:
"""
Data Collection
"""
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']        
classes = {1 : 'benign', -1 : 'malignant'}
# All columns have values b/w 1-10 except for first (id) and last (class)
data = pd.read_csv('breast-cancer-wisconsin.data', names=colnames)
data.loc[(data['Class'] == 2),'Class'] = 1
data.loc[(data['Class'] == 4),'Class'] = -1
# ADJUST FOR MISISNG DATA '?'
data = data.replace('?', np.nan)

test_data = data.sample(frac=.1)
pre_train_data = data.drop(test_data.index).reset_index(drop=True)

In [4]:
def knnMethod(train_x, train_y, test_x, test_y, imputer, n_neighbors=1, p=1):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p) # p-l1 vs l2 distance
    knn.fit(xTri, train_y)
    preds_knn = knn.predict(xTei)
    perf = classification_report(test_y, preds_knn,output_dict=True)
    print("1-kNN %f" % perf['macro avg']['f1-score'])
    return perf
    
def decisionTree(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(xTri, train_y)
    preds_dt = dt.predict(xTei)
    perf = classification_report(test_y, preds_dt,output_dict=True)
    print("Decision Tree %f" % perf['macro avg']['f1-score'])
    return perf

def randomForest(train_x, train_y, test_x, test_y, imputer, n_estimators=100):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    rf = RandomForestClassifier(criterion="gini", n_estimators=100)
    rf.fit(xTri, train_y)
    preds_rf = rf.predict(xTei)
    perf = classification_report(test_y, preds_rf,output_dict=True)
    print("Random Forest %f" % perf['macro avg']['f1-score'])
    return perf

def polynomialSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrs, xTes = scaler.transform(xTri), scaler.transform(xTei)
    svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
    svm_3.fit(xTrs, train_y)
    preds_svm3 = svm_3.predict(xTes)
    perf = classification_report(test_y, preds_svm3,output_dict=True, zero_division=True)
    print("[Polynomial^3] SVM %f" % perf['macro avg']['f1-score'])
    return perf
    
def gaussianSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrs, xTes = scaler.transform(xTri), scaler.transform(xTei)
    svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
    svm_g.fit(xTrs, train_y)
    preds_svmg = svm_g.predict(xTes)
    perf = classification_report(test_y, preds_svmg,output_dict=True, zero_division=True)
    print("[Gaussian] SVM %f" % perf['macro avg']['f1-score'])
    return perf
    
def sigmoidNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_sig = MLPClassifier(hidden_layer_sizes=(10,10,10), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_sig.fit(xTrnn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_sig = mlp_sig.predict(xTenn)
    perf = classification_report(test_y, preds_nn_sig,output_dict=True,zero_division=True)
    print("Sigmoid NN %f" % perf['macro avg']['f1-score'])
    return perf
    
def reluNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_relu = MLPClassifier(hidden_layer_sizes=(10,10,10), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_relu.fit(xTrnn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_relu = mlp_relu.predict(xTenn)
    perf = classification_report(test_y, preds_nn_relu, output_dict=True, zero_division=True)
    print("Relu NN %f" % perf['macro avg']['f1-score'])
    return perf

In [5]:
"""
Cross Validation
https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
"""
cv_datasets = []
imputers = [KNNImputer(n_neighbors=1)]
keys = ['knn', 'dt', 'rf', 'svmp', 'svmg', 'nnsig', 'nnrelu']
perf = {key:[] for key in keys}
for train_idx,valid_idx in KFold(n_splits=10, random_state=1, shuffle=True).split(pre_train_data):
    train_data = pre_train_data.iloc[train_idx]
    valid_data = pre_train_data.iloc[valid_idx]
    cv_datasets.append((train_data,valid_data))
    train_x, train_y = split_xy(train_data)
    valid_x, valid_y = split_xy(valid_data)
    test_x, test_y = split_xy(test_data)
    
    perf['knn'].append(knnMethod(train_x, train_y, test_x, test_y, imputers[0]))
    perf['dt'].append(decisionTree(train_x, train_y, test_x, test_y, imputers[0]))
    perf['rf'].append(randomForest(train_x, train_y, test_x, test_y, imputers[0]))
    perf['svmp'].append(polynomialSVC(train_x, train_y, test_x, test_y, imputers[0]))
    perf['svmg'].append(gaussianSVC(train_x, train_y, test_x, test_y, imputers[0]))
    perf['nnsig'].append(sigmoidNN(train_x, train_y, test_x, test_y, imputers[0]))
    perf['nnrelu'].append(reluNN(train_x, train_y, test_x, test_y, imputers[0]))
results = getResults(perf,keys)
print("FINISHED")
display(results)
# DEBUG
train = cv_datasets[0][0]
train_x, train_y = split_xy(train)
valid = cv_datasets[0][1]
valid_x, valid_y = split_xy(valid)
test_x, test_y = split_xy(test_data)
imputer = KNNImputer(n_neighbors=1)
train_x = imputer.fit_transform(train_x)
test_x = imputer.fit_transform(test_x)

1-kNN 0.656140
Decision Tree 0.969854
Random Forest 0.984819
[Polynomial^3] SVM 0.936594
[Gaussian] SVM 0.906667
Sigmoid NN 0.984819
Relu NN 0.984819
1-kNN 0.619565
Decision Tree 0.969854
Random Forest 1.000000
[Polynomial^3] SVM 0.906667
[Gaussian] SVM 0.909561
Sigmoid NN 1.000000
Relu NN 0.969854
1-kNN 0.643848
Decision Tree 0.984819
Random Forest 1.000000
[Polynomial^3] SVM 0.921507
[Gaussian] SVM 0.906667
Sigmoid NN 0.969854
Relu NN 0.984819
1-kNN 0.682971
Decision Tree 0.969854
Random Forest 1.000000
[Polynomial^3] SVM 0.904891
[Gaussian] SVM 0.893732
Sigmoid NN 1.000000
Relu NN 1.000000
1-kNN 0.643848
Decision Tree 1.000000
Random Forest 1.000000
[Polynomial^3] SVM 0.921507
[Gaussian] SVM 0.922891
Sigmoid NN 1.000000
Relu NN 1.000000
1-kNN 0.580436
Decision Tree 0.954457
Random Forest 1.000000
[Polynomial^3] SVM 0.953734
[Gaussian] SVM 0.922891
Sigmoid NN 1.000000
Relu NN 0.969854
1-kNN 0.635417
Decision Tree 0.969406
Random Forest 1.000000
[Polynomial^3] SVM 0.906667
[Gaussian] 

Unnamed: 0,Sensitivity,Specificity,Accuracy,Precision,Recall,F1
knn,0.825,0.45,0.685714,0.660266,0.6375,0.640992
dt,0.972727,0.984615,0.977143,0.97363,0.978671,0.975677
rf,0.997727,1.0,0.998571,0.998148,0.998864,0.998482
svmp,0.975,0.834615,0.922857,0.930777,0.904808,0.914925
svmg,0.940909,0.876923,0.917143,0.913569,0.908916,0.910788
nnsig,0.990909,1.0,0.994286,0.992725,0.995455,0.993949
nnrelu,0.975,1.0,0.984286,0.980277,0.9875,0.983406


In [7]:
"""
- work with output results perf, score --> get fmeasure, sensitivity, recall, ROC, accuracy, etc
- SVM ISSUES:
    - Sometimes infinite look
    - Experiment with soft margin
    
"""


'\n- work with output results perf, score --> get fmeasure, sensitivity, recall, ROC, accuracy, etc\n- SVM ISSUES:\n    - Sometimes infinite look\n    - Experiment with soft margin\n    \n'

In [None]:
"""
Grid Search
"""
param_grid_knn = {'p':[1,2], 'n_neighbors':[1,2,3,4,5]}
param_grid_rf = {'criterion': ['gini'], 'n_estimators': [1, 5, 10, 50, 100, 500, 1000]}

param_grid_svm = [
    {'kernel':['poly'],'degree':[2,3,4,5], 'gamma':['scale'], 'coef0':[.01,.1,1,5,10,20,30,50,75,100,125,150,175,200], 'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]},
    {'kernel':['rbf'],'gamma':['auto'],'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]}
]                                

param_grid_nn =[
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["logistic"],'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]},
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["relu"],    'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]}
]


# max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001


clf_k = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, n_jobs=-1, cv=5)
clf_k.fit(train_x, train_y)
print(clf_k.best_params_)

# NOT CONSTANT 
clf_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, n_jobs=-1, cv=5)
clf_rf.fit(train_x, train_y)
print(clf_rf.best_params_)

clf_svm = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid_svm, n_jobs=-1, cv=5)
clf_svm.fit(train_x, train_y)
print(clf_svm.best_params_)

# CURSE OF DIM : takes too long too run
# clf_nn = GridSearchCV(estimator=MLPClassifier(), param_grid=param_grid_nn, n_jobs=-1)
# clf_nn.fit(train_x_knn, train_y)

# print(clf_k.best_params_)
# print(clf_rf.best_params_)
# print(clf_nn.best_params_)

In [None]:
p = clf_k.predict(test_x)
score = clf_k.score(test_x,test_y)
print(score)
perf = classification_report(test_y, p,output_dict=True)


In [None]:
# # Try computing confusion matrix
# confusion_matrix(y_pred, y_val)
# accuracy(cm)

In [None]:
"""
TODO:
    - Cross Validation --> over all 10 data sets --> and validation
    - Grid search for correct parameters
        --> SVM Help?
"""

In [None]:
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:],"d:a:f")
    degree = 1
    for a, b in opts:
        if a == '-d':
            degree = b
        elif a == '-a':
            degree = -1
        else:
            print("Usage: %s <-d degree#> <-a>" % sys.argv[0])
    
    print(degree)

# 