In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, getopt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
"""
Helper Functions
"""

# def perf(test_y, pred_y):
#     """
#     params:
#         pred_y : predicted y labels for data
#         test_y : actual y labels for data
#     return:
#         Fraction of correctly classified samples
#     """
#     res = pred_y - test_y
#     return res[res == 0].count() / res.count()


def split_xy(df):
    """
    params:
        df : dataframe
    return:
        tuple
            0 - dataframe of data w
            1 - ser
    """
    y = df['Class']
    x = df.drop('Class',axis=1)
    return x,y

In [3]:
"""
Data Collection
"""
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']        
classes = {1 : 'benign', -1 : 'malignant'}
# All columns have values b/w 1-10 except for first (id) and last (class)
data = pd.read_csv('breast-cancer-wisconsin.data', names=colnames)
data.loc[(data['Class'] == 2),'Class'] = 1
data.loc[(data['Class'] == 4),'Class'] = -1
# ADJUST FOR MISISNG DATA '?'
data = data.replace('?', np.nan)

test_data = data.sample(frac=.1)
pre_train_data = data.drop(test_data.index).reset_index(drop=True)

In [4]:
"""
Cross Validation
https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
"""

def knnMethod(train_x, train_y, test_x, test_y, imputer, n_neighbors=1, p=1):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p) # p-l1 vs l2 distance
    knn.fit(xTri, train_y)
    preds_knn = knn.predict(xTei)
    score = knn.score(xTei, test_y)
    print("1-kNN %f" % score)
    perf = classification_report(test_y, preds_knn,output_dict=True)
    return score, perf
    
def decisionTree(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(xTri, train_y)
    preds_dt = dt.predict(xTei)
    score = dt.score(xTei,test_y)
    print("Decision Tree %f" % score)
    perf = classification_report(test_y, preds_dt,output_dict=True)
    return score, perf

def randomForest(train_x, train_y, test_x, test_y, imputer, n_estimators=100):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    rf = RandomForestClassifier(criterion="gini", n_estimators=100)
    rf.fit(xTri, train_y)
    preds_rf = rf.predict(xTei)
    score = rf.score(xTei,test_y)
    print("Random Forest %f" % score)
    perf = classification_report(test_y, preds_rf,output_dict=True)
    return score,perf

def polynomialSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
    svm_3.fit(xTri, train_y)
    preds_svm3 = svm_3.predict(xTei)
    score = svm_3.score(xTei,test_y)
    print("[Polynomial^3] SVM %f" % score)
    perf = classification_report(test_y, preds_svm3,output_dict=True, zero_division=True)
    return score,perf
    
def gaussianSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
    svm_g.fit(xTri, train_y)
    preds_svmg = svm_g.predict(xTei)
    score = svm_g.score(xTei,test_y)
    print("[Gaussian] SVM %f" % score)
    perf = classification_report(test_y, preds_svmg,output_dict=True, zero_division=True)
    return score,perf
    
def sigmoidNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_sig = MLPClassifier(hidden_layer_sizes=(10,10,10), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_sig.fit(xTrnn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_sig = mlp_sig.predict(xTenn)
    score = mlp_sig.score(xTenn,test_y)
    print("Sigmoid NN %f" % score)
    perf = classification_report(test_y, preds_nn_sig,output_dict=True,zero_division=True)
    return score,perf
    
def reluNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_relu = MLPClassifier(hidden_layer_sizes=(10,10,10), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_relu.fit(xTrnn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_relu = mlp_relu.predict(xTenn)
    score = mlp_relu.score(xTenn,test_y)
    print("Relu NN %f" % score)
    perf = classification_report(test_y, preds_nn_relu, output_dict=True, zero_division=True)
    return score,perf
    
cv_datasets = []
imputers = [KNNImputer(n_neighbors=1)]
keys = ['knn', 'dt', 'rf', 'svmp', 'svmg', 'nnsig', 'nnrelu']
perf_arr = {key:[] for key in keys}
for train_idx,valid_idx in KFold(n_splits=10, random_state=1, shuffle=True).split(pre_train_data):
    train_data = pre_train_data.iloc[train_idx]
    valid_data = pre_train_data.iloc[valid_idx]
    cv_datasets.append((train_data,valid_data))
    train_x, train_y = split_xy(train_data)
    valid_x, valid_y = split_xy(valid_data)
    test_x, test_y = split_xy(test_data)
    
    perf_arr['knn'].append(knnMethod(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['dt'].append(decisionTree(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['rf'].append(randomForest(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['svmp'].append(polynomialSVC(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['svmg'].append(gaussianSVC(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['nnsig'].append(sigmoidNN(train_x, train_y, test_x, test_y, imputers[0]))
    perf_arr['nnrelu'].append(reluNN(train_x, train_y, test_x, test_y, imputers[0]))

print("FINISHED")
print()
# DEBUG
train = cv_datasets[0][0]
train_x, train_y = split_xy(train)
valid = cv_datasets[0][1]
valid_x, valid_y = split_xy(valid)
test_x, test_y = split_xy(test_data)
imputer = KNNImputer(n_neighbors=1)
train_x = imputer.fit_transform(train_x)
test_x = imputer.fit_transform(test_x)

1-kNN 0.557143
Decision Tree 0.971429
Random Forest 0.985714
[Polynomial^3] SVM 0.671429
[Gaussian] SVM 0.700000
Sigmoid NN 0.985714
Relu NN 0.985714
1-kNN 0.557143
Decision Tree 0.971429
Random Forest 0.971429
[Polynomial^3] SVM 0.671429
[Gaussian] SVM 0.700000
Sigmoid NN 0.985714
Relu NN 0.985714
1-kNN 0.585714
Decision Tree 0.985714
Random Forest 0.985714
[Polynomial^3] SVM 0.671429
[Gaussian] SVM 0.685714
Sigmoid NN 0.985714
Relu NN 0.971429
1-kNN 0.600000
Decision Tree 0.957143
Random Forest 0.985714
[Polynomial^3] SVM 0.671429
[Gaussian] SVM 0.700000
Sigmoid NN 0.985714
Relu NN 0.985714
1-kNN 0.571429
Decision Tree 0.928571
Random Forest 0.971429
[Polynomial^3] SVM 0.657143
[Gaussian] SVM 0.700000
Sigmoid NN 0.985714
Relu NN 0.985714
1-kNN 0.542857
Decision Tree 0.942857
Random Forest 0.985714
[Polynomial^3] SVM 0.671429
[Gaussian] SVM 0.700000
Sigmoid NN 0.985714
Relu NN 0.985714
1-kNN 0.600000
Decision Tree 0.985714
Random Forest 0.971429
[Polynomial^3] SVM 0.671429
[Gaussian] 

In [None]:
"""
Grid Search
"""
param_grid_knn = {'p':[1,2], 'n_neighbors':[1,2,3,4,5]}
param_grid_rf = {'criterion': ['gini'], 'n_estimators': [1, 5, 10, 50, 100, 500, 1000]}

param_grid_svm = [
    {'kernel':['poly'],'degree':[2,3,4,5], 'gamma':['scale'], 'coef0':[.01,.1,1,5,10,20,30,50,75,100,125,150,175,200], 'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]},
    {'kernel':['rbf'],'gamma':['auto'],'C':[.1,1,10,25,50,75,100,125,150,175,200], 'tol':[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8]}
]                                

param_grid_nn =[
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["logistic"],'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]},
    {'hidden_layer_sizes':[(3,3,3),(5,5,5),(10,10,10)],'activation':["relu"],    'max_iter':[100,500,1000,1500,2000,3000],'alpha':[.1,.01,.001,.0001,.00001,.000001],'tol':[1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1':[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2':[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon':[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]}
]


# max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001


clf_k = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, n_jobs=-1, cv=5)
clf_k.fit(train_x, train_y)
print(clf_k.best_params_)

# NOT CONSTANT 
clf_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid_rf, n_jobs=-1, cv=5)
clf_rf.fit(train_x, train_y)
print(clf_rf.best_params_)

clf_svm = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid_svm, n_jobs=-1, cv=5)
clf_svm.fit(train_x, train_y)
print(clf_svm.best_params_)

# CURSE OF DIM : takes too long too run
# clf_nn = GridSearchCV(estimator=MLPClassifier(), param_grid=param_grid_nn, n_jobs=-1)
# clf_nn.fit(train_x_knn, train_y)



{'n_neighbors': 1, 'p': 1}
{'criterion': 'gini', 'n_estimators': 100}


In [None]:
"""
- work with output results perf, score --> get fmeasure, sensitivity, recall, ROC, accuracy, etc
- SVM ISSUES:
    - Sometimes infinite look
    - Experiment with soft margin
    
"""


In [8]:
p = clf_k.predict(test_x)
score = clf_k.score(test_x,test_y)
print(score)
perf = classification_report(test_y, p,output_dict=True)
print(perf)

0.5571428571428572
{'-1': {'precision': 0.32, 'recall': 0.36363636363636365, 'f1-score': 0.3404255319148936, 'support': 22}, '1': {'precision': 0.6888888888888889, 'recall': 0.6458333333333334, 'f1-score': 0.6666666666666667, 'support': 48}, 'accuracy': 0.5571428571428572, 'macro avg': {'precision': 0.5044444444444445, 'recall': 0.5047348484848485, 'f1-score': 0.5035460992907802, 'support': 70}, 'weighted avg': {'precision': 0.5729523809523809, 'recall': 0.5571428571428572, 'f1-score': 0.5641337386018237, 'support': 70}}


In [9]:
score = 0
for e in perf_arr['knn']:
    score = e[0]
    perf = e[1]

In [10]:
perf_arr['knn'][0]

(0.5571428571428572,
 {'-1': {'precision': 0.32,
   'recall': 0.36363636363636365,
   'f1-score': 0.3404255319148936,
   'support': 22},
  '1': {'precision': 0.6888888888888889,
   'recall': 0.6458333333333334,
   'f1-score': 0.6666666666666667,
   'support': 48},
  'accuracy': 0.5571428571428572,
  'macro avg': {'precision': 0.5044444444444445,
   'recall': 0.5047348484848485,
   'f1-score': 0.5035460992907802,
   'support': 70},
  'weighted avg': {'precision': 0.5729523809523809,
   'recall': 0.5571428571428572,
   'f1-score': 0.5641337386018237,
   'support': 70}})

In [11]:
perf_arr['nnrelu']

[(0.9857142857142858,
  {'-1': {'precision': 0.9565217391304348,
    'recall': 1.0,
    'f1-score': 0.9777777777777777,
    'support': 22},
   '1': {'precision': 1.0,
    'recall': 0.9791666666666666,
    'f1-score': 0.9894736842105264,
    'support': 48},
   'accuracy': 0.9857142857142858,
   'macro avg': {'precision': 0.9782608695652174,
    'recall': 0.9895833333333333,
    'f1-score': 0.9836257309941521,
    'support': 70},
   'weighted avg': {'precision': 0.986335403726708,
    'recall': 0.9857142857142858,
    'f1-score': 0.9857978279030911,
    'support': 70}}),
 (0.9857142857142858,
  {'-1': {'precision': 0.9565217391304348,
    'recall': 1.0,
    'f1-score': 0.9777777777777777,
    'support': 22},
   '1': {'precision': 1.0,
    'recall': 0.9791666666666666,
    'f1-score': 0.9894736842105264,
    'support': 48},
   'accuracy': 0.9857142857142858,
   'macro avg': {'precision': 0.9782608695652174,
    'recall': 0.9895833333333333,
    'f1-score': 0.9836257309941521,
    'support

In [12]:
"""
K=1 Nearest Neighbors
"""
n_neighbors=1
knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=1) # p-l1 vs l2 distance
imputer = KNNImputer(n_neighbors=n_neighbors)
train_x_knn = imputer.fit_transform(train_x)
test_x_knn = imputer.fit_transform(test_x)
knn.fit(train_x_knn, train_y)
preds_knn = knn.predict(test_x_knn)
print("1-kNN %f" % knn.score(test_x_knn, test_y))
print(classification_report(test_y, preds_knn))

1-kNN 0.557143
              precision    recall  f1-score   support

          -1       0.32      0.36      0.34        22
           1       0.69      0.65      0.67        48

    accuracy                           0.56        70
   macro avg       0.50      0.50      0.50        70
weighted avg       0.57      0.56      0.56        70



In [13]:
"""
Decision Tree vs Random Forest
"""
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(train_x_knn, train_y)
preds_dt = dt.predict(test_x_knn)
print("Decision Tree %f" % dt.score(test_x_knn,test_y))
print(classification_report(test_y, preds_dt))

rf = RandomForestClassifier(criterion="gini", n_estimators=100)
rf.fit(train_x_knn, train_y)
preds_rf = rf.predict(test_x_knn)
print("Random Forest %f" % rf.score(test_x_knn,test_y))
print(classification_report(test_y, preds_rf))

Decision Tree 0.957143
              precision    recall  f1-score   support

          -1       0.95      0.91      0.93        22
           1       0.96      0.98      0.97        48

    accuracy                           0.96        70
   macro avg       0.96      0.94      0.95        70
weighted avg       0.96      0.96      0.96        70

Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        22
           1       1.00      0.98      0.99        48

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.98        70
weighted avg       0.99      0.99      0.99        70



In [14]:
# print(clf_k.best_params_)
# print(clf_rf.best_params_)
print(clf_nn.best_params_)

NameError: name 'clf_nn' is not defined

In [None]:
# Try computing confusion matrix
confusion_matrix(y_pred, y_val)
accuracy(cm)

In [None]:
param_grid_nn =[
    {'hidden_layer_sizes':[(5,5),(10,10),(3,3,3),(5,5,5),(10,10,10),(10,5,5),(5,5,10)],'activation':["logistic"],'max_iter':[100,500,1000,1500,2000,3000,5000,7500,10000],'alpha'=[.1,.01,.001,.0001,.00001,.000001],'tol'=[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1'=[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2'=[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon'=[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]},
    {'hidden_layer_sizes':[(5,5),(10,10),(3,3,3),(5,5,5),(10,10,10),(10,5,5),(5,5,10)],'activation':["relu"],'max_iter':[100,500,1000,1500,2000,3000,5000,7500,10000],'alpha'=[.1,.01,.001,.0001,.00001,.000001],'tol'=[1e-1,1e-2,1e-3,1e-4,1e-5,1e-6],'beta_1'=[.01,.1,.2,.3,.4,.5,.6,.7,.8,.9],'beta_2'=[.015,.15,.25,.35,.45,.55,.65,.75,.85,.95],'epsilon'=[1e-4,1e-5,1e-6,1e-7,1e-8,1e-9,1e-10]}
]

In [None]:
""" 2 SVM:
Custom Polynomial SVM vs Gaussian SVM
"""
svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
svm_3.fit(train_x_knn, train_y)
preds_svm3 = svm_3.predict(test_x_knn)
print("[Polynomial^3] SVM %f" % svm_3.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svm3))

svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
svm_g.fit(train_x_knn, train_y)
preds_svmg = svm_g.predict(test_x_knn)
print("[Gaussian] SVM %f" % svm_g.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svmg))

print(len(set(test_y)))
print(len(set(preds_svm3)))

In [None]:
""" 2 NN:
Sigmoid NN vs ReLu NN
"""
scaler = StandardScaler()
scaler.fit(train_x)
train_x_nn = scaler.transform(train_x)
test_x_nn = scaler.transform(test_x)
d = train_x_nn.shape[1]


mlp_sig = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_sig.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_sig = mlp_sig.predict(test_x_nn)
print("Relu NN %f" % mlp_sig.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_sig))

mlp_relu = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_relu.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_relu = mlp_relu.predict(test_x_nn)
print("Relu NN %f" % mlp_relu.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_relu))

In [None]:
"""
TODO:
    - Cross Validation --> over all 10 data sets --> and validation
    - Grid search for correct parameters
        --> SVM Help?
"""

In [None]:
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:],"d:a:f")
    degree = 1
    for a, b in opts:
        if a == '-d':
            degree = b
        elif a == '-a':
            degree = -1
        else:
            print("Usage: %s <-d degree#> <-a>" % sys.argv[0])
    
    print(degree)

# 