In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, getopt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
"""
Helper Functions
"""

# def perf(test_y, pred_y):
#     """
#     params:
#         pred_y : predicted y labels for data
#         test_y : actual y labels for data
#     return:
#         Fraction of correctly classified samples
#     """
#     res = pred_y - test_y
#     return res[res == 0].count() / res.count()


def split_xy(df):
    """
    params:
        df : dataframe
    return:
        tuple
            0 - dataframe of data w
            1 - ser
    """
    y = df['Class']
    x = df.drop('Class',axis=1)
    return x,y

In [3]:
"""
Data Collection
"""
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']        
classes = {1 : 'benign', -1 : 'malignant'}
# All columns have values b/w 1-10 except for first (id) and last (class)
data = pd.read_csv('breast-cancer-wisconsin.data', names=colnames)
data.loc[(data['Class'] == 2),'Class'] = 1
data.loc[(data['Class'] == 4),'Class'] = -1
# ADJUST FOR MISISNG DATA '?'
data = data.replace('?', np.nan)

test_data = data.sample(frac=.1)
pre_train_data = data.drop(test_data.index).reset_index(drop=True)

In [None]:
"""
Cross Validation
https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
"""

param_grid_nn = [
    {'p':[1,2], 'n_neighbors':[1,2,3,4,5]}
]

param_grid_rf = [
    {'n_estimators':[100,500,1000,5000,10000]}
]

def knnMethod(train_x, train_y, test_x, test_y, imputer, n_neighbors=1, p=1):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p) # p-l1 vs l2 distance
    knn.fit(xTri, train_y)
    preds_knn = knn.predict(xTei)
    score = knn.score(xTei, test_y)
    print("1-kNN %f" % score)
    perf = classification_report(test_y, preds_knn)
    print("-----------------------------------------\n")
    return score, perf
    
def decisionTree(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(xTri, train_y)
    preds_dt = dt.predict(xTei)
    score = dt.score(xTei,test_y)
    print("Decision Tree %f" % score)
    perf = classification_report(test_y, preds_dt)
    print(perf)
    print("-----------------------------------------\n")
    return score, perf

def randomForest(train_x, train_y, test_x, test_y, imputer, n_estimators=100):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    rf = RandomForestClassifier(criterion="gini", n_estimators=100)
    rf.fit(xTri, train_y)
    preds_rf = rf.predict(xTei)
    score = rf.score(xTei,test_y)
    print("Random Forest %f" % score)
    perf = classification_report(test_y, preds_rf)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf

def polynomialSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
    svm_3.fit(xTri, train_y)
    preds_svm3 = svm_3.predict(xTei)
    score = svm_3.score(xTei,test_y)
    print("[Polynomial^3] SVM %f" % score)
    perf = classification_report(test_y, preds_svm3)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
def gaussianSVC(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
    svm_g.fit(xTri, train_y)
    preds_svmg = svm_g.predict(xTei)
    score = svm_g.score(xTei,test_y)
    print("[Gaussian] SVM %f" % score)
    perf = classification_report(test_y, preds_svmg)
    print(perf)
    print("-----------------------------------------\n")
    return score,perf
    
def sigmoidNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_sig = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_sig.fit(xTri, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_sig = mlp_sig.predict(xTei)
    score = mlp_sig.score(xTei,test_y)
    print("Sigmoid NN %f" % score)
    perf = classification_report(test_y, preds_nn_sig)
    print(perf)
    return score,perf
    
def reluNN(train_x, train_y, test_x, test_y, imputer):
    xTri, xTei = imputer.fit_transform(train_x), imputer.fit_transform(test_x)
    scaler = StandardScaler()
    scaler.fit(xTri)
    xTrnn, xTenn = scaler.transform(xTri), scaler.transform(xTei)
    mlp_relu = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
    mlp_relu.fit(xTri, train_y) # train_y.values.ravel() converts Series -> np.ndarray
    preds_nn_relu = mlp_relu.predict(xTei)
    score = mlp_relu.score(xTei,test_y)
    print("Relu NN %f" % score)
    perf = classification_report(test_y, preds_nn_relu)
    print(perf)
    return score,perf
    
cv_datasets = []
imputers = [KNNImputer(n_neighbors=1)]
for train_idx,valid_idx in KFold(n_splits=10, random_state=1, shuffle=True).split(pre_train_data):
    train_data = pre_train_data.iloc[train_idx]
    valid_data = pre_train_data.iloc[valid_idx]
    cv_datasets.append((train_data,valid_data))
    train_x, train_y = split_xy(train_data)
    valid_x, valid_y = split_xy(valid_data)
    test_x, test_y = split_xy(test_data)
    
    knnMethod(train_x, train_y, test_x, test_y, imputers[0])
    decisionTree(train_x, train_y, test_x, test_y, imputers[0])
    randomForest(train_x, train_y, test_x, test_y, imputers[0])
    polynomialSVC(train_x, train_y, test_x, test_y, imputers[0])
    gaussianSVC(train_x, train_y, test_x, test_y, imputers[0])
    sigmoidNN(train_x, train_y, test_x, test_y, imputers[0])
    reluNN(train_x, train_y, test_x, test_y, imputers[0])
#     cv_datasets[2][0]

train = cv_datasets[0][0]
train_x, train_y = split_xy(train)
valid = cv_datasets[0][1]
valid_x, valid_y = split_xy(valid)
test_x, test_y = split_xy(test_data)

1-kNN 0.528571
-----------------------------------------

Decision Tree 0.942857
              precision    recall  f1-score   support

          -1       0.92      0.92      0.92        24
           1       0.96      0.96      0.96        46

    accuracy                           0.94        70
   macro avg       0.94      0.94      0.94        70
weighted avg       0.94      0.94      0.94        70

-----------------------------------------

Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        24
           1       1.00      0.98      0.99        46

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.98        70
weighted avg       0.99      0.99      0.99        70

-----------------------------------------

[Polynomial^3] SVM 0.657143
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        24
           1       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-----------------------------------------

Decision Tree 0.971429
              precision    recall  f1-score   support

          -1       0.92      1.00      0.96        24
           1       1.00      0.96      0.98        46

    accuracy                           0.97        70
   macro avg       0.96      0.98      0.97        70
weighted avg       0.97      0.97      0.97        70

-----------------------------------------

Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        24
           1       1.00      0.98      0.99        46

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.98        70
weighted avg       0.99      0.99      0.99        70

-----------------------------------------

[Polynomial^3] SVM 0.657143
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        24
           1       0.66      1.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Sigmoid NN 0.657143
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        24
           1       0.66      1.00      0.79        46

    accuracy                           0.66        70
   macro avg       0.33      0.50      0.40        70
weighted avg       0.43      0.66      0.52        70

Relu NN 0.342857
              precision    recall  f1-score   support

          -1       0.34      1.00      0.51        24
           1       0.00      0.00      0.00        46

    accuracy                           0.34        70
   macro avg       0.17      0.50      0.26        70
weighted avg       0.12      0.34      0.18        70

1-kNN 0.628571
-----------------------------------------

Decision Tree 0.957143
              precision    recall  f1-score   support

          -1       0.96      0.92      0.94        24
           1       0.96      0.98      0.97        46

    accuracy                           0.96        70
   macro a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        24
           1       1.00      0.98      0.99        46

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.98        70
weighted avg       0.99      0.99      0.99        70

-----------------------------------------

[Polynomial^3] SVM 0.657143
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        24
           1       0.66      1.00      0.79        46

    accuracy                           0.66        70
   macro avg       0.33      0.50      0.40        70
weighted avg       0.43      0.66      0.52        70

-----------------------------------------

[Gaussian] SVM 0.685714
              precision    recall  f1-score   support

          -1       1.00      0.08      0.15        24
           1       0.68      1.00      0.81        46

    accuracy         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Relu NN 0.657143
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00        24
           1       0.66      1.00      0.79        46

    accuracy                           0.66        70
   macro avg       0.33      0.50      0.40        70
weighted avg       0.43      0.66      0.52        70

1-kNN 0.571429
-----------------------------------------

Decision Tree 0.928571
              precision    recall  f1-score   support

          -1       0.91      0.88      0.89        24
           1       0.94      0.96      0.95        46

    accuracy                           0.93        70
   macro avg       0.92      0.92      0.92        70
weighted avg       0.93      0.93      0.93        70

-----------------------------------------

Random Forest 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        24
           1       1.00      0.98      0.99        46

    accuracy      

In [None]:
"""
Grid Search
"""


In [None]:
"""
K=1 Nearest Neighbors
"""
n_neighbors=1
knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=1) # p-l1 vs l2 distance
imputer = KNNImputer(n_neighbors=n_neighbors)
train_x_knn = imputer.fit_transform(train_x)
test_x_knn = imputer.fit_transform(test_x)
knn.fit(train_x_knn, train_y)
preds_knn = knn.predict(test_x_knn)
print("1-kNN %f" % knn.score(test_x_knn, test_y))
print(classification_report(test_y, preds_knn))

In [None]:
"""
Decision Tree vs Random Forest
"""
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(train_x_knn, train_y)
preds_dt = dt.predict(test_x_knn)
print("Decision Tree %f" % dt.score(test_x_knn,test_y))
print(classification_report(test_y, preds_dt))

rf = RandomForestClassifier(criterion="gini", n_estimators=100)
rf.fit(train_x_knn, train_y)
preds_rf = rf.predict(test_x_knn)
print("Random Forest %f" % rf.score(test_x_knn,test_y))
print(classification_report(test_y, preds_rf))

In [None]:
""" 2 SVM:
Custom Polynomial SVM vs Gaussian SVM
"""
svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
svm_3.fit(train_x_knn, train_y)
preds_svm3 = svm_3.predict(test_x_knn)
print("[Polynomial^3] SVM %f" % svm_3.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svm3))

svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
svm_g.fit(train_x_knn, train_y)
preds_svmg = svm_g.predict(test_x_knn)
print("[Gaussian] SVM %f" % svm_g.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svmg))

print(len(set(test_y)))
print(len(set(preds_svm3)))

In [None]:
""" 2 NN:
Sigmoid NN vs ReLu NN
"""
scaler = StandardScaler()
scaler.fit(train_x_knn)
train_x_nn = scaler.transform(train_x_knn)
test_x_nn = scaler.transform(test_x_knn)
d = train_x_nn.shape[1]


mlp_sig = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_sig.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_sig = mlp_sig.predict(test_x_nn)
print("Relu NN %f" % mlp_sig.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_sig))

mlp_relu = MLPClassifier(hidden_layer_sizes=(d,d,d), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_relu.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_relu = mlp_relu.predict(test_x_nn)
print("Relu NN %f" % mlp_relu.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_relu))

In [None]:
"""
TODO:
    - Cross Validation --> over all 10 data sets --> and validation
    - Grid search for correct parameters
        --> SVM Help?
"""

In [None]:
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:],"d:a:f")
    degree = 1
    for a, b in opts:
        if a == '-d':
            degree = b
        elif a == '-a':
            degree = -1
        else:
            print("Usage: %s <-d degree#> <-a>" % sys.argv[0])
    
    print(degree)

# 