In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, getopt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', 'raise')

In [2]:
"""
Helper Functions
"""

def perf(test_y, pred_y):
    """
    params:
        pred_y : predicted y labels for data
        test_y : actual y labels for data
    return:
        Fraction of correctly classified samples
    """
    res = pred_y - test_y
    return res[res == 0].count() / res.count()


def split_xy(df):
    """
    params:
        df : dataframe
    return:
        tuple
            0 - dataframe of data w
            1 - ser
    """
    y = df['Class']
    x = df.drop('Class',axis=1)
    return x,y

In [3]:
"""
Data Collection
"""
colnames = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']        
classes = {1 : 'benign', -1 : 'malignant'}
# All columns have values b/w 1-10 except for first (id) and last (class)
data = pd.read_csv('breast-cancer-wisconsin.data', names=colnames)
data.loc[(data['Class'] == 2),'Class'] = 1
data.loc[(data['Class'] == 4),'Class'] = -1
# ADJUST FOR MISISNG DATA '?'
data = data.replace('?', np.nan)

test_data = data.sample(frac=.1)
pre_train_data = data.drop(test_data.index).reset_index(drop=True)

In [4]:
"""
Cross Validation
https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/
"""
def knnMethod(train_x, train_y, test_x, test_y):
    n_neighbors=1
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=1) # p-l1 vs l2 distance
    imputer = KNNImputer(n_neighbors=n_neighbors)
    train_x_knn = imputer.fit_transform(train_x)
    test_x_knn = imputer.fit_transform(test_x)
    knn.fit(train_x_knn, train_y)
    preds_knn = knn.predict(test_x_knn)
    print("1-kNN %f" % knn.score(test_x_knn, test_y))
    print(classification_report(test_y, preds_knn))
    return train_x_knn, test_x_knn
    
def decisionTree(train_x, train_y, test_x, test_y):
    dt = DecisionTreeClassifier(criterion="entropy")
    dt.fit(train_x_knn, train_y)
    preds_dt = dt.predict(test_x_knn)
    print("Decision Tree %f" % dt.score(test_x_knn,test_y))
    print(classification_report(test_y, preds_dt))

def randomForest(train_x, train_y, test_x, test_y):
    rf = RandomForestClassifier(criterion="gini", n_estimators=100)
    rf.fit(train_x_knn, train_y)
    preds_rf = rf.predict(test_x_knn)
    print("Random Forest %f" % rf.score(test_x_knn,test_y))
    print(classification_report(test_y, preds_rf))
    
cv_datasets = []
for train_idx,valid_idx in KFold(n_splits=10, random_state=1, shuffle=True).split(pre_train_data):
    train_data = pre_train_data.iloc[train_idx]
    valid_data = pre_train_data.iloc[valid_idx]
    cv_datasets.append((train_data,valid_data))
    train_x, train_y = split_xy(train_data)
    valid_x, valid_y = split_xy(valid_data)
    test_x, test_y = split_xy(test_data)
    train_x_knn, test_x_knn = knnMethod(train_x, train_y, test_x, test_y)
    decisionTree(train_x, train_y, test_x, test_y)
    randomForest(train_x, train_y, test_x, test_y)
#     cv_datasets[2][0]

train = cv_datasets[0][0]
train_x, train_y = split_xy(train)
valid = cv_datasets[0][1]
valid_x, valid_y = split_xy(valid)
test_x, test_y = split_xy(test_data)

In [16]:
"""
K=1 Nearest Neighbors
"""
n_neighbors=1
knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=1) # p-l1 vs l2 distance
imputer = KNNImputer(n_neighbors=n_neighbors)
train_x_knn = imputer.fit_transform(train_x)
test_x_knn = imputer.fit_transform(test_x)
knn.fit(train_x_knn, train_y)
preds_knn = knn.predict(test_x_knn)
print("1-kNN %f" % knn.score(test_x_knn, test_y))
print(classification_report(test_y, preds_knn))

1-kNN 0.557143
              precision    recall  f1-score   support

          -1       0.40      0.30      0.34        27
           1       0.62      0.72      0.67        43

    accuracy                           0.56        70
   macro avg       0.51      0.51      0.50        70
weighted avg       0.54      0.56      0.54        70



In [25]:
"""
Decision Tree vs Random Forest
"""
dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(train_x_knn, train_y)
preds_dt = dt.predict(test_x_knn)
print("Decision Tree %f" % dt.score(test_x_knn,test_y))
print(classification_report(test_y, preds_dt))

rf = RandomForestClassifier(criterion="gini", n_estimators=100)
rf.fit(train_x_knn, train_y)
preds_rf = rf.predict(test_x_knn)
print("Random Forest %f" % rf.score(test_x_knn,test_y))
print(classification_report(test_y, preds_rf))

Decision Tree 0.942857
              precision    recall  f1-score   support

          -1       0.93      0.93      0.93        27
           1       0.95      0.95      0.95        43

    accuracy                           0.94        70
   macro avg       0.94      0.94      0.94        70
weighted avg       0.94      0.94      0.94        70

Random Forest 1.000000
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        27
           1       1.00      1.00      1.00        43

    accuracy                           1.00        70
   macro avg       1.00      1.00      1.00        70
weighted avg       1.00      1.00      1.00        70



In [26]:
""" 2 SVM:
Custom Polynomial SVM vs Gaussian SVM
"""
svm_3 = svm.SVC(kernel='poly', degree=3, gamma='scale', coef0=131, C=12, tol=.0001)
svm_3.fit(train_x_knn, train_y)
preds_svm3 = svm_3.predict(test_x_knn)
print("[Polynomial^3] SVM %f" % svm_3.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svm3))

svm_g = svm.SVC(kernel='rbf', gamma='auto', C=12, tol=.0001)
svm_g.fit(train_x_knn, train_y)
preds_svmg = svm_g.predict(test_x_knn)
print("[Gaussian] SVM %f" % svm_g.score(test_x_knn,test_y))
print(classification_report(test_y, preds_svmg))

print(len(set(test_y)))
print(len(set(preds_svm3)))

[Polynomial^3] SVM 0.642857
              precision    recall  f1-score   support

          -1       1.00      0.07      0.14        27
           1       0.63      1.00      0.77        43

    accuracy                           0.64        70
   macro avg       0.82      0.54      0.46        70
weighted avg       0.77      0.64      0.53        70

[Gaussian] SVM 0.628571
              precision    recall  f1-score   support

          -1       1.00      0.04      0.07        27
           1       0.62      1.00      0.77        43

    accuracy                           0.63        70
   macro avg       0.81      0.52      0.42        70
weighted avg       0.77      0.63      0.50        70

2
2


In [27]:
""" 2 NN:
Sigmoid NN vs ReLu NN
"""
scaler = StandardScaler()
scaler.fit(train_x_knn)
train_x_nn = scaler.transform(train_x_knn)
test_x_nn = scaler.transform(test_x_knn)

mlp_sig = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='logistic', max_iter=1500, alpha=.00001, tol=.0001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_sig.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_sig = mlp_sig.predict(test_x_nn)
print("Relu NN %f" % mlp_sig.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_sig))

mlp_relu = MLPClassifier(hidden_layer_sizes=(5,5,5), activation='relu', max_iter=1500, alpha=.00001, tol=.001, beta_1=.8, beta_2=.95, epsilon=.00000000001)
mlp_relu.fit(train_x_nn, train_y) # train_y.values.ravel() converts Series -> np.ndarray
preds_nn_relu = mlp_relu.predict(test_x_nn)
print("Relu NN %f" % mlp_relu.score(test_x_nn,test_y))
print(classification_report(test_y, preds_nn_relu))

Relu NN 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        27
           1       1.00      0.98      0.99        43

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.99        70
weighted avg       0.99      0.99      0.99        70

Relu NN 0.985714
              precision    recall  f1-score   support

          -1       0.96      1.00      0.98        27
           1       1.00      0.98      0.99        43

    accuracy                           0.99        70
   macro avg       0.98      0.99      0.99        70
weighted avg       0.99      0.99      0.99        70



In [10]:
"""
TODO:
    - Deal with missing data! Currently using KNN Imputer
    - Cross Validation --> over all 10 data sets --> and validation
    - Grid search for correct parameters
        --> SVM Help?
"""

'\nTODO:\n    - Deal with missing data! Currently using KNN Imputer\n    - Cross Validation --> over all 10 data sets --> and validation\n    - SVM division by 0 ??????\n    - Grid search for correct parameters\n'

SHARE W EASH & ADD GRID SEARCH

In [11]:
if __name__ == "__main__":
    opts, args = getopt.getopt(sys.argv[1:],"d:a:f")
    degree = 1
    for a, b in opts:
        if a == '-d':
            degree = b
        elif a == '-a':
            degree = -1
        else:
            print("Usage: %s <-d degree#> <-a>" % sys.argv[0])
    
    print(degree)

Usage: /Users/jfeibs/.pyenv/versions/3.9.1/envs/stockalyzer/lib/python3.9/site-packages/ipykernel_launcher.py <-d degree#> <-a>
1


# 