In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score 
from sklearn.model_selection import  StratifiedShuffleSplit

In [2]:
# Read the feature table from the CSV file next to the notebook. The cell's
# last expression displays (rows, columns) as a quick sanity check on the load.
data = pd.read_csv('./database.csv')
data.shape

(77, 13)

In [3]:
# Min-max scale the listed feature columns to the [0, 1] range:
#     scaled = (value - column_min) / (column_max - column_min)
# 'eccentricity' is not in the list and is therefore left exactly as loaded.
#
# NOTE(review): the minimum and maximum are taken over *all* rows, and the
# train/test split only happens in a later cell, so the held-out rows
# influence the scaling applied to the training rows.
# NOTE: a column whose values are all identical would become NaN (0 / 0).
scaledColumns = [
    'centroidX',
    'centroidY',
    'orientation',
    'perimeter',
    'meanIntensity',
    'extent',
    'aspectRatio',
    'area',
    'equivalentDiameter',
]

for column in scaledColumns:
    columnMin = data[column].min()
    columnMax = data[column].max()
    # Bracket indexing is used on purpose: attribute-style assignment
    # (data.area = ...) silently creates a plain Python attribute instead of
    # updating the frame if the name is not an existing column.
    data[column] = (data[column] - columnMin) / (columnMax - columnMin)

# Remove the two columns that are not used as model inputs.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and a strict drop would raise KeyError.
data = data.drop(columns=['previousCancerDiagnosis', 'dicomPatientId'], errors='ignore')
data.head()

Unnamed: 0,label,area,perimeter,aspectRatio,centroidX,centroidY,equivalentDiameter,extent,meanIntensity,orientation,eccentricity
0,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
1,0,0.326653,0.350579,0.949091,0.530454,0.842094,0.534709,0.308178,0.333713,0.036288,0.811104
2,0,0.334669,0.229176,0.04,1.0,0.894392,0.54224,0.435063,0.684463,0.464267,0.874032
3,0,0.130261,0.086087,0.28,0.040752,0.857569,0.30873,0.720855,0.668596,0.545255,0.602152
4,1,0.1002,0.072654,0.44,0.470218,0.208083,0.261825,0.700916,0.334637,0.793856,0.418157


In [4]:
# Every column except the target is a model input.
# The comparison must be `!=`: `column not in 'label'` would be a substring
# test against the string 'label' and would also discard any column whose
# name happens to be a fragment of it (e.g. 'la' or 'bel').
featuresColumns = [column for column in data.columns if column != 'label']

# Single stratified shuffle split. test_size is an absolute row count
# (23 held-out rows); random_state pins the shuffle so the split is repeatable.
stf = StratifiedShuffleSplit(n_splits=1, test_size=23, random_state=42)
trainIdx, testIdx = next(stf.split(data[featuresColumns], data['label']))

# split() yields *positional* row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while the frame still carries the
# default 0..n-1 index and would pick wrong (or missing) rows otherwise.
xTrain = data[featuresColumns].iloc[trainIdx]
yTrain = data['label'].iloc[trainIdx]

xTest = data[featuresColumns].iloc[testIdx]
yTest = data['label'].iloc[testIdx]

In [5]:
# Nearest-neighbour classifier that decides from a single neighbour (k = 1),
# fitted on the training split only. The fitted estimator is the cell output.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X=xTrain, y=yTrain)

KNeighborsClassifier(n_neighbors=1)

In [6]:
def measureErrors(yTrue, yGuess, label):
    """Summarise classification quality as a named pandas Series.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels.
    yGuess : array-like
        Predicted labels. The same values are handed to every metric,
        including ``roc_auc_score``, so 'roc' is computed from whatever is
        passed here rather than from separate scores.
    label : str
        Name given to the returned Series (it becomes the column header when
        several results are concatenated side by side).

    Returns
    -------
    pandas.Series
        Entries 'accuracy', 'precision', 'recall' and 'roc', in that order.
    """
    # Metric name -> scoring function; insertion order fixes the row order.
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'roc': roc_auc_score,
    }
    scores = {metric: scorer(yTrue, yGuess) for metric, scorer in scorers.items()}
    return pd.Series(scores, name=label)

In [7]:
# Predict on the rows the classifier was fitted on and on the held-out rows.
trainPrediction = knn.predict(xTrain)
testPrediction = knn.predict(xTest)

# One column of metrics per split, placed side by side for direct comparison.
trainAndTestErrors = pd.concat(
    [
        measureErrors(yTrue=yTrain, yGuess=trainPrediction, label='train'),
        measureErrors(yTrue=yTest, yGuess=testPrediction, label='test'),
    ],
    axis='columns',
)

trainAndTestErrors

Unnamed: 0,train,test
accuracy,1.0,0.826087
precision,1.0,0.7
recall,1.0,0.875
roc,1.0,0.8375


In [9]:
import pickle

# Persist the fitted classifier to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; a bare open()
# passed straight to pickle.dump is never closed explicitly.
# NOTE: only unpickle 'knn.sav' from a trusted source — loading a pickle can
# execute arbitrary code.
with open('knn.sav', 'wb') as modelFile:
    pickle.dump(knn, modelFile)