In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [29]:
# importing the features normalised and dropping all the non relevant columns to have a clean start
df = pd.read_csv("../data/interim/norm_features.csv")
df = df.drop(["seborrheic_keratosis", "Perimeter", "Area", "image_id", "Red", "Green", "Blue"], axis=1)

df.head()


Unnamed: 0,melanoma,Norm_Compactness,Norm_Asymmetry,Norm_Average Color
0,0.0,0.47651,0.383144,0.576044
1,0.0,0.397999,0.313298,0.350968
2,0.0,0.401858,0.183044,0.737918
3,0.0,0.319373,0.185313,0.648577
4,0.0,0.369017,0.519001,0.575675


In [49]:
# Split the data before feature selection
from sklearn.model_selection import train_test_split

#copying the dataframe. it should be with .copy() but it works like this too
df2 = df


# Some noisy features
noise = np.random.RandomState(42).uniform(0, 0.1, size=(df2.shape[0], 20))

# Add the noisy data to the informative features
# this is not something that makes sense to me. we add the noise to then ignore it in the next cell
X = np.hstack((df2[['Norm_Compactness', 'Norm_Asymmetry', "Norm_Average Color"]], noise))
y = df2['melanoma']

# Split dataset to select feature and evaluate the classifier
# the splitting is done by splitting the data into data to be used for training and validation (development of the model --> dev), and data to be used for testing. 
X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

# the development data is split into training and validation.
X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, stratify=y_dev)



In [39]:

# Select features to train the classifier with 
X_train2 = X_train[:, [0,2]] # Here just selecting the first three 


In [84]:
## test to decide which neighbor is best for accuracy:
# after deciding which is the best value for the classifier, we proceed to 

accuracy_dict = {}


neigh = int(math.sqrt(len(X_train2)))
neigh
# the range is to the neigh, as some sources suggest to use the square root of the numebr of datapoints, and it does seem as a fair estimation to decrease error
for i in range(1, neigh):
    #train the model with different values of the neighbors
    knn1 = KNeighborsClassifier(n_neighbors=i) 
    knn1trained = knn1.fit(X_train2, y_train)
    
    #Select the same features as before
    X_val2 = X_val[:, [0,2]]
    y_val_knn1 = knn1trained.predict(X_val2)
    
    #adding the accuracy value to the dictionary to assess which is the best
    # can be calculated with another calculation, but it would be time consimung to do both to show they are identical
    accuracy_percentage = accuracy_score(y_val, y_val_knn1)*100
    accuracy_dict[i] = accuracy_percentage


# getting all the values with the highest accuracy score
max_keys = [key for key, value in accuracy_dict.items() if value == max(accuracy_dict.values())]

# we use the biggest of the neighbors values as the neighbor to use for the classification, as a lower value is not recomended 
accuracy_dict[max_keys[-1]]


85.71428571428571

In [77]:
# this is a repetition of the previous step, to make sure that the accuracy is high. can be skipped if desired

# we choose the neighbor and we fit it with the training data 
knn1 = KNeighborsClassifier(n_neighbors=max_keys[-1]) 
knn1trained = knn1.fit(X_train2, y_train)

#Select the same features as before from the X validation wth noise
X_val2 = X_val[:, [0,2]]

# predicting the class labels for the provided data ( the values that were set aside for validating the accuracy of the model)
y_val_knn1 = knn1trained.predict(X_val2)


# Accuracy 
print(np.sum(y_val_knn1 == y_val) / np.size(y_val) * 100)


85.71428571428571



 we calculate the accuracy score based on the validation data that we were given. the first parameter is the true values and the second the values that derived.  
acc_knn1 = accuracy_score(y_val, y_val_knn1)

print(acc_knn1)

 it shows that we have the same accuracy score both with the function provided and the one we calculate ourselves

In [76]:
# computes the area under the curve based on the true labels and the predicted ones
auc1 = roc_auc_score(y_val, y_val_knn1)

print(auc1)


0.6666666666666666


In [82]:
# this is the evaluation of the model on the test data that was set aside for this purpose. the accuracy score is lower, but still valid 

X_test2 = X_test[:, [0,2]]
y_test_knn1 = knn1trained.predict(X_test2)

acc_test = accuracy_score(y_test, y_test_knn1)
print(acc_test)

auc_test = roc_auc_score(y_test, y_test_knn1)
print(auc_test)

0.7105263157894737
0.45
