In [None]:
import math

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, auc, confusion_matrix, precision_score, pairwise
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC

In [None]:
# Read the normalised feature table and discard the columns that are not used
# further down, so the rest of the notebook starts from a clean frame.
df = (
    pd.read_csv("../data/processed/ISIC_2017_norm_features.csv")
    .drop(columns=[
        "seborrheic_keratosis",
        "Perimeter",
        "Area",
        "image_id",
        "Red",
        "Green",
        "Blue",
    ])
)

# Note: for sex, 1 is female, 0 is male
df.head()

In [None]:
# Work on an independent copy of the feature table with incomplete rows removed.
df2 = df.copy().dropna()


# 20 columns of low-amplitude uniform noise in [0, 0.1), drawn with a fixed seed.
noise = np.random.RandomState(42).uniform(0, 0.1, size=(df2.shape[0], 20))

# Design matrix: columns 0-4 are the five real features (in the order listed
# below), columns 5-24 are the noise columns appended to their right.
X = np.hstack((df2[['Norm_Compactness', 'Norm_Asymmetry', "Norm_Average Color","Norm_Age","Sex"]], noise))
y = df2['melanoma'].astype("int32")


# First split: development data (used for training + validation) versus
# held-out test data. Stratified on the label; seeded so that a kernel restart
# reproduces exactly the same partition.
X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, stratify=y, random_state=42)

# Second split: the development data is divided into training and validation.
X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, stratify=y_dev, random_state=42)


# Oversample the minority class of the *training* part only, up to a
# minority/majority ratio of 0.6, to compensate for the imbalanced dataset.
# Seeded for the same reproducibility reason as the splits above.
oversample = RandomOverSampler(sampling_strategy = 0.6, random_state=42)
X_over, y_over = oversample.fit_resample(X_train,y_train)


In [None]:
# Repeated search for the number of neighbours (k) of the k-NN classifier.
#
# Every repetition draws a fresh stratified dev/test and train/validation
# split, oversamples the training part, and scores every k in
# 1 .. 4*sqrt(n_train) - 1. Some sources suggest k close to sqrt(n_train);
# the search deliberately goes up to four times that value.
#
# Outputs left in the kernel:
#   best_k_list    - the selected k of every repetition
#   roc_val_dict   - k -> validation ROC AUC of the LAST repetition
#   roc_test_dict  - k -> test ROC AUC of the LAST repetition

N_REPEATS = 100
N_FEATURES = 5  # leading columns of X that hold the real features; the rest is noise

# Select the wanted features BEFORE splitting. Slicing the split arrays at the
# top of the loop and then re-splitting from the full X would silently discard
# the selection and train on the noise columns as well.
X_features = X[:, :N_FEATURES]

best_k_list = []
for count in range(N_REPEATS):
    roc_val_dict = {}
    roc_test_dict = {}
    roc_mean_dict = {}

    # A different (but reproducible) seed per repetition gives N_REPEATS
    # distinct partitions that can be regenerated after a kernel restart.
    X_dev, X_test, y_dev, y_test = train_test_split(
            X_features, y, stratify=y, random_state=count)
    X_train, X_val, y_train, y_val = train_test_split(
            X_dev, y_dev, stratify=y_dev, random_state=count)

    # Oversampling to compensate for the imbalanced dataset (training part only).
    oversample = RandomOverSampler(sampling_strategy = 0.6, random_state=count)
    X_over, y_over = oversample.fit_resample(X_train,y_train)

    # Upper bound of the search; k can never exceed the number of fitted samples.
    neigh = int(math.sqrt(len(X_over)))
    max_k = min(neigh * 4, len(X_over) + 1)

    for i in range(1, max_k):
        knn1trained = KNeighborsClassifier(n_neighbors=i).fit(X_over, y_over)

        # Probability of the positive class is column 1 of predict_proba.
        y_val_knn1 = knn1trained.predict_proba(X_val)
        y_test_knn1 = knn1trained.predict_proba(X_test)

        fpr, tpr, threshold = roc_curve(y_val, y_val_knn1[:,1])
        roc_val = auc(fpr,tpr)
        fpr, tpr, threshold = roc_curve(y_test, y_test_knn1[:,1])
        roc_test = auc(fpr,tpr)

        roc_val_dict[i] = roc_val
        roc_test_dict[i] = roc_test
        # NOTE(review): k is selected on the mean of validation AND test AUC,
        # so the test split influences model selection and any test score
        # reported for the chosen k is optimistic. Criterion kept as written;
        # consider selecting on roc_val only.
        roc_mean_dict[i] = np.mean([roc_val,roc_test])

    # Among all k that reach the best score, keep the largest one.
    best_score = max(roc_mean_dict.values())
    k = [key for key, value in roc_mean_dict.items() if value == best_score][-1]
    best_k_list.append(k)
    print(count)

# Average of the selected k over all repetitions.
print(np.mean(best_k_list))



In [None]:
# Best score held in `roc_val_dict`, followed by a bar chart of all its entries.
# Expects `roc_val_dict` to already exist in the kernel namespace
# (presumably k -> validation ROC AUC - TODO confirm).
val_scores = list(roc_val_dict.values())
print(max(val_scores))
plt.bar(list(roc_val_dict.keys()), val_scores)
plt.ylim(0.5, 1);

In [None]:
# Best ROC AUC score held in `roc_test_dict` (keyed by the number of
# neighbours k), followed by a bar chart of the score for every k.
test_scores = list(roc_test_dict.values())
print(max(test_scores))
plt.bar(list(roc_test_dict.keys()), test_scores)
plt.ylim(0.5, 1);

In [None]:
# ROC curve of the final k-NN model on the held-out test split.
#
# The model is fitted here, with k set to the rounded average of the per-
# repetition best k values, on the oversampled training data that the search
# cell left in the kernel (X_over / y_over belong to the same partition as
# X_test / y_test, so the feature columns are guaranteed to match).
final_k = int(round(np.mean(best_k_list)))
final_knn_trained = KNeighborsClassifier(n_neighbors=final_k).fit(X_over, y_over)

# Column 1 of predict_proba is the probability of the positive class.
y_scores = final_knn_trained.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()

# Show only the first few class-probability rows instead of the whole array.
y_scores[:5]

In [None]:
# Final k-NN model on the first 3 features (compactness, asymmetry, average
# colour), trained on every complete row of the dataset (no hold-out).
#
# Gotcha: this cell overwrites the kernel-wide X, y, X_over and y_over, so the
# earlier search cells must be re-run from the data preparation cell afterwards.

# 20 noise columns are appended and then removed again by the [:, 0:3] slice,
# so none of them reach the model.
noise = np.random.RandomState(42).uniform(0, 0.1, size=(df2.shape[0], 20))
X = np.hstack((df2[['Norm_Compactness', 'Norm_Asymmetry', "Norm_Average Color","Norm_Age","Sex"]], noise))[:,0:3]
y = df2['melanoma'].astype("int32")

# Seeded so the model that gets persisted can be regenerated exactly.
oversample = RandomOverSampler(sampling_strategy = 0.6, random_state=42)
X_over, y_over = oversample.fit_resample(X,y)
# NOTE(review): n_neighbors=91 is hard-coded; presumably copied from an earlier
# run of the repeated k search - confirm it still matches.
final_knn = KNeighborsClassifier(n_neighbors=91)
final_knn = final_knn.fit(X_over, y_over)

In [None]:
# Persist the fitted 3-feature k-NN model; the path is relative to the current
# working directory.
joblib.dump(final_knn,"knn_trained.joblib")

In [None]:
# Final k-NN model on all 5 features (compactness, asymmetry, average colour,
# age, sex), trained on every complete row of the dataset (no hold-out).
#
# Gotcha: this cell overwrites the kernel-wide X, y, X_over and y_over, so the
# earlier search cells must be re-run from the data preparation cell afterwards.

# 20 noise columns are appended and then removed again by the [:, 0:5] slice,
# so none of them reach the model.
noise = np.random.RandomState(42).uniform(0, 0.1, size=(df2.shape[0], 20))
X = np.hstack((df2[['Norm_Compactness', 'Norm_Asymmetry', "Norm_Average Color","Norm_Age","Sex"]], noise))[:,0:5]
y = df2['melanoma'].astype("int32")

# Seeded so the model that gets persisted can be regenerated exactly.
oversample = RandomOverSampler(sampling_strategy = 0.6, random_state=42)
X_over, y_over = oversample.fit_resample(X,y)
# NOTE(review): n_neighbors=94 is hard-coded; presumably copied from an earlier
# run of the repeated k search - confirm it still matches.
final_knn_extra = KNeighborsClassifier(n_neighbors=94)
final_knn_extra = final_knn_extra.fit(X_over, y_over)

In [None]:
# Persist the fitted 5-feature k-NN model; the path is relative to the current
# working directory.
joblib.dump(final_knn_extra,"knn_trained_extra.joblib")