## Get some images and import packages

In [None]:
if True:      # Wrapper kept as-is: the original author notes it is a workaround needed for Google Colab
  # Delete any earlier checkout first, so the clone below starts from a clean
  # directory (git refuses to clone into an existing non-empty directory).
  !rm -rf fyp2022-imaging
  # Clone the repository with the example images. The later cells also read
  # the ground-truth CSV and the features CSV from this checkout.
  !git clone https://github.com/vcheplygina/fyp2022-imaging.git


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


Cloning into 'fyp2021p3'...
remote: Enumerating objects: 387, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 387 (delta 6), reused 2 (delta 0), pack-reused 374[K
Receiving objects: 100% (387/387), 855.59 MiB | 36.47 MiB/s, done.
Resolving deltas: 100% (98/98), done.
Checking out files: 100% (366/366), done.


# Load data

In [None]:
# Read the ground-truth labels and the pre-computed features from the cloned repository

file_data = 'fyp2022-imaging/data/example_ground_truth.csv'
file_features = 'fyp2022-imaging/features/features.csv'

df = pd.read_csv(file_data)
features = pd.read_csv(file_features)


# Drop the columns that are not needed and attach the two features used below.
# NOTE(review): the feature columns are matched to the labels by row index, not
# by image id - this assumes both CSV files list the images in the same order;
# confirm against the files.
df = (
    df.drop(columns=['image_id', 'seborrheic_keratosis'])
      .assign(area=features['area'], perimeter=features['perimeter'])
)

print(df.head())


   melanoma      area  perimeter
0       0.0  216160.0     2013.0
1       0.0  130493.0     1372.0
2       0.0  205116.0     1720.0
3       0.0  161705.0     1344.0
4       0.0  317040.0     2063.0


# Feature selection

In [None]:
# Build the feature matrix, split the data BEFORE any feature selection, and
# fit the feature selector on the training part only.
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

# 20 uninformative columns (uniform noise in [0, 0.1)), seeded so they are reproducible
noise = np.random.RandomState(42).uniform(0, 0.1, size=(df.shape[0], 20))

# Add the noisy data to the informative features:
# columns 0 and 1 are area and perimeter, columns 2-21 are noise
X = np.hstack((df[['area', 'perimeter']], noise))
y = df['melanoma']

# Hold out a test set that is only used for the final report
X_dev, X_test, y_dev, y_test = train_test_split(
        X, y, stratify=y, random_state=0)

# Split the remaining data into a training and a validation part.
# random_state is fixed so that re-running the notebook gives the same split.
X_train, X_val, y_train, y_val = train_test_split(
        X_dev, y_dev, stratify=y_dev, random_state=0)

# Univariate feature selection, fitted on the training data only so that the
# validation and test sets do not influence which features are kept.
# NOTE(review): later cells call `selector.transform(...)`, but no selector was
# defined anywhere in the notebook. The score function (ANOVA F-value) and k=2
# are assumptions - adjust them to the intended selection method.
selector = SelectKBest(score_func=f_classif, k=2).fit(X_train, y_train)


# Train some classifiers

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Two alternative feature subsets of the training data
X_train1 = X_train[:, [0,1]] # Here just selecting the first two columns by hand
X_train2 = selector.transform(X_train) # Here based on feature selection; `selector` must already be fitted

# Train the classifiers. All of them are trained on the selector's features
# (X_train2), so the same transformation must be applied before predicting.
knn1 = KNeighborsClassifier(n_neighbors=1) # other hyperparameters possible
knn1trained = knn1.fit(X_train2, y_train)

knn2 = KNeighborsClassifier(n_neighbors=3)
knn2trained = knn2.fit(X_train2, y_train)

# Tree construction involves randomness (features are permuted at each split),
# so the seed is fixed to make the fitted tree reproducible between runs.
tree1 = DecisionTreeClassifier(random_state=0)
tree1trained = tree1.fit(X_train2, y_train)


# Evaluate classifiers on validation set

In [None]:
# Apply the same two feature subsets to the validation data as were used for training
X_val1 = np.take(X_val, [0, 1], axis=1)
X_val2 = selector.transform(X_val)

# Predict the validation labels with both nearest-neighbour classifiers
y_val_knn1, y_val_knn2 = (
    clf.predict(X_val2) for clf in (knn1trained, knn2trained)
)

# Accuracy in percent - often used in ML but not suitable for medical imaging in general
n_val = np.size(y_val)
for y_pred in (y_val_knn1, y_val_knn2):
    print(np.sum(y_pred == y_val) / n_val * 100)

71.42857142857143
78.57142857142857


In [None]:
from sklearn.metrics import accuracy_score

# Validation accuracy of both k-NN classifiers, as a fraction in [0, 1]
acc_knn1, acc_knn2 = (
    accuracy_score(y_val, y_pred_val) for y_pred_val in (y_val_knn1, y_val_knn2)
)

print(acc_knn1, acc_knn2, sep='\n')

0.7142857142857143
0.7857142857142857


In [None]:
from sklearn.metrics import roc_auc_score

# The ROC curve is built by sweeping a threshold over a continuous score.
# Passing hard 0/1 predictions collapses it to a single operating point, so the
# predicted probability of the positive class is used instead.
# predict_proba orders its columns like `classes_`; with binary 0/1 labels
# column 1 is the positive (melanoma) class - assumes the labels are 0/1.
score_val_knn1 = knn1trained.predict_proba(X_val2)[:, 1]
score_val_knn2 = knn2trained.predict_proba(X_val2)[:, 1]

auc1 = roc_auc_score(y_val, score_val_knn1)
auc2 = roc_auc_score(y_val, score_val_knn2)

print(auc1)
print(auc2)

0.6363636363636364
0.6818181818181818


In [None]:
# ONLY FOR REPORTING, also evaluate on test set

# The classifier was trained on the features chosen by `selector`, so the test
# data must go through the same transformation (not a hand-picked column list).
# A new name is used so that X_test itself stays untouched and the cell can be
# re-run safely.
X_test2 = selector.transform(X_test)

# Evaluate the 3-nearest-neighbour classifier (knn2)
y_test_knn2 = knn2trained.predict(X_test2)
# Probability of the positive class for the AUC - assumes binary 0/1 labels,
# so that column 1 of predict_proba is the positive class
score_test_knn2 = knn2trained.predict_proba(X_test2)[:, 1]

acc_test = accuracy_score(y_test, y_test_knn2)
auc_test = roc_auc_score(y_test, score_test_knn2)

print(acc_test)
print(auc_test)

# For small datasets these results will depend on the random seed you chose when splitting, 
# this is why it is good to look at multiple splits/cross-validation

0.6842105263157895
0.43333333333333335
