### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.feature_selection import SelectKBest,mutual_info_classif

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

import pickle as pk

## File handling 

In [2]:
# Extracting found features for each lesions with mask
file_features = "feature_data.csv"

# Naming of features
feature_names = ['assymmetry', 'red_var', 'green_var', 'blue_var', \
    'hue_var', 'sat_var', 'val_var', 'dom_hue', 'dom_sat', 'dom_val', \
    'compactness', 'convexity']

# Creating data frame for features with mask 
df_features = pd.read_csv(file_features)

# Loading in meta data for lesions with masks
file_data = 'metadata_withmasks.csv'
df = pd.read_csv(file_data)
df = df[df['mask'] == 1]

# Creating labels
labels = np.array(df['diagnostic'])
patient_id = df['patient_id']

In [3]:
#Preparing data to be split
X = df_features[feature_names]
y =  (labels == 'BCC') | (labels == 'SCC') | (labels == 'MEL')   #now True means healthy nevus, False means something else
groups = df_features['patient_id']

In [4]:
X.shape

(690, 12)

## PCA Approach 

In [7]:
# standardizing X data  
std_scl = StandardScaler()
X_std = std_scl.fit_transform(X)

pca = PCA(n_components= 0.95)

X_pca2 = pca.fit_transform(X_std)

In [60]:
X_pca.shape

(690, 8)

In [61]:
pk.dump(pca, open('pca.pkl','wb'))

In [5]:
pca_reload = pk.load(open('pca.pkl','rb'))

std_scl = StandardScaler()
X_std = std_scl.fit_transform(X)
X_pca = pca_reload.transform(X_std)

In [6]:
X_pca.shape

(690, 8)

In [8]:
X_pca == X_pca2

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [ True, False, False, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [95]:
#Splitting data into trainning and test data
X_train_std, X_test_std, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=41, stratify=y)

In [96]:
# Creating PCA class
pca = PCA(n_components=0.95)

# Fitting PCA class for the training set
pca.fit(X_train_std)

# Transforming all X with the PCA class
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

X_train_pca.shape

(552, 4)

## SelectKBest Approach

In [109]:
feature_selector1 = SelectKBest(mutual_info_classif, k=5)
feature_selector1.fit(X, y)

X_1 = feature_selector1.transform(X)

X_1.shape


(690, 5)

In [101]:
# Splitting data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [70]:
feature_selector = SelectKBest(mutual_info_classif, k=5)
feature_selector.fit(X_train, y_train)

scores = feature_selector.scores_

X_train_adj = feature_selector.transform(X_train)
X_test_adj = feature_selector.transform(X_test)

## Classifier 

In [150]:
#Different classifiers to test out
# classifiers = [
#     KNeighborsClassifier(),
#     KNeighborsClassifier(10),
#     LinearSVC(max_iter = 5000),
#     LogisticRegression()
# ]
classifiers1 = [KNeighborsClassifier(n_neighbors=i) for i in range(1, 20, 2)]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=7)
]

In [112]:
from sklearn.model_selection import StratifiedShuffleSplit

num_folds = 5
sss = StratifiedShuffleSplit(n_splits = num_folds)
sgkf = StratifiedGroupKFold(n_splits= num_folds)

scores = ['accuracy', 'recall', 'precision', 'roc_auc']

cv_result = cross_validate(KNeighborsClassifier(5), X_1, y, scoring= scores, cv=sgkf, groups= patient_id)
cv_result

{'fit_time': array([0.00045896, 0.00031376, 0.00031185, 0.00032091, 0.00030708]),
 'score_time': array([0.00485015, 0.00454092, 0.00472212, 0.0045321 , 0.00462198]),
 'test_accuracy': array([0.64963504, 0.67153285, 0.64028777, 0.6884058 , 0.6618705 ]),
 'test_recall': array([0.51923077, 0.51923077, 0.55769231, 0.63461538, 0.51923077]),
 'test_precision': array([0.54      , 0.57446809, 0.51785714, 0.57894737, 0.55102041]),
 'test_roc_auc': array([0.67918552, 0.67647059, 0.69639699, 0.71265653, 0.7071176 ])}

In [115]:
def evaluate_classifiers(X_train, y_train, classifiers):
    scores = ['accuracy', 'recall', 'precision', 'roc_auc']


    num_folds = 5
    # cross_val = StratifiedShuffleSplit(n_splits = num_folds)
    cross_val = StratifiedGroupKFold(n_splits= num_folds)

    evaluation_results = {}

    for classifier in classifiers:
        cv_results = cross_validate(classifier, X_train, y_train, scoring=scores, cv=cross_val, groups = patient_id)

        if type(classifier).__name__ == "KNeighborsClassifier":
            classifier_name = type(classifier).__name__
            params_dict = classifier.get_params()
            n_neigbors = params_dict["n_neighbors"]
            classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        else:
            classifier_name = type(classifier).__name__

        evaluation_results[classifier_name] = {
            'Accuracy': cv_results['test_accuracy'].mean(),
            'Recall': cv_results['test_recall'].mean(),
            'Precision': cv_results['test_precision'].mean(),
            'ROC AUC': cv_results['test_roc_auc'].mean()

        }

    # for classifier in classifiers:
    #     cv_results = cross_validate(classifier, X_train, y_train, scoring=scores, cv=cross_val)
    #     classifier_name = type(classifier).__name__
        
    #     results[classifier_name] = {
    #         'Accuracy': cv_results['test_accuracy'].mean(),
    #         'Recall': cv_results['test_recall'].mean(),
    #         'Precision': cv_results['test_precision'].mean(),
    #         'ROC AUC': cv_results['test_roc_auc'].mean()
    #     }

    return evaluation_results

In [151]:
results = evaluate_classifiers(X_pca, y, classifiers1)

results

{'KNeighborsClassifier with n_neighbors: 1': {'Accuracy': 0.630472150366029,
  'Recall': 0.5269230769230769,
  'Precision': 0.5114509193680525,
  'ROC AUC': 0.6100653090796494},
 'KNeighborsClassifier with n_neighbors: 3': {'Accuracy': 0.6636067724684105,
  'Recall': 0.5,
  'Precision': 0.5600512820512821,
  'ROC AUC': 0.6739221975879366},
 'KNeighborsClassifier with n_neighbors: 5': {'Accuracy': 0.6594274437638085,
  'Recall': 0.4884615384615385,
  'Precision': 0.5560261995424851,
  'ROC AUC': 0.6897727517384105},
 'KNeighborsClassifier with n_neighbors: 7': {'Accuracy': 0.6795921815388059,
  'Recall': 0.5038461538461538,
  'Precision': 0.588857653973933,
  'ROC AUC': 0.7152835523204409},
 'KNeighborsClassifier with n_neighbors: 9': {'Accuracy': 0.6797494914255837,
  'Recall': 0.49230769230769234,
  'Precision': 0.5949642144213447,
  'ROC AUC': 0.7165520295466283},
 'KNeighborsClassifier with n_neighbors: 11': {'Accuracy': 0.6796335828829795,
  'Recall': 0.4653846153846154,
  'Precisi

In [146]:
for classifier, scores in results.items():
    print(classifier)
    for metric, score in scores.items():
        print(f'{metric}: {score:.4f}')
    print()

KNeighborsClassifier with n_neighbors: 1
Accuracy: 0.6231
Recall: 0.4962
Precision: 0.4992
ROC AUC: 0.5980

KNeighborsClassifier with n_neighbors: 3
Accuracy: 0.6563
Recall: 0.4808
Precision: 0.5516
ROC AUC: 0.6669

KNeighborsClassifier with n_neighbors: 5
Accuracy: 0.6594
Recall: 0.4923
Precision: 0.5546
ROC AUC: 0.6932

KNeighborsClassifier with n_neighbors: 7
Accuracy: 0.6696
Recall: 0.4808
Precision: 0.5769
ROC AUC: 0.7134

KNeighborsClassifier with n_neighbors: 9
Accuracy: 0.6768
Recall: 0.4846
Precision: 0.5884
ROC AUC: 0.7202

KNeighborsClassifier with n_neighbors: 11
Accuracy: 0.6884
Recall: 0.4769
Precision: 0.6143
ROC AUC: 0.7261

KNeighborsClassifier with n_neighbors: 13
Accuracy: 0.6739
Recall: 0.4500
Precision: 0.5948
ROC AUC: 0.7258



In [123]:
kn7 = KNeighborsClassifier(n_neighbors=7)
kn7.fit(X_1, y)
kn7.predict(X_1)
kn7.predict_proba(X_1)

array([[1.        , 0.        ],
       [0.42857143, 0.57142857],
       [0.42857143, 0.57142857],
       ...,
       [0.71428571, 0.28571429],
       [0.42857143, 0.57142857],
       [0.85714286, 0.14285714]])

In [308]:
classifiers_trained = [
    KNeighborsClassifier(n_neighbors=5).fit(X_train_adj, y_train),
    KNeighborsClassifier(n_neighbors=7).fit(X_train_adj, y_train),
    LinearSVC(max_iter = 5000).fit(X_train_adj, y_train),
    LogisticRegression().fit(X_train_adj, y_train)
]



for clf in classifiers_trained:
    if type(clf).__name__ == "KNeighborsClassifier":
        classifier_name = type(clf).__name__
        params_dict = clf.get_params()
        n_neigbors = params_dict["n_neighbors"]
        classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        print(classifier_name)


KNeighborsClassifier with n_neighbors: 5
KNeighborsClassifier with n_neighbors: 7


In [304]:
def evaluate_test_data(X_test, y_true, classifiers):

    results = {}
    for clf in classifiers:
        y_pred = clf.predict(X_test)

        if type(clf).__name__ == "KNeighborsClassifier":
            classifier_name = type(clf).__name__
            params_dict = clf.get_params()
            n_neigbors = params_dict["n_neighbors"]
            classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        else:
            classifier_name = type(clf).__name__
  
        
        results[classifier_name] = {
            'Accuracy': round(accuracy_score(y_true, y_pred), 3),
            'Recall': round(recall_score(y_true, y_pred), 3),
            'Precision': round(precision_score(y_true, y_pred), 3),
        }
    return results

In [305]:
results = evaluate_test_data(X_test_adj, y_test, classifiers_trained)

results

{'KNeighborsClassifier with n_neighbors: 5': {'Accuracy': 0.684,
  'Recall': 0.667,
  'Precision': 0.667},
 'KNeighborsClassifier with n_neighbors: 7': {'Accuracy': 0.684,
  'Recall': 0.667,
  'Precision': 0.667},
 'LinearSVC': {'Accuracy': 0.684, 'Recall': 0.444, 'Precision': 0.8},
 'LogisticRegression': {'Accuracy': 0.684, 'Recall': 0.444, 'Precision': 0.8}}

In [306]:
for classifier, scores in results.items():
    print(classifier)
    for metric, score in scores.items():
        print(f'{metric}: {score:.4f}')
    print()

KNeighborsClassifier with n_neighbors: 5
Accuracy: 0.6840
Recall: 0.6670
Precision: 0.6670

KNeighborsClassifier with n_neighbors: 7
Accuracy: 0.6840
Recall: 0.6670
Precision: 0.6670

LinearSVC
Accuracy: 0.6840
Recall: 0.4440
Precision: 0.8000

LogisticRegression
Accuracy: 0.6840
Recall: 0.4440
Precision: 0.8000

