### Imports

In [53]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.feature_selection import SelectKBest,mutual_info_classif

from sklearn.model_selection import StratifiedGroupKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression



## File handling 

In [16]:
# Extracting found features for each lesions with mask
file_features = "feature_data.csv"

# Naming of features
feature_names = ['assymmetry', 'red_var', 'green_var', 'blue_var', \
    'hue_var', 'sat_var', 'val_var', 'dom_hue', 'dom_sat', 'dom_val', \
    'compactness', 'convexity']

# Creating data frame for features with mask 
df_features = pd.read_csv(file_features)

# Loading in meta data for lesions with masks
file_data = 'metadata_withmasks.csv'
df = pd.read_csv(file_data)
df = df[df['mask'] == 1]

# Creating labels
labels = np.array(df['diagnostic'])

In [60]:
#Preparing data to be split
X = np.array(df_features[feature_names])
y =  (labels == 'BCC') | (labels == 'SCC') | (labels == 'MEL')   #now True means healthy nevus, False means something else
patient_id = df['patient_id']

(92, 12)

## PCA Approach 

In [33]:
# standardizing X data  
std_scl = StandardScaler()
X_std = std_scl.fit_transform(X)

In [179]:
#Splitting data into trainning and test data
X_train_std, X_test_std, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=1, stratify=y)

In [180]:
# Creating PCA class
pca = PCA(n_components=5)

# Fitting PCA class for the training set
pca.fit(X_train_std)

# Transforming all X with the PCA class
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

## SelectKBest Approach

In [168]:
# Splitting data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=1, stratify=y)

In [171]:
feature_selector = SelectKBest(mutual_info_classif, k=5)
feature_selector.fit(X_train, y_train)

X_train_adj = feature_selector.transform(X_train)
X_test_adj = feature_selector.transform(X_test)

## Classifier 

In [297]:
#Different classifiers to test out
# classifiers = [
#     KNeighborsClassifier(),
#     KNeighborsClassifier(10),
#     LinearSVC(max_iter = 5000),
#     LogisticRegression()
# ]
classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=7),
    LogisticRegression(),
    LinearSVC(max_iter = 5000)
]

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

num_folds = 5
sss = StratifiedShuffleSplit(n_splits = num_folds)
sgkf = StratifiedGroupKFold(n_splits= num_folds)


cv_result = cross_validate(KNeighborsClassifier(), X_train_pca, y_train, scoring="accuracy", cv=sgkf)
cv_result

In [300]:
def evaluate_classifiers(X_train, y_train, classifiers):
    scores = ['accuracy', 'recall', 'precision', 'roc_auc']


    num_folds = 5
    cross_val = StratifiedShuffleSplit(n_splits = num_folds)

    evaluation_results = {}

    for classifier in classifiers:
        cv_results = cross_validate(classifier, X_train, y_train, scoring=scores, cv=cross_val)

        if type(classifier).__name__ == "KNeighborsClassifier":
            classifier_name = type(classifier).__name__
            params_dict = classifier.get_params()
            n_neigbors = params_dict["n_neighbors"]
            classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        else:
            classifier_name = type(classifier).__name__

        evaluation_results[classifier_name] = {
            'Accuracy': cv_results['test_accuracy'].mean(),
            'Recall': cv_results['test_recall'].mean(),
            'Precision': cv_results['test_precision'].mean(),
            'ROC AUC': cv_results['test_roc_auc'].mean()

        }

    # for classifier in classifiers:
    #     cv_results = cross_validate(classifier, X_train, y_train, scoring=scores, cv=cross_val)
    #     classifier_name = type(classifier).__name__
        
    #     results[classifier_name] = {
    #         'Accuracy': cv_results['test_accuracy'].mean(),
    #         'Recall': cv_results['test_recall'].mean(),
    #         'Precision': cv_results['test_precision'].mean(),
    #         'ROC AUC': cv_results['test_roc_auc'].mean()
    #     }

    return evaluation_results

In [301]:
evaluation_results = evaluate_classifiers(X_train_pca, y_train, classifiers)

evaluation_results

{'KNeighborsClassifier with n_neighbors: 3': {'Accuracy': 0.375,
  'Recall': 0.3,
  'Precision': 0.32999999999999996,
  'ROC AUC': 0.31875},
 'KNeighborsClassifier with n_neighbors: 5': {'Accuracy': 0.375,
  'Recall': 0.25,
  'Precision': 0.2966666666666667,
  'ROC AUC': 0.375},
 'KNeighborsClassifier with n_neighbors: 7': {'Accuracy': 0.475,
  'Recall': 0.3,
  'Precision': 0.4333333333333333,
  'ROC AUC': 0.50625},
 'LogisticRegression': {'Accuracy': 0.675,
  'Recall': 0.55,
  'Precision': 0.7333333333333333,
  'ROC AUC': 0.6375},
 'LinearSVC': {'Accuracy': 0.675,
  'Recall': 0.5,
  'Precision': 0.7833333333333333,
  'ROC AUC': 0.6125}}

In [302]:
for classifier, scores in evaluation_results.items():
    print(classifier)
    for metric, score in scores.items():
        print(f'{metric}: {score:.4f}')
    print()

KNeighborsClassifier with n_neighbors: 3
Accuracy: 0.3750
Recall: 0.3000
Precision: 0.3300
ROC AUC: 0.3187

KNeighborsClassifier with n_neighbors: 5
Accuracy: 0.3750
Recall: 0.2500
Precision: 0.2967
ROC AUC: 0.3750

KNeighborsClassifier with n_neighbors: 7
Accuracy: 0.4750
Recall: 0.3000
Precision: 0.4333
ROC AUC: 0.5062

LogisticRegression
Accuracy: 0.6750
Recall: 0.5500
Precision: 0.7333
ROC AUC: 0.6375

LinearSVC
Accuracy: 0.6750
Recall: 0.5000
Precision: 0.7833
ROC AUC: 0.6125



In [308]:
classifiers_trained = [
    KNeighborsClassifier(n_neighbors=5).fit(X_train_adj, y_train),
    KNeighborsClassifier(n_neighbors=7).fit(X_train_adj, y_train),
    LinearSVC(max_iter = 5000).fit(X_train_adj, y_train),
    LogisticRegression().fit(X_train_adj, y_train)
]


for clf in classifiers_trained:
    if type(clf).__name__ == "KNeighborsClassifier":
        classifier_name = type(clf).__name__
        params_dict = clf.get_params()
        n_neigbors = params_dict["n_neighbors"]
        classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        print(classifier_name)


KNeighborsClassifier with n_neighbors: 5
KNeighborsClassifier with n_neighbors: 7


In [304]:
def evaluate_test_data(X_test, y_true, classifiers):

    results = {}
    for clf in classifiers:
        y_pred = clf.predict(X_test)

        if type(clf).__name__ == "KNeighborsClassifier":
            classifier_name = type(clf).__name__
            params_dict = clf.get_params()
            n_neigbors = params_dict["n_neighbors"]
            classifier_name = f"{classifier_name} with n_neighbors: {n_neigbors}"
        else:
            classifier_name = type(clf).__name__
  
        
        results[classifier_name] = {
            'Accuracy': round(accuracy_score(y_true, y_pred), 3),
            'Recall': round(recall_score(y_true, y_pred), 3),
            'Precision': round(precision_score(y_true, y_pred), 3),
        }
    return results

In [305]:
results = evaluate_test_data(X_test_adj, y_test, classifiers_trained)

results

{'KNeighborsClassifier with n_neighbors: 5': {'Accuracy': 0.684,
  'Recall': 0.667,
  'Precision': 0.667},
 'KNeighborsClassifier with n_neighbors: 7': {'Accuracy': 0.684,
  'Recall': 0.667,
  'Precision': 0.667},
 'LinearSVC': {'Accuracy': 0.684, 'Recall': 0.444, 'Precision': 0.8},
 'LogisticRegression': {'Accuracy': 0.684, 'Recall': 0.444, 'Precision': 0.8}}

In [306]:
for classifier, scores in results.items():
    print(classifier)
    for metric, score in scores.items():
        print(f'{metric}: {score:.4f}')
    print()

KNeighborsClassifier with n_neighbors: 5
Accuracy: 0.6840
Recall: 0.6670
Precision: 0.6670

KNeighborsClassifier with n_neighbors: 7
Accuracy: 0.6840
Recall: 0.6670
Precision: 0.6670

LinearSVC
Accuracy: 0.6840
Recall: 0.4440
Precision: 0.8000

LogisticRegression
Accuracy: 0.6840
Recall: 0.4440
Precision: 0.8000

