# Melanoma Classification Model - Support Vector Machine

## Set up

#### Set up for importing utilities

In [3]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

 #### Random State
 Please use the following random state for all methods that may take a random state in order to achieve reproducable results

In [4]:
from utilities import random_state
random_state = random_state() 

#### Export Config
You may configure 
- whether the results shall be exported (export_results)
- where the results will be exported to

The default folder "export" will not be tracked by git in order to avoid flooding the repository with w.i.p. results. If you want to save a result on github, please rename it including the type of model and the date and move the picture to another folder :)

In [5]:
export_results = True
export_folder = 'export'

date_format = "%d%m%Y%H%M%S" # timestamp format in exported files
if export_results:
    import datetime
    import os
    if not os.path.exists(export_folder): 
      os.makedirs(export_folder)
      print("Created new directory %s" %export_folder)

## Get data

#### Get image paths
You may adjust the number of images loaded in order to run models more quickly on your private computer. Note hoewever that the less images you use the worse the predictions will likely be. 

For developing models on the cluster the max_images parameter should be removed. Instead call the method get_all_img_paths(img_folder).

In [20]:
from utilities import get_img_paths

img_folder_train = "data/train" # change this to the folder including your images!
img_folder_test = "data/test"
max_images_train = 800
max_images_test = 100

img_paths_train = get_img_paths(img_folder_train, max_images_train) 
img_paths_test = get_img_paths(img_folder_test, max_images_test)

#### Load data
Loads the images specified in img_paths into a data frame. This includes resizing the images and flattening them into an array and may take a while.

In [21]:
from utilities import load_data

groundtruth_file_train = "data/ISIC_2020_2019_train.csv" # change this to the path where you have your data!
groundtruth_file_test = "data/ISIC_2020_2019_test.csv"

df_train = load_data(img_paths_train, groundtruth_file_train)
df_test = load_data(img_paths_test, groundtruth_file_test)

#### Split into target and predictors

In [22]:
from utilities import split_predictors_target

X_train, y_train = split_predictors_target(df_train) 
X_test, y_test = split_predictors_target(df_test) 

## Feature extraction, grid search and training the model
Function performing feature extraction, grid search, training and testing the model

In [23]:
import sklearn
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from utilities import display_results
from utilities import display_interesting_results


def feature_selection_prediction(method,x_train, y_train,x_test, y_test):
    x_train_reduced = method.fit(x_train, y_train).transform(x_train)
    param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','poly']} 
    model = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 0)
    model.fit(x_train_reduced,y_train)
    print(f'GridSearch best params after tuning={ model.best_params_}')
    print(f'GridSearch model after tuning={model.best_estimator_}')  
    x_test_reduced = method.fit(x_test, y_test).transform(x_test)
    y_pred = model.predict(x_test_reduced)
    
    ## print classification result
    report = classification_report(y_test, y_pred, digits=4)
    print(f'\nClassification_report=\n{report}\n')
    if export_results:
        file = open(export_folder + "/classification_report_"+datetime.datetime.now().strftime(date_format)+".txt", 'w')
        file.write(report)
        file.close()

Testing and training of four different feature extraction methods.

In [24]:
print("Principle Component Analysis\n")
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
feature_selection_prediction(pca, X_train, y_train,X_test, y_test)

print("Independent Component Analysis\n")
from sklearn.decomposition import FastICA
ica = FastICA(n_components=20,max_iter = 500)
feature_selection_prediction(ica, X_train, y_train,X_test, y_test)

print("Linear Discriminant Analysis\n")
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=1)
feature_selection_prediction(lda, X_train, y_train,X_test, y_test)

print("Locally Linear Embedding")
from sklearn.manifold import LocallyLinearEmbedding
embedding = LocallyLinearEmbedding(n_components=2)
feature_selection_prediction(embedding, X_train, y_train,X_test, y_test)

Principle Component Analysis

GridSearch best params after tuning={'C': 0.1, 'gamma': 0.001, 'kernel': 'poly'}
GridSearch model after tuning=SVC(C=0.1, gamma=0.001, kernel='poly')

Classification_report=
              precision    recall  f1-score   support

           0     0.8061    1.0000    0.8927        79
           1     1.0000    0.0952    0.1739        21

    accuracy                         0.8100       100
   macro avg     0.9031    0.5476    0.5333       100
weighted avg     0.8468    0.8100    0.7417       100


Independent Component Analysis

GridSearch best params after tuning={'C': 100, 'gamma': 1, 'kernel': 'rbf'}
GridSearch model after tuning=SVC(C=100, gamma=1)

Classification_report=
              precision    recall  f1-score   support

           0     0.7907    0.8608    0.8242        79
           1     0.2143    0.1429    0.1714        21

    accuracy                         0.7100       100
   macro avg     0.5025    0.5018    0.4978       100
weighted avg  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
