# Melanoma Classification Model - Support Vector Machine

## Set up

#### Set up for importing utilities

In [None]:
import os
import sys

# Put the parent directory on the import path (only once), so that modules
# located there can be imported by the cells below.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

 #### Random State
 Please use the following random state for all methods that accept a random state, in order to achieve reproducible results.

In [None]:
# Fetch the project-wide random state once; the factory is imported under an
# alias so that `random_state` only ever refers to the resulting value.
from utilities import random_state as get_random_state

random_state = get_random_state()

#### Export Config
You may configure 
- whether the results shall be exported (export_results)
- where the results will be exported to

The default folder "export" will not be tracked by git, in order to avoid flooding the repository with work-in-progress results. If you want to keep a result on GitHub, please rename it so that it includes the type of model and the date, and move the picture to another folder :)

In [None]:
import datetime
import os

# Export switches for this notebook.
export_results = True  # write plots and reports to `export_folder`
on_cluster = False  # in the visible cells only referenced by disabled model-saving code
export_folder = 'results/SVM_Model'

date_format = "%d%m%Y%H%M%S"  # timestamp format in exported files

if export_results:
    # Try to create the folder and treat "already exists" as success. This
    # avoids the check-then-create race of os.path.exists() + os.makedirs()
    # while still reporting only when a new directory was actually created.
    try:
        os.makedirs(export_folder)
    except FileExistsError:
        pass
    else:
        print("Created new directory %s" % export_folder)

## Get data

#### Get image paths
You may adjust the number of images loaded in order to run models more quickly on your private computer. Note, however, that the fewer images you use, the worse the predictions will likely be.

For developing models on the cluster the max_images parameter should be removed. Instead call the method get_all_img_paths(img_folder).

In [None]:
# When True, the "_downsampled" variants of the image folders and of the
# ground-truth CSV files are used by the cells below.
downsampled_data = True
# Root directory of the data set; the train/test folders and the ground-truth
# CSV files are resolved relative to it.
base_path = "data/30" 
# NOTE(review): presumably the share of melanoma images in the training data
# (matching the "30" in base_path); not referenced in the visible cells - confirm.
current_train_melanoma_percentage = 0.3

In [None]:
from utilities import get_all_img_paths, get_img_paths

# The image folders carry a "_downsampled" suffix when the downsampled data
# set is selected.
folder_suffix = "_downsampled" if downsampled_data else ""
img_folder_train = f"{base_path}/train{folder_suffix}"
img_folder_test = f"{base_path}/test{folder_suffix}"

# Maximum number of images to load per set.
# NOTE(review): the trailing factor looks like a knob for loading only a
# fraction of the images on slower machines - confirm.
max_images_train = int(13653 * 1)
max_images_test = int(5804 * 1)

img_paths_train = get_img_paths(img_folder_train, max_images_train)
img_paths_test = get_img_paths(img_folder_test, max_images_test)

#### Load data
Loads the images specified in img_paths into a data frame. This includes resizing the images and flattening them into an array and may take a while.

In [None]:
from utilities import load_train_test

# Ground-truth CSV files; their names carry a "_downsampled" suffix when the
# downsampled data set is selected.
csv_suffix = ("_downsampled" if downsampled_data else "") + ".csv"
groundtruth_file_train = f"{base_path}/ISIC_2020_2019_train{csv_suffix}"
groundtruth_file_test = f"{base_path}/ISIC_2020_2019_test{csv_suffix}"

# Available loading strategies.
options = [
    # load the train set first, then the test set
    "sequential",
    # load train and test in parallel; data within each set is loaded sequentially
    "parallel_train_test",
    # load train first, then test; data within each set is loaded in parallel
    "sequential_train_test_parallel_chunks",
    # load train and test in parallel, and the data within each set in parallel too
    "parallel_fusion",
]

# Strategy used for this run.
option = "parallel_fusion"

df_train, df_test = load_train_test(
    img_paths_train,
    groundtruth_file_train,
    img_paths_test,
    groundtruth_file_test,
    option,
)


#### Split into target and predictors

In [None]:
from utilities import split_predictors_target

# Split each loaded data frame into its predictors (X_*) and its target (y_*)
# using the project helper; the train and test frames are handled separately.
X_train, y_train = split_predictors_target(df_train) 
X_test, y_test = split_predictors_target(df_test) 

## Feature extraction, grid search and training the model
Function performing feature extraction, grid search, training and testing the model

In [None]:
from utilities import display_results
from utilities import display_interesting_results


def _show_and_export(figure, file_prefix):
    """Save `figure` to the export folder (if exporting is enabled), then show it.

    The file is named `<file_prefix><timestamp>.png`, with the timestamp
    formatted according to `date_format`.
    """
    if export_results:
        stamp = datetime.datetime.now().strftime(date_format)
        figure.savefig(f"{export_folder}/{file_prefix}{stamp}.png")
    figure.show()


def display_model_results(X_test, y_pred, y_test):
    """Display (and optionally export) an overview of the predictions.

    Two plots are produced, in this order: one from `display_results`
    (called with a trailing argument of 15) and one from
    `display_interesting_results`, announced as the incorrect classifications.

    Parameters:
        X_test: test predictors handed to the plotting helpers.
        y_pred: predicted labels for `X_test`.
        y_test: true labels for `X_test`.
    """
    print("Classification results")
    overview = display_results(X_test, y_pred, y_test, 15)
    _show_and_export(overview, "classification_results_")

    print("Incorrect classification results")
    misclassified = display_interesting_results(X_test, y_pred, y_test)
    _show_and_export(misclassified, "incorrect_classification_results_")

In [None]:
import sklearn
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

def feature_selection_prediction(method, x_train, y_train, x_test, y_test, explained_variance=True, inverse_transform=True):
    """Reduce the features with `method`, tune an SVM by grid search and evaluate it.

    Steps:
      1. Fit `method` on the training data and transform it.
      2. Optionally print the explained variance preserved/lost by `method`.
      3. Grid-search an `svm.SVC` over C, gamma and kernel on the reduced
         training data (the best estimator is refit on it).
      4. Transform the test data with the fitted `method`, predict, and print
         the classification report (also written to a timestamped text file
         in `export_folder` when `export_results` is set).
      5. Optionally map the reduced test data back to the original feature
         space and display the results via `display_model_results`.

    Parameters:
        method: feature extraction object providing `fit_transform` and
            `transform`; additionally `explained_variance_ratio_` when
            `explained_variance` is True, and `inverse_transform` when
            `inverse_transform` is True.
        x_train, y_train: training predictors and target.
        x_test, y_test: test predictors and target.
        explained_variance: print the preserved/lost explained variance.
        inverse_transform: reconstruct the test data and display the results.

    Returns:
        None. Results are printed, plotted and optionally exported.
    """
    x_train_reduced = method.fit_transform(x_train, y_train)

    if explained_variance:
        # Sum once; the same value is used for both the preserved share and the loss.
        preserved = sum(method.explained_variance_ratio_)
        print(f'Method preserved information={preserved}')
        print(f'Method loss={1-preserved}')

    param_grid = {
        'C': [0.1, 1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf', 'poly'],
    }
    model = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=0)
    model.fit(x_train_reduced, y_train)
    print(f'GridSearch best params after tuning={model.best_params_}')
    print(f'GridSearch model after tuning={model.best_estimator_}')

    # Test the model: the test data must go through the already fitted
    # transformation (transform, not fit_transform).
    x_test_reduced = method.transform(x_test)
    y_pred = model.predict(x_test_reduced)

    # Print (and optionally export) the classification report.
    report = classification_report(y_test, y_pred, digits=4)
    print(f'\nClassification_report=\n{report}\n')
    if export_results:
        timestamp = datetime.datetime.now().strftime(date_format)
        report_path = f"{export_folder}/classification_report_{timestamp}.txt"
        # The context manager closes the file even if writing fails.
        with open(report_path, 'w', encoding="utf-8") as report_file:
            report_file.write(report)

    # TODO: persist the tuned model when running on the cluster. GridSearchCV
    # has no `save` method, so this needs a serializer such as pickle or joblib.

    if inverse_transform:
        # Map the reduced test data back to the original feature space so the
        # results can be displayed as images.
        reconstructed = method.inverse_transform(x_test_reduced)
        pixel_columns = [f'pixel{i + 1}' for i in range(reconstructed.shape[1])]
        reconstructed = pd.DataFrame(reconstructed, dtype="float", columns=pixel_columns)

        display_model_results(reconstructed, y_pred, y_test)

Testing and training of four different feature extraction methods.

In [None]:
from sklearn.decomposition import PCA

print("Principle Component Analysis\n")
# Pass the notebook-wide random state: PCA may use a randomized SVD solver,
# whose result is only reproducible with a fixed random state.
pca = PCA(n_components=20, random_state=random_state)
feature_selection_prediction(pca, X_train, y_train, X_test, y_test)