# BIO-SELECT - Marigliano
## Global pipeline

_TODO_ : insert global pipeline image here

## Imports

In [None]:
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import itertools
from sklearn.metrics import confusion_matrix

%matplotlib inline

# Seed NumPy's global random generator so repeated runs give the same results.
np.random.seed(0)

# Show NumPy floats with two digits of precision when printing.
np.set_printoptions(precision=2)

## Constants

In [None]:
# Value passed to every feature-selection algorithm constructor in this notebook;
# presumably the number of top features each algorithm keeps - TODO confirm.
N_FEATURES_ALGORITHM = 100

## Dataset loading
_TODO_: 
* this notebook must only load one dataset
* retrieve dataset to load from cmd arguments or from env variable

In [None]:
from datasets.EGEOD22619.EGEOD22619Dataset import EGEOD22619Dataset
from datasets.MILE.MileDataset import MileDataset
from datasets.Golub99.GolubDataset import GolubDataset

from datasets.DatasetEncoder import DatasetEncoder
from datasets.DatasetSplitter import DatasetSplitter
from datasets.DatasetLoader import DatasetLoader
from datasets.DatasetBalancer import DatasetBalancer

# Load dataset from environment variable. This is used by automated scripts
# NOTE(review): "MILE" is presumably the fallback used when the variable is
# not set - confirm against DatasetLoader.
ds_class = DatasetLoader.load_from_env_var(default_dataset="MILE")

print("Dataset used: %s" % ds_class.__name__)

# Instantiate the selected dataset class.
ds = ds_class()

## Dataset transformation
The dataset needs some transformations, such as encoding the outputs as floats (necessary for scikit-learn), normalization, etc.

_TODO_:
* dataset splitting (train, test[, validation])
* encode outputs
* normalization
* classes merging
    * because the classes are poorly balanced, we might want to regroup them. Example: Healthy vs Non-Healthy (choose the most represented class?)

In [None]:
# encode Dataset string classes into numbers
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# Balance the encoded dataset; the strategy is defined by DatasetBalancer
# (presumably class re-balancing - TODO confirm).
ds_balancer = DatasetBalancer(ds)
ds = ds_balancer.balance()

# From here on `ds` is the splitter object, which exposes the train/test accessors.
# NOTE(review): test_size=0.4 presumably reserves 40% of the samples for testing - confirm.
ds = DatasetSplitter(ds, test_size=0.4)

# Whole dataset (used below only to report the sample count).
X = ds.get_X()
y = ds.get_y()

# Train / test partitions.
X_train = ds.get_X_train()
y_train = ds.get_y_train()
X_test = ds.get_X_test()
y_test = ds.get_y_test()

# The length of the first training sample is reported as the number of genes.
print("Number of genes: %d" % len(X_train[0]))
print("Dataset samples: %d" % len(y))
print("Train set size %d" % len(X_train))
print("Test set size %d" % len(X_test))

## Algorithms
Run the chosen algorithms and save them, together with their output feature subsets, into files using cPickle. They can be used later to display graphs and for further analysis.

_TODO_: Write a subsection for each algorithm :
* OneVsRest or OneVsOne ?
    * only for those that need it
* Grid search + CV
    * maybe not for all algorithms such as SVM RFE which takes a lot of time
    * not for algorithms which do not have parameters to tune (ReliefF, Fisher Score, ...)
* print classification report (accuracy, recall, precision, ...)
    * issue: not all algorithms are able to do this
* normalize score using minmax normalization (0-1)
* show score per features (50 to 100 first ones)
* save algorithm in a file

Algorithms:
* ExtraTrees
* Random Forest
* SVM
* SVM RFE
* ANN
* ReliefF
* Fisher Score
* "Best features subset ~ SVM"
* SVM Backward ?
* CFS - Correlation-based Feature Selection
* Mutual Information Classifier
* One genetic based algorithm

In [None]:
from algorithms.Algorithm import NotSupportedException
from algorithms.ExtraTreesAlgorithm import ExtraTreesAlgorithm
from algorithms.ReliefFAlgorithm import ReliefFAlgorithm
from algorithms.FisherScoreAlgorithm import FisherScoreAlgorithm
from algorithms.FValueAlgorithm import FValueAlgorithm
from algorithms.SVMAlgorithm import SVMAlgorithm
from algorithms.GAANNAlgorithm import GAANNAlgorithm

# Every algorithm instantiated in this cell is collected here so that the
# reporting loop at the end of the cell can iterate over them.
algorithms = []

# Parameter grid handed to ExtraTreesAlgorithm as its third argument.
# NOTE(review): for scikit-learn's ExtraTreesClassifier, max_features="auto" is an
# alias of "sqrt" and was removed in scikit-learn 1.3. If the wrapper relies on that
# estimator, this grid entry is redundant on old versions and invalid on new ones - confirm.
eta_grid = [{'n_estimators': np.arange(10, 1000, 300), 'criterion': ["gini", "entropy"], 'max_features': ["sqrt", "auto", "log2"], "n_jobs": [-1]}]
%time eta = ExtraTreesAlgorithm(ds, N_FEATURES_ALGORITHM, eta_grid)
algorithms.append(eta)
print("ExtraTrees best params \n\t%s" % eta.best_params)


# The next three algorithms are built without a parameter grid.
rff = ReliefFAlgorithm(ds, N_FEATURES_ALGORITHM)
algorithms.append(rff)

fsa = FisherScoreAlgorithm(ds, N_FEATURES_ALGORITHM)
algorithms.append(fsa)

fva = FValueAlgorithm(ds, N_FEATURES_ALGORITHM)
algorithms.append(fva)


#FIXME: grid search does not seem to work (for SVM at least)
# This grid is currently unused: the grid-searched SVM below is commented out.
# NOTE(review): with scikit-learn's SVC, 'gamma' has no effect on a linear kernel,
# so those grid values would only multiply the search time - confirm before re-enabling.
gridsearch_params = [{
        'kernel':['linear'],
        'C':[200, 0.1, 1, 10, 100, 1000],
        'gamma' : [1e-2, 1e-3, 1e-4, 1e-5],
        'tol' : [1e-2, 1e-3, 1e-4, 1e-5],
        'cache_size':[1024]
    }]
#%time svm_gs = SVMAlgorithm(ds, N_FEATURES_ALGORITHM, gridsearch_params)
#algorithms.append(svm_gs)
#print("Best params \n\t%s" % svm_gs.best_params)

# SVM built without a parameter grid.
%time svm = SVMAlgorithm(ds, N_FEATURES_ALGORITHM)
algorithms.append(svm)

# The GA/ANN algorithm is currently disabled.
#%time gaanna = GAANNAlgorithm(ds, N_FEATURES_ALGORITHM)
#algorithms.append(gaanna)

# Parallel lists: subsets[k] is the feature subset chosen by the algorithm
# whose name is alg_names[k].
subsets, alg_names = [], []
for alg in algorithms:
    feats = alg.get_best_features()
    subsets.append(feats)
    alg_names.append(alg.name)

    # Algorithms that cannot report a score raise NotSupportedException;
    # those are simply left out of the printed summary.
    try:
        score_line = "[%s] score: %.3f" % (alg.name, alg.get_score())
    except NotSupportedException:
        continue
    print(score_line)


## Features subsets merging
Each algorithm has done its work and provides a subset of features as:
* a ranked score list
* a ranked list (no score)
* a list (no ranking, no score)

This part uses some techniques to combine/merge these lists into a better one.

_TODO_: 
* Visualize the lists
    * Venn diagram ? --> limited to 3 sets, does not scale
    * matrix: show the similarity of features between two subsets
        * Jaccard
        * Union
        * Cosine similarity
* implement merge techniques
    * voting
    * weighted voting
    * union of intersection
    * ...

### Subsets visualization

In [None]:
from scipy import spatial

# some set similarity functions
def intersection_count(a, b):
    """Return the number of elements of the set `a` that also appear in `b`."""
    shared = a.intersection(b)
    return len(shared)

def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of two sets.

    The result is a float in [0, 1]. Two empty sets are considered identical
    and yield 1.0 instead of raising ZeroDivisionError.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        # Both sets are empty: the ratio is undefined, use the usual convention.
        return 1.0
    return len(a.intersection(b)) / float(union_size)

def cosine_similarity(a, b):
    """Return the cosine similarity between two feature sets.

    Each set is treated as a binary indicator vector over all features, which
    gives |a & b| / sqrt(|a| * |b|) (the Ochiai coefficient). Computing the
    cosine between vectors of raw feature ids instead would depend on the
    arbitrary iteration order of the sets and on the magnitude of the ids,
    and would fail for sets of different sizes.

    Returns a float in [0, 1]; 0.0 when either set is empty.
    """
    if not a or not b:
        # An empty set maps to the zero vector, for which the cosine is undefined.
        return 0.0
    return len(a.intersection(b)) / float((len(a) * len(b)) ** 0.5)

def _feature_ids(subset):
    """Return the set of feature ids contained in one algorithm's subset.

    An entry is either a bare integer id (builtin or NumPy integer) or an
    indexable record whose first item is the id, e.g. an ``(id, score)`` pair.
    """
    return {
        entry if isinstance(entry, (int, np.integer)) else entry[0]
        for entry in subset
    }


def compute_similary_between_subsets(subsets, compare_func):
    """Build the pairwise similarity matrix of several feature subsets.

    Parameters
    ----------
    subsets : sequence
        One entry per algorithm. Each entry is an iterable of feature ids or
        of records whose first item is the feature id. The format is detected
        per entry, so subsets of different formats can be mixed.
    compare_func : callable
        Called as ``compare_func(set_i, set_j)``; must return a number.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(subsets), len(subsets))`` where cell ``[i, j]``
        holds ``compare_func`` applied to subsets ``i`` and ``j``.
    """
    # Convert each subset once instead of once per (i, j) pair.
    feature_sets = [_feature_ids(subset) for subset in subsets]
    n_subsets = len(feature_sets)
    similarity_matrix = np.zeros(shape=(n_subsets, n_subsets))

    # The full product is kept (rather than the upper triangle only) because
    # compare_func is not guaranteed to be symmetric.
    for i, j in itertools.product(range(n_subsets), repeat=2):
        similarity_matrix[i, j] = compare_func(feature_sets[i], feature_sets[j])

    return similarity_matrix

def plot_feature_subsets_matrix(cm, alg_names, title, cmap=plt.cm.Blues):
    """Draw a square similarity matrix as an annotated heat-map.

    The plot is drawn on the current matplotlib figure; nothing is returned.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D matrix of values; each cell is annotated with its value
        formatted with two decimals.
    alg_names : sequence of str
        Tick labels used on both axes.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used by ``imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    # The trailing newline adds a little margin under the title.
    plt.title(title + "\n")

    tick_marks = np.arange(len(alg_names))
    plt.xticks(tick_marks, alg_names, rotation=45)
    plt.yticks(tick_marks, alg_names)

    # Black text on a white box stays readable on any cell colour.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, "%.2f" % cm[i, j],
                 horizontalalignment="center",
                 backgroundcolor="white",
                 color="black")

    plt.tight_layout()


# Draw one annotated heat-map per similarity measure, each in its own figure.
for similarity_func, plot_title in (
    (jaccard, "Jaccard similarity between two feature subsets"),
    (intersection_count, "Intersection between two feature subsets"),
    (cosine_similarity, "Cosine similarity between two features subsets"),
):
    similarity_matrix = compute_similary_between_subsets(subsets, compare_func=similarity_func)
    plt.figure(figsize=(6, 8))
    plot_feature_subsets_matrix(similarity_matrix, alg_names, title=plot_title)

### Subsets merging

In [None]:
from merge.simple.SimpleUnionSubsetMerger import SimpleUnionSubsetMerger

# Merge the subsets produced by all algorithms into a single collection.
susm = SimpleUnionSubsetMerger(subsets)
merged_features = susm.merge()

# Report the merged size next to the number of features selected overall
# (N_FEATURES_ALGORITHM per subset).
print(
    "Unique features (union of all subsets): %d over a total of %d "
    % (len(merged_features), N_FEATURES_ALGORITHM * len(subsets))
)

## Evaluation of the merged subset
Once we have a merged list containing the best features, we would like to evaluate it with several classifiers

_TODO_: use a separate test set ? -> split again train/test set -> no changes in the Dataset class

In [None]:
# Sanity check: a set drops duplicates, so equal lengths mean that no feature
# appears twice in the merged collection.
is_list_unique = len(merged_features) == len(set(merged_features))
print("is list unique", is_list_unique)

# Convert to a list; it is used as a column index in the assessment below.
merged_features = list(merged_features)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier

def assess_merged_features(clf, clf_name, assessment_scores):
    """Evaluate a classifier restricted to the merged feature subset.

    The classifier is fitted on the training split and predicts the test
    split, both reduced to the columns listed in the module-level
    `merged_features`, so that the stored predictions (used for the confusion
    matrices) reflect the merged subset rather than the full feature set.
    The reported score is the mean of a 3-fold cross-validation run on the
    test split, reduced to the same columns.

    Parameters
    ----------
    clf : estimator
        Object providing ``fit`` and ``predict``.
    clf_name : str
        Key under which the result is stored and label used in the printout.
    assessment_scores : dict
        Updated in place: ``assessment_scores[clf_name]`` is set to
        ``(score, (y_test, y_pred))``.
    """
    X_train_selected = ds.get_X_train()[:, merged_features]
    X_test_selected = ds.get_X_test()[:, merged_features]
    y_test = ds.get_y_test()

    clf.fit(X_train_selected, ds.get_y_train())
    y_pred = clf.predict(X_test_selected)

    scores = cross_val_score(clf, X_test_selected, y_test, cv=3, n_jobs=-1)
    score = np.mean(scores)

    print("[%s] Score using the merged list of features: %.3f" % (clf_name, score))

    assessment_scores[clf_name] = score, (y_test, y_pred)

    
# Filled by assess_merged_features: one entry per classifier name.
assessment_scores = {}

# Classifiers used to judge the merged feature subset, assessed in this order.
classifiers = [
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("MLP", MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', early_stopping=False,
                          epsilon=1e-08, hidden_layer_sizes=(100,50), learning_rate='constant',
                          learning_rate_init=0.001, max_iter=1000, momentum=0.9,
                          nesterovs_momentum=True, power_t=0.5, shuffle=True,
                          solver='lbfgs', tol=0.0001, verbose=False,
                          warm_start=False)),
    ("ExtraTrees", ExtraTreesClassifier(n_jobs=-1, n_estimators=100)),
]

for clf_name, clf in classifiers:
    assess_merged_features(clf, clf_name, assessment_scores)

### Confusion Matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          normalize=False):
    """
    Plot a confusion matrix as an annotated heat-map on the current axes.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix (rows: true labels, columns: predicted labels).
    classes : sequence
        Tick labels used on both axes.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used by ``imshow``.
    normalize : bool
        When True, each row is divided by its sum so that cells show the
        fraction of the true class (formatted with two decimals). Rows
        without any sample are left at zero. Defaults to False, which plots
        the raw values.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Substitute 1 for empty rows to avoid dividing by zero.
        cm = cm.astype(float) / np.where(row_totals == 0, 1, row_totals)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(fraction=0.046, pad=0.04)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        label = "%.2f" % cm[i, j] if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
import math

# One tick label per distinct class found in the dataset labels.
class_names = list(range(len(set(ds.get_y()))))

plt.figure(figsize=(12, 8))

# Lay the confusion matrices out on a grid with a fixed number of columns.
n_subplots = len(assessment_scores)
cols = 3
# float() forces true division, so ceil() rounds up on Python 2 as well.
rows = int(math.ceil(n_subplots / float(cols)))

# items() works on both Python 2 and 3 (iteritems() only exists on Python 2).
for i, (name, score_cm) in enumerate(assessment_scores.items(), start=1):
    # Each entry is (score, (y_test, y_pred)); only the labels are needed here.
    y_test, y_pred = score_cm[1]
    cnf_matrix = confusion_matrix(y_test, y_pred)

    plt.subplot(rows, cols, i)

    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix for %s' % name)

plt.tight_layout()
plt.show()