# BIO-SELECT - Marigliano
## Feature selection using several algorithms

_TODO_ : insert global pipeline image here + highlight this notebook on the picture

## Imports

In [None]:
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from utils.ConfusionMatrix import ConfusionMatrix

import itertools
from sklearn.metrics import confusion_matrix

import math

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# print NumPy floats with a precision of 2 digits
np.set_printoptions(precision=2)

# set the random seed for reproducibility
# NOTE: the seeding below is currently commented out, so NumPy's global RNG is not seeded here
#np.random.seed(0)

## Constants

In [None]:
# Passed as the "n" argument of each algorithm and used to truncate the aggregated feature lists
N_FEATURES_ALGORITHM = 1000
# Base name of the pickled dataset file ("<GROUP_NAME>.pkl"); also given to the CSV features exporter
GROUP_NAME = "MILE_16012017"

## Dataset loading
_TODO_: 
* this notebook must only load one dataset
* retrieve dataset to load from cmd arguments or from env variable

In [None]:
from datasets.EGEOD22619.EGEOD22619Dataset import EGEOD22619Dataset
from datasets.MILE.MileDataset import MileDataset
from datasets.Golub99.GolubDataset import GolubDataset

from datasets.DatasetEncoder import DatasetEncoder
from datasets.DatasetSplitter import DatasetSplitter
from datasets.DatasetLoader import DatasetLoader
from datasets.DatasetBalancer import DatasetBalancer

# Load dataset from environment variable. This is used by automated scripts
# "MILE" is the default passed to the loader (presumably used when the variable is unset — confirm in DatasetLoader)
ds_class = DatasetLoader.load_from_env_var(default_dataset="MILE")

print("Dataset used: %s" % ds_class.__name__)

# the loader returns a class, not an instance: instantiate the dataset here
ds = ds_class()

## Dataset transformation
The dataset needs some transformations, such as encoding the outputs as floats (required by scikit-learn), normalization, ...

_TODO_:
* dataset splitting (train, test[, validation])
* encode outputs
* normalization
* classes merging
    * because the classes are poorly balanced, we might want to regroup them. Example: Healthy vs. Non-Healthy (choose the most represented class?)

In [None]:
# encode Dataset string classes into numbers
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# wrap the encoded dataset in a splitter; from here on `ds` is the DatasetSplitter object
# (test_size=0.4 presumably keeps 40% of the samples for the test set — confirm in DatasetSplitter)
ds = DatasetSplitter(ds, test_size=0.4)

# balance the dataset; `ds` is rebound to whatever balance() returns
ds_balancer = DatasetBalancer(ds)
ds = ds_balancer.balance()

# full dataset
X = ds.get_X()
y = ds.get_y()

# train / test parts
X_train = ds.get_X_train()
y_train = ds.get_y_train()
X_test = ds.get_X_test()
y_test = ds.get_y_test()

# class labels used for the confusion matrix axes: 0 .. (number of distinct classes - 1)
class_names = range(len(set(ds.get_y())))

# the length of the first training sample is reported as the number of genes
print("Number of genes: %d" % len(X_train[0]))
print("Dataset samples: %d" % len(y))
print("Train set size %d" % len(X_train))
print("Test set size %d" % len(X_test))

Save the dataset split using Pickle

In [None]:
import pickle

# Serialize the dataset object (with its split) to "<GROUP_NAME>.pkl".
# The context manager guarantees that the file is flushed and closed,
# even if pickling raises; a bare open() would leave the handle dangling.
with open("%s.pkl" % GROUP_NAME, "wb") as pkl_file:
    pickle.dump(ds, pkl_file)

## Algorithms
Run the chosen algorithms and save them, along with their output feature subsets, to files using cPickle. These files can be used later to display graphs and for further analysis.

_TODO_: Write a subsection for each algorithm :
* OneVsRest or OneVsOne ?
    * only for those that need it
* Grid search + CV
    * maybe not for all algorithms such as SVM RFE which takes a lot of time
    * not for algorithms that have no parameters to tune (ReliefF, Fisher Score,...)
* print classification report (accuracy, recall, precision, ...)
    * issue: not all algorithms are able to do this
* normalize score using minmax normalization (0-1)
* show score per features (50 to 100 first ones)
* save algorithm in a file

Algorithms:
* ExtraTrees
* Random Forest
* SVM
* SVM RFE
* ANN
* ReliefF
* Fisher Score
* "Best features subset ~ SVM"
* SVM Backward ?
* CFS - Correlation-based Feature Selection
* Mutual Information Classifier
* One genetic based algorithm

In [None]:
from algorithms.Algorithm import NotSupportedException
from algorithms.ExtraTreesAlgorithm import ExtraTreesAlgorithm
from algorithms.ReliefFAlgorithm import ReliefFAlgorithm
from algorithms.FisherScoreAlgorithm import FisherScoreAlgorithm
from algorithms.FValueAlgorithm import FValueAlgorithm
from algorithms.SVMAlgorithm import SVMAlgorithm
from algorithms.GAANNAlgorithm import GAANNAlgorithm
from algorithms.GridSearchableAlgorithm import GridSearchableAlgorithm
from algorithms.SVMRFEAlgorithm import SVMRFEAlgorithm
from algorithms.CFSAlgorithm import CFSAlgorithm
from algorithms.MRMRAlgorithm import MRMRAlgorithm
from algorithms.DeterministAlgorithm import DeterministAlgorithm

from utils.AlgorithmListsUtils import *

# Every algorithm is described by a (class, constructor kwargs) tuple.
# Nothing is instantiated here: the run loop below builds each algorithm on demand,
# so that an algorithm stays in memory for as short a time as possible.


# ExtraTrees, with its parameter grid handed over as "gridsearch_params"
eta_grid = [dict(
    n_estimators=np.arange(10, 1000, 300),
    criterion=["gini", "entropy"],
    max_features=["sqrt", "auto", "log2", 0.5, 1.0],
    n_jobs=[2],
)]
eta = (ExtraTreesAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM, gridsearch_params=eta_grid))


# ReliefF
rff = (ReliefFAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# Fisher score
fsa = (FisherScoreAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# F-Value
fva = (FValueAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# SVM
#FIXME: grid search for SVM always returns the first set of parameters, like all params give the same performance
# The grid below is only referenced by the disabled grid-search run that follows it.
svm_grid_params = [dict(
    kernel=['linear'],
    C=[200, 0.1, 1, 10, 100, 1000],
    gamma=[1e-2, 1e-3, 1e-4, 1e-5],
    tol=[1e-2, 1e-3, 1e-4, 1e-5],
    cache_size=[1024],
    n_jobs=[2],
)]
#%time svm_gs = SVMAlgorithm(ds, N_FEATURES_ALGORITHM, svm_grid_params)
#algorithms.append(svm_gs)
#print("Best params \n\t%s" % svm_gs.best_params)

# SVM is run without a parameter grid
svm = (SVMAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# GA ANN, commented because it does not give meaningful features
#%time gaanna = GAANNAlgorithm(ds, N_FEATURES_ALGORITHM)
#algorithms.append(gaanna)


## SVM Forward, takes too long, was replaced by SVM-RFE
#from algorithms.SVMForwardAlgorithm import SVMForwardAlgorithm
#svm_forward = (SVMForwardAlgorithm, {
#        "dataset": ds,
#        "n" : N_FEATURES_ALGORITHM
#    })
#algorithms.append(svm_forward)


# SVM-RFE
svm_rfe = (SVMRFEAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# CFS: configured but deliberately left out of the list of algorithms to run
cfs = (CFSAlgorithm, dict(
    dataset=ds,
    n=None,  # CFS gives its list
))


# MRMR
mrmr = (MRMRAlgorithm, dict(dataset=ds, n=N_FEATURES_ALGORITHM))


# Algorithms that will actually be run, in execution order
algorithms = [eta, rff, fsa, fva, svm, svm_rfe, mrmr]


# Selected features of every algorithm, keyed by algorithm name (filled by the run loop)
subsets = {}

plt.figure(figsize=(12, 8))

# The number of subplots is the number of algorithms able to provide a confusion matrix,
# i.e. the grid-searchable ones. `algorithms` holds (class, kwargs) tuples and not instances,
# so the class stored in the first slot is tested with issubclass(); an isinstance() test
# on the tuple itself is always False and would leave the grid with a single row.
n_subplots = sum(1 for entry in algorithms if issubclass(entry[0], GridSearchableAlgorithm))
cols = 3
# float division, so that ceil() really rounds up even where "/" is an integer division (Python 2)
rows = max(1, int(math.ceil(n_subplots / float(cols))))
# 1-based index of the next subplot to draw
i = 1

# Run every configured algorithm and store its selected features in `subsets`, keyed by the
# algorithm name. Each entry holds three lists: "features", "features_by_rank", "features_by_score".
for alg in algorithms:
    # each entry is a (class, constructor kwargs) tuple
    alg_class = alg[0]
    alg_kwargs = alg[1]
    alg_name = None
    
    print("Running %s..." % alg_class.__name__)
    
    if issubclass(alg_class, DeterministAlgorithm):
        # This algorithm is determinist, so it will be run once
        # (%time is an IPython magic: the statement is timed and alg_instance is still bound)
        %time alg_instance = alg_class(**alg_kwargs)
        alg_name = alg_instance.name
        subsets[alg_name] = {"features": [], "features_by_rank": [], "features_by_score": []}
        
        feats = alg_instance.get_best_features()
        feats = [(f, 1) for f in feats] # assign the same weight for all features
        subsets[alg_name]["features"] = feats
        
        try:
            r = alg_instance.get_best_features_by_rank()
            # reverse the rank to have the best features with a higher score appear first
            # (position k in the ranking is turned into the score 1 / (1 + k))
            rank_tuples = [(v, 1.0/(1.0+k)) for k, v in enumerate(r)]
            subsets[alg_name]["features_by_rank"] = rank_tuples

            subsets[alg_name]["features_by_score"] = alg_instance.get_best_features_by_score()
        except NotSupportedException:
            # the algorithm cannot provide ranks and/or scores: the matching lists stay as they are
            pass

    else:
        # The algorithm is not determinist, so we run it multiple times and take a kind of "average" of the lists.
        # This will increase the stability of the returned lists.
        # The three accumulators below are flat: the results of all runs are concatenated into them.
        list_of_lists = []
        list_of_ranks = []
        list_of_scores = []

        for _ in range(3):
            # instanciate and run the algorithm
            %time alg_instance = alg_class(**alg_kwargs)


            # retrieve features for this run
            feats = alg_instance.get_best_features()
            list_of_lists.extend(feats)


            try:
                r = alg_instance.get_best_features_by_rank()
                # (feature, position) pairs: here the raw position is kept, lower is better
                rank_tuples = [(v, k) for k, v in enumerate(r)]
                list_of_ranks.extend(rank_tuples)
                list_of_scores.extend(alg_instance.get_best_features_by_score())
            except NotSupportedException:
                pass


        # alg_instance is the instance created by the last of the runs above
        alg_name = alg_instance.name
        subsets[alg_name] = {"features": [], "features_by_rank": [], "features_by_score": []}

        # Compute the "average" of the returned lists
        # (compute_most_popular_features / compute_score_of_lists are not defined in this notebook;
        #  presumably they come from the utils.AlgorithmListsUtils star import — confirm)
        feats = compute_most_popular_features(list_of_lists)[:N_FEATURES_ALGORITHM]
        subsets[alg_name]["features"].extend(feats)

        if len(list_of_ranks) > 0:
            feats_ranks = compute_score_of_lists(list_of_ranks, higher_is_better=False)[:N_FEATURES_ALGORITHM]
            subsets[alg_name]["features_by_rank"].extend(feats_ranks)

        if len(list_of_scores) > 0:
            feats_scores = compute_score_of_lists(list_of_scores, higher_is_better=True)[:N_FEATURES_ALGORITHM]
            subsets[alg_name]["features_by_score"].extend(feats_scores)
    
    
    
    # print the score of the algorithm, if provided by the latter
    # (for a non-determinist algorithm this is the score of the last run only)
    try:
        print("[%s] score: %.3f" % (alg_name, alg_instance.get_score()))
    except NotSupportedException:
        pass
    
    # show the confusion matrix, if supported by the latter
    # NOTE(review): presumably an algorithm without get_confusion_matrix raises AttributeError here;
    # since the plotting calls are inside the same try, an AttributeError raised while plotting
    # is silently ignored as well.
    try:
        cm = alg_instance.get_confusion_matrix()
        plt.subplot(rows, cols, i)
        ConfusionMatrix.plot(cm, class_names, title="Confusion matrix [%s]" % alg_name)
        # the subplot index only advances when a matrix was actually drawn
        i += 1
    except AttributeError:
        pass
    
    print("") # for readability
    
    # the algorithm is freed at the end of the loop, but the lists are kept


In [None]:
#TODO: run limma Rscript in bash, then read/parse the csv and add the features to `algorithms` object
#TODO: convert feature names -> id

## Save the features lists

In [None]:
from utils.CSVFeaturesExporter import CSVFeaturesExporter

# Hand the collected feature lists of all algorithms (`subsets`) to the CSV exporter,
# tagged with the group name, and run the export
features_exporter = CSVFeaturesExporter(subsets, group_name=GROUP_NAME)
features_exporter.export()

In [None]:
# IPython shell escape: list the CSV files found in outputs/ (-c: ordered by status change time)
!ls -c outputs/*.csv