# BIO-SELECT - Marigliano
## Features merging using several lists

_TODO_ : insert global pipeline image here + highlight this notebook on the picture

## Imports

In [None]:
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from utils.ConfusionMatrix import ConfusionMatrix

import itertools
from sklearn.metrics import confusion_matrix

import math

%matplotlib inline

# print numpy floats with 2 digits of precision
np.set_printoptions(precision=2)

# Seeding is currently disabled, so stochastic steps are not reproducible between runs.
# Uncomment to fix numpy's global random seed.
#np.random.seed(4)

# increase font size in matplotlib
import matplotlib
matplotlib.rcParams.update({'font.size': 11})

In [None]:
# Dataset selection. GROUP_NAME is given to CSVFeaturesImporter to pick the features lists,
# DATASET is the default dataset given to DatasetLoader.load_from_env_var.
# Switch both values together when changing dataset.

# Use Golub
GROUP_NAME = "golub_19122016"
DATASET = "Golub" # choose between "Golub" and "MILE"

# Use MILE
#GROUP_NAME = "mile_19122016"
#DATASET = "MILE"

## Load the features lists

TODO: load the features lists from CSV files

In [None]:
from utils.CSVFeaturesImporter import CSVFeaturesImporter

# Load the features lists stored for GROUP_NAME
importer = CSVFeaturesImporter(GROUP_NAME)
subsets = importer.load()
#print(subsets["features"].keys())
#print(subsets["features_by_score"]["ReliefF"][:5])


# `subsets` is read through its "features" and "features_by_score" keys in this notebook;
# presumably each maps an algorithm name to its features list -- TODO confirm in CSVFeaturesImporter
features = subsets["features"]

## Features subsets merging
Each algorithm has done its work and provides a subset of features as one of:
* a ranked score list
* a ranked list (no score)
* a list (no ranking, no score)

This part uses several techniques to combine/merge these lists into a better one.

_TODO_: 
* Visualize the lists
    * Venn diagram ? --> limited to 3 sets, does not scale
    * matrix: show the similarity of features between two subsets
        * Jaccard
        * Union
* implement merge techniques
    * voting
    * weighted voting
    * union of intersection
    * ...

### Subsets visualization

In [None]:
from utils.SimilarityMatrix import SimilarityMatrix

# some set similarity functions
def intersection_count(a, b):
    """Return the number of elements that the sets `a` and `b` have in common."""
    common = a.intersection(b)
    return len(common)

def jaccard(a, b):
    """Return the Jaccard similarity |a & b| / |a | b| of the sets `a` and `b`.

    The result is a float in [0.0, 1.0]. Two empty sets are considered identical
    and give 1.0 (instead of dividing by zero), so that comparing an empty subset
    with itself behaves like every other self-comparison.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        # both sets are empty: avoid ZeroDivisionError
        return 1.0
    return len(a.intersection(b)) / float(union_size)


# Draw the two similarity matrices side by side in a single figure
alg_names, features_subsets = zip(*features.items())

plt.figure(figsize=(14, 14))

# (subplot position, comparison function, title) of each panel
panels = [
    (1, jaccard, "Jaccard similarity between two feature subsets"),
    (2, intersection_count, "Intersection between two feature subsets"),
]

for position, compare_func, title in panels:
    plt.subplot(1, 2, position)
    sm = SimilarityMatrix(features_subsets, alg_names, compare_func=compare_func,
                          title=title)
    sm.show()


#### Dendrogram - visualizing the "distance" between the lists

In [None]:
# only keep the features indices, drop the features occurrences
def extract_lists(f_values):
    """Yield, for each list of 2-element entries in `f_values`, the list of first elements.

    A list whose entries cannot be unpacked into exactly two values (ValueError)
    is skipped, so fewer lists may be yielded than were given.
    """
    for fv in f_values:
        try:
            indices = [f_idx for f_idx, _ in fv]
        except ValueError:
            continue
        yield indices


# Build the names and the lists together: when a list is skipped its name must be
# dropped too, otherwise the labels no longer line up with the lists.
f_names, f_values = [], []
for f_name, f_value in subsets["features"].items():
    extracted = list(extract_lists([f_value]))
    if extracted:
        f_names.append(f_name)
        f_values.append(extracted[0])
    else:
        print("Skipping '%s': its entries are not 2-element tuples" % f_name)

f_names = tuple(f_names)

In [None]:
from utils.Dendrogram import Dendrogram

# names of the distance metrics handed to Dendrogram
metrics = ['rogerstanimoto', 'jaccard', 'dice', 'russellrao', 'yule']

# one figure (and one dendrogram) per metric
for metric_name in metrics:
    plt.figure()
    Dendrogram(lists=f_values, lists_labels=f_names, metric=metric_name).show()

We can see that the F Value and Fisher Score lists are identical (as the similarity matrix has already shown).

__For Golub only:__

All the features in CFS are also in MRMR (see the intersection in the similarity matrix). However, CFS only contains 9 features in total, so its feature mask is almost entirely False values. This is why its distance to the other lists (including MRMR) is high.

### Subsets merging

In [None]:
# ensure that when we merge the lists of features, the list remains composed of unique features
def assert_list_contains_only_unique_features(features):
    """Raise AssertionError if a feature appears more than once in `features`.

    The check is done with an explicit raise rather than an `assert` statement so
    that it still runs when Python is started with -O, and the error message names
    (up to 10 of) the duplicated features.

    :param features: iterable of hashable features
    :raises AssertionError: when at least one feature is duplicated
    """
    from collections import Counter

    duplicates = [feat for feat, count in Counter(features).items() if count > 1]
    if duplicates:
        raise AssertionError(
            "features list contains %d duplicated feature(s), e.g. %r"
            % (len(duplicates), duplicates[:10])
        )


In [None]:
# Maps a merging technique name to the features it selected.
# It is filled by the merging cells that follow; re-running this cell empties it.
merged_features_lists = {}

#### Union of all features

In [None]:
from merge.techniques.UnionSubsetMerger import UnionSubsetMerger

# Merge all the subsets with UnionSubsetMerger.
# NOTE: `features_subsets` is created in the similarity matrices cell, which must have been run first.
susm = UnionSubsetMerger(features_subsets)
merged_features = susm.merge()

merged_features_lists["Union of all features"] = merged_features

#### Keep top N features

In [None]:
def group_by_features(features):
    """Group the entries of `features` that share the same first element.

    The entries are sorted on their first element, then consecutive entries with
    an equal first element are gathered. Returns a list of lists of entries.
    """
    from itertools import groupby

    first_element = lambda entry: entry[0]

    ordered = sorted(features, key=first_element)
    return [list(members) for _, members in groupby(ordered, key=first_element)]

def mean_score_for_feature(a, n_algorithms):
    """Return (feature name, sum of its scores / n_algorithms).

    `a` holds (name, score) tuples of a single feature. The sum is divided by
    `n_algorithms`, not by the number of tuples, so an algorithm that did not
    list the feature counts as a score of 0.
    """
    names, scores = zip(*a)
    mean_score = sum(scores) / float(n_algorithms)
    # all the tuples carry the same name, the first one is enough
    return (names[0], mean_score)
    
def keep_top_n(features, n):
    """Return the `n` features with the highest mean score over all the algorithms.

    `features` maps each algorithm to its list of (feature, score) tuples. The
    first 8 raw entries and the first 8 ranked entries are printed.
    """
    n_algorithms = len(features)

    # flatten the lists of every algorithm into a single list of tuples
    all_feats = [entry for algorithm_feats in features.values() for entry in algorithm_feats]
    print(all_feats[:8])

    ranked = [mean_score_for_feature(group, n_algorithms)
              for group in group_by_features(all_feats)]
    # best mean score first
    ranked.sort(key=lambda name_and_mean: name_and_mean[1], reverse=True)
    print(ranked[:8])

    return [name_and_mean[0] for name_and_mean in ranked[:n]]
    
    
    
    
# keep the 100 best features according to their mean score
# NOTE(review): this reads subsets["features"] whereas the intersection-based merge reads
# subsets["features_by_score"] -- confirm that the second element of these tuples is a score
merged_features = keep_top_n(subsets["features"], n=100)
merged_features_lists["Keep Top N features"] = merged_features

# sanity check: a feature must not appear twice in the merged list
assert_list_contains_only_unique_features(merged_features)

#### Union of intersection (two by two)

In [None]:
def union_of_intersection_two_by_two(features, min_length=500):
    """Return the features shared by all the sufficiently long features lists.

    The lists are sorted by decreasing length, reduced to their features (first
    element of each entry), filtered on their length, then intersected one after
    the other. Note that, despite the function name, the result is the
    intersection of all the retained lists. The length of every list and of every
    intermediate intersection is printed.

    :param features: dict mapping an algorithm to a list of entries whose first
        element is the feature
    :param min_length: only the lists containing strictly more than `min_length`
        features are retained (default 500, the previously hard-coded value)
    :return: list of the common features; an empty list when no list is retained
        (this case used to raise a TypeError); the single retained list, unchanged,
        when only one list is long enough
    """
    # reduce is not a builtin anymore in Python 3, functools has it in both Python 2 and 3
    from functools import reduce

    sort_by_len_features = sorted(features.values(), key=len, reverse=True)
    print([len(f) for f in sort_by_len_features])

    def inter(x, y):
        intersection = list(set(x).intersection(y))
        print("Intersection length : %d" % len(intersection))
        return intersection

    lists_of_features = [[a[0] for a in f] for f in sort_by_len_features]

    # keep the lists that contain more than min_length features
    lists_of_features = [l for l in lists_of_features if len(l) > min_length]

    if not lists_of_features:
        # nothing to intersect: reduce() would fail on an empty sequence
        return []

    return reduce(inter, lists_of_features)

    

    
# intersect the lists of features found in subsets["features_by_score"]
merged_features = union_of_intersection_two_by_two(subsets["features_by_score"])
merged_features_lists["Union of intersections"] = merged_features

# sanity check: a feature must not appear twice in the merged list
assert_list_contains_only_unique_features(merged_features)

## Evaluation of the merged subset
Once we have a merged list containing the best features, we would like to evaluate it with several classifiers

_TODO_: use a separate test set ? -> split again train/test set -> no changes in the Dataset class

### Dataset loading
_TODO_: 
* this notebook must only load one dataset
* retrieve dataset to load from cmd arguments or from env variable

In [None]:
from datasets.EGEOD22619.EGEOD22619Dataset import EGEOD22619Dataset
from datasets.MILE.MileDataset import MileDataset
from datasets.Golub99.GolubDataset import GolubDataset

from datasets.DatasetEncoder import DatasetEncoder
from datasets.DatasetSplitter import DatasetSplitter
from datasets.DatasetLoader import DatasetLoader
from datasets.DatasetBalancer import DatasetBalancer

# Load dataset from environment variable. This is used by automated scripts.
# DATASET is passed as the default dataset.
ds_class = DatasetLoader.load_from_env_var(default_dataset=DATASET)

print("Dataset used: %s" % ds_class.__name__)

# instantiate the selected dataset class
ds = ds_class()

### Dataset transformation
The dataset needs some transformations such as encoding the outputs as float (necessary for scikit learn), normalization, ...

_TODO_:
* dataset splitting (train, test[, validation])
* encode outputs
* normalization
* classes merging
    * because the classes are poorly balanced, we might want to regroup them. Example: Healthy vs Non-Healthy (choose the most represented class?)

In [None]:
# encode Dataset string classes into numbers
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()

# NOTE(review): the DatasetSplitter instance itself is bound to `ds` (no explicit split call);
# presumably it wraps the dataset and exposes the train/test getters used below -- confirm
ds = DatasetSplitter(ds, test_size=0.4)

ds_balancer = DatasetBalancer(ds)
ds = ds_balancer.balance()

# whole dataset
X = ds.get_X()
y = ds.get_y()

# train/test parts
X_train = ds.get_X_train()
y_train = ds.get_y_train()
X_test = ds.get_X_test()
y_test = ds.get_y_test()

# one integer per distinct value found in y
class_names = range(len(set(ds.get_y())))

# number of values in the first training sample
N_FEATURES = len(X_train[0])
print("Number of genes: %d" % N_FEATURES)
print("Dataset samples: %d" % len(y))
print("Train set size %d" % len(X_train))
print("Test set size %d" % len(X_test))

### Assess merged features

#### Merging techniques score

In [None]:
# list of (name, selected_features, score, std) tuples, one per assessed features list.
# The assessment cells append to it, so re-run this cell before re-running them
# to avoid duplicated entries.
assessed_lists = []

In [None]:
from merge.SubsetAssessor import SubsetAssessor

# position of the score in the (name, selected_features, score, std) tuples of assessed_lists
score_index = 2

# .items() exists on both Python 2 and Python 3 dicts, whereas .iteritems() is Python 2 only
for m_technique_name, m_selected_features in merged_features_lists.items():
    m_selected_features = list(m_selected_features)
    
    sa = SubsetAssessor(m_selected_features, ds, k=10)
    
    score, std = sa.score, sa.std
    print("[%s] median score: %.3f" % (m_technique_name, score))

    assessed_lists.append((m_technique_name, m_selected_features, score, std))

#### Compare the merged techniques against k random features and against all the features

Compare against random lists

In [None]:
import random

# (features, score, std) of each random draw. The features are stored with their score so
# that the list reported at the end is the one that actually produced the reported score
# (previously the features of the last draw were reported with the median score).
random_runs = []
N = 5 # number of random lists to assess
k = 100 # length of the random lists
for _ in range(N):
    random_features = random.sample(range(N_FEATURES), k)
    # NOTE(review): k=5 is used here while the other assessments use k=10 -- confirm this is intended
    sa = SubsetAssessor(random_features, ds, k=5)
    random_runs.append((random_features, sa.score, sa.std))

# (score, std) pairs, kept under their original name
score_std = [(run[1], run[2]) for run in random_runs]

# Take the run having the median score. Warning: when N is even this is the upper of the
# two middle runs, not the mean of both as the real median would be.
random_features, score, std = sorted(random_runs, key=lambda run: run[1])[len(random_runs)//2]
print("Random features scores: %.3f" % score)

assessed_lists.append(("%d random features" % k, random_features, score, std))

Compare using all the features

In [None]:
# baseline: assess the whole set of features, without any selection
all_features = range(N_FEATURES)
sa = SubsetAssessor(all_features, ds, k=10)
score = sa.score
std = sa.std

print("Using all features scores: %.3f" % score)

all_features_entry = ("All features", all_features, score, std)
assessed_lists.append(all_features_entry)

### Plot a bar chart with the median score of each merging method

In [None]:
def show_barchart_merging_methods(labels, scores, stds):
    """Show a bar chart with one bar per label.

    :param labels: tick label of each bar
    :param scores: height of each bar, also written above it with 3 decimals
    :param stds: length of the error bar of each bar
    """
    positions = np.arange(len(labels))

    fig = plt.figure(figsize=(11,4))
    ax = fig.add_subplot(111)

    ax.bar(positions, scores, align='center', yerr=stds,
           alpha=0.8, width=0.3, color="turquoise", edgecolor="turquoise", ecolor="salmon")

    ax.set_xticks(positions)
    ax.set_xticklabels(labels)

    # write the value of each bar just above it
    for bar_index, bar_score in enumerate(scores):
        ax.text(bar_index, bar_score, " %.3f" % bar_score, ha='left', va='bottom')

    ax.set_ylabel('Score')
    ax.set_ylim(0.0, 1.1)
    ax.set_title('Median score between several merging methods')
    ax.yaxis.grid(True)
    fig.tight_layout()

    plt.show()

# rank the assessed lists from the best score to the worst one
assessed_lists = sorted(assessed_lists, key=lambda x:x[score_index], reverse=True)
# split the (name, selected_features, score, std) tuples into four parallel sequences
names, selected_features, scores, stds = zip(*assessed_lists)

# tick label: name of the list followed by its number of features
labels = ["%s\n(#%d)" % (name, len(feats)) for name, feats in zip(names, selected_features)]
show_barchart_merging_methods(labels, scores, stds)

# raw values used for the error bars
print(stds)