# BIO-SELECT - Marigliano
## Features merging using several lists

_TODO_ : insert global pipeline image here + highlight this notebook on the picture

## Imports

In [None]:
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from utils.ConfusionMatrix import ConfusionMatrix

import itertools
from sklearn.metrics import confusion_matrix

import math

%matplotlib inline

# Print NumPy arrays with 2 digits of floating-point precision.
np.set_printoptions(precision=2)

# Seeding is currently disabled, so results can change from one run to the next.
# NOTE(review): the random baseline later in this notebook draws with the
# stdlib `random` module, which np.random.seed() does not seed -- confirm
# which generator(s) must be seeded before re-enabling this line.
#np.random.seed(4)

# Use a larger default font size in every matplotlib figure.
import matplotlib
matplotlib.rcParams.update({'font.size': 13})

In [None]:
# Select which group of features lists to work on.
# GROUP_NAME is used below both to import the CSV features lists and to
# locate the pickled dataset ("<GROUP_NAME>.pkl").

# Active configuration: Golub
#GROUP_NAME = "golub_19122016"  # alternative Golub group, kept for reference
GROUP_NAME = "golub_16012017"
DATASET = "Golub" # choose between "Golub" and "MILE"

# Alternative configuration: MILE (uncomment both lines and comment out the Golub ones)
#GROUP_NAME = "MILE_21012017"
#DATASET = "MILE"

## Load the features lists

TODO: load the features lists from CSV files

In [None]:
from utils.CSVFeaturesImporter import CSVFeaturesImporter

# Load the features lists stored for GROUP_NAME.
# The cells below read subsets["features"], subsets["features_by_score"]
# and subsets["features_by_rank"].
importer = CSVFeaturesImporter(GROUP_NAME)
subsets = importer.load()
# Uncomment to inspect what was loaded:
#print(subsets["features"].keys())
#print(subsets["features_by_score"]["ReliefF"][:5])


## Features subsets merging
Each algorithm has done its work and provides a subset of features as one of:
* a ranked score list
* a ranked list (no score)
* a list (no ranking, no score)

This part uses several techniques to combine/merge these lists into a better one.

_TODO_: 
* Visualize the lists
    * Venn diagram ? --> limited to 3 sets, does not scale
    * matrix: show the similarity of features between two subsets
        * Jaccard
        * Union
* implement merge techniques
    * voting
    * weighted voting
    * union of intersection
    * ...

### Subsets visualization

In [None]:
from utils.SimilarityMatrix import SimilarityMatrix

# some set similarity functions
def intersection_count(a, b):
    """Return how many elements of the set `a` are also found in `b`."""
    shared = a.intersection(b)
    return len(shared)

def jaccard(a, b):
    """Return the Jaccard similarity of two sets: |a & b| / |a | b|.

    Parameters
    ----------
    a : set
        First set of features.
    b : set or iterable
        Second set of features.

    Returns
    -------
    float
        A value between 0.0 (nothing in common) and 1.0 (same elements).
        Two empty sets are treated as identical and give 1.0 instead of
        raising ZeroDivisionError.
    """
    union_size = len(a.union(b))
    if union_size == 0:
        # Both sets are empty: there is nothing to divide by, and two empty
        # sets contain exactly the same elements.
        return 1.0
    # float() keeps a true division on Python 2 as well
    return len(a.intersection(b)) / float(union_size)


# plot the similarity matrices
# Materialise the dict views as lists: on Python 3, .keys() and .values()
# return views that cannot be indexed. Both calls iterate the same dict, so
# the i-th name still belongs to the i-th subset.
alg_names = list(subsets["features"].keys())
features_subsets = list(subsets["features"].values())

plt.figure(figsize=(14, 14))

# One subplot per similarity measure, drawn side by side.
similarity_plots = [
    (jaccard, "Jaccard similarity between two feature subsets"),
    (intersection_count, "Intersection between two feature subsets"),
]

for subplot_idx, (sim_func, sim_title) in enumerate(similarity_plots, start=1):
    plt.subplot(1, len(similarity_plots), subplot_idx)
    sm = SimilarityMatrix(features_subsets, alg_names, compare_func=sim_func,
                          title=sim_title)
    sm.show()


#### Dendrogram - visualizing the "distance" between the lists

In [None]:
# Split the features dict into two parallel tuples: the list names and the list contents.
f_names, f_values = zip(*subsets["features"].items())

def extract_lists(f_values):
    """Yield, for each features list, the list of its feature indices.

    Every element of a features list is unpacked into two values and only the
    first one (the feature index) is kept. A features list whose elements
    raise ValueError when unpacked that way is skipped, so fewer lists may be
    yielded than were given.
    """
    for pairs in f_values:
        try:
            indices = [feature_index for feature_index, _ in pairs]
        except ValueError:
            # not made of 2-element entries: leave this list out
            continue
        yield indices
            
            
# Extract the feature indices one list at a time, so that a list skipped by
# extract_lists() also loses its name. Otherwise f_names would be longer than
# f_values and the dendrogram labels would no longer match the lists.
kept_lists = []
for f_name, f_value in zip(f_names, f_values):
    extracted = list(extract_lists([f_value]))  # [] when the list was skipped
    if extracted:
        kept_lists.append((f_name, extracted[0]))
    else:
        print("[warning] %s list was ignored because its entries are not (feature, value) pairs" % f_name)

f_names = tuple(kept_name for kept_name, _ in kept_lists)
f_values = [kept_indices for _, kept_indices in kept_lists]

In [None]:
from utils.Dendrogram import Dendrogram

# Metric names tried for the dendrograms; each one is passed as-is to Dendrogram.
metrics = [
    'rogerstanimoto',
    'jaccard',
    'dice',
    'russellrao',
    'yule'
]

# Draw one dendrogram per metric, each in its own figure.
for m in metrics:
    plt.figure()
    d = Dendrogram(lists=f_values, lists_labels=f_names, metric=m)
    d.show()

We can see that the lists of F Value and Fisher Score are the same (as the similarity matrix has shown).

__For Golub only:__

All the features in CFS are in MRMR (see the intersection in the similarity matrix). But CFS only contains 9 features in total, so the mask of features for CFS is almost entirely False values, which means that its distance to the other lists (including MRMR) is high.

### Subsets merging

In [None]:
# Maps each merging technique name to the features it selected.
# Filled in by the merging cells below, then read when the merged lists are assessed.
merged_features_lists = {}

#### Union of all features

In [None]:
from merge.techniques.UnionSubsetMerger import UnionSubsetMerger

# Merge all the features lists with UnionSubsetMerger.
susm = UnionSubsetMerger(subsets["features"].values())
merged_features = susm.merge()

# Register the result under the name shown on the final bar chart.
merged_features_lists["Union of all features"] = merged_features

#### Keep top N features

In [None]:
from merge.techniques.TopNMerger import TopNMerger
  
# Merge all the features lists with TopNMerger.
# NOTE(review): n=100 is presumably the number of top features kept -- confirm in TopNMerger.
merged_features = TopNMerger(subsets["features"].values(), n=100).merge()
merged_features_lists["Keep Top N features"] = merged_features

#### Two by two intersections
Take the intersection of the first two lists, then intersect the result with the next list, and so on for each remaining list.

In [None]:
from merge.techniques.TwoByTwoIntersectionsMerger import TwoByTwoIntersectionsMerger
    
# Unlike the two mergers above, this one receives the whole name -> list dict, not only its values.
merged_features = TwoByTwoIntersectionsMerger(subsets["features"]).merge()
# The "\n" splits the label over two lines on the final bar chart.
merged_features_lists["Two by Two\n intersections"] = merged_features

The same merging technique, but using only the lists with a score. All the lists given by algorithms that do not provide scores are ignored.

In [None]:
from merge.techniques.TwoByTwoIntersectionsMerger import TwoByTwoIntersectionsMerger
    
# Same merger as in the previous cell, applied to the "features_by_score" lists only.
merged_features = TwoByTwoIntersectionsMerger(subsets["features_by_score"]).merge()
merged_features_lists["Two by Two\n intersections (score)"] = merged_features

In [None]:
from merge.techniques.UnionOfIntersectionsMerger import UnionOfIntersectionsMerger

# Merge the features lists with UnionOfIntersectionsMerger (receives the whole name -> list dict).
merged_features = UnionOfIntersectionsMerger(subsets["features"]).merge()
merged_features_lists["Union of intersections"] = merged_features

# Report how many features this technique kept.
print("#features kept: %d " % len(merged_features))

In [None]:
from merge.techniques.WeightedListsMerger import *

# Merge the ranked lists ("features_by_rank") with WeightedListsMerger.
# NOTE(review): max_features_to_keep=300 presumably caps the size of the merged list -- confirm in WeightedListsMerger.
wlm = WeightedListsMerger(subsets["features_by_rank"], max_features_to_keep=300)
merged_features = wlm.merge()
merged_features_lists["Weighted lists"] = merged_features

# Print the W value reported by the merger for each list.
for name, w in wlm.get_W_per_list():
    print("%s: %.3f" % (name, w))

print("Kept %d features" % len(merged_features))
wlm.show_dendrogram()

## Evaluation of the merged subset
Once we have a merged list containing the best features, we would like to evaluate it with several classifiers

_TODO_: use a separate test set ? -> split again train/test set -> no changes in the Dataset class

### Dataset loading

Load the same dataset object that was used to generate the lists of features.
We do this so that we can reuse the same train/test split. Otherwise, we would have to split the dataset again, which might put samples that were already seen when generating the lists of features into the test set; that can be considered cheating.

In [None]:
import pickle

# Load the pickled dataset stored as "<GROUP_NAME>.pkl"; the `with` block
# closes the file even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading -- only open dataset
# files that you generated yourself.
with open("%s.pkl" % GROUP_NAME, "rb") as dataset_file:
    ds = pickle.load(dataset_file)

In [None]:
# Full dataset
X = ds.get_X()
y = ds.get_y()

# Train/test split stored in the dataset object
X_train = ds.get_X_train()
y_train = ds.get_y_train()
X_test = ds.get_X_test()
y_test = ds.get_y_test()

# One integer per distinct label found in y: 0 .. (number of classes - 1)
class_names = range(len(set(ds.get_y())))

# Number of features = length of the first training sample
N_FEATURES = len(X_train[0])
print("Number of genes: %d" % N_FEATURES)
print("Dataset samples: %d" % len(y))
print("Train set size %d" % len(X_train))
print("Test set size %d" % len(X_test))

In [None]:
from collections import Counter
# Count the samples of each class in the test set, most frequent class first.
c = Counter(y_test)
# The loop variables must not be named `c`: on Python 2 a list comprehension
# leaks its variables and would overwrite the Counter above with a class label.
print(["class %d has %d samples" % (label, count) for label, count in c.most_common()])

### Assess merged features

#### Merging techniques score

In [None]:
# The score function used is the F1-score, whose computation can lead to 0/0 divisions.
# The following lines hide the warnings about 0/0 divisions raised while computing the F-score.
# According to the source code, all 0/0 divisions are set to 0.
import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [None]:
# One (name, selected_features, score, std) tuple per assessed list.
# The cells below only append to this list, so re-run this cell before
# re-running them; otherwise the same entries are added twice.
assessed_lists = []

In [None]:
from merge.SubsetAssessor import SubsetAssessor

# Position of the score inside each assessed_lists tuple (used later for sorting).
score_index = 2

# Assess the features selected by each merging technique.
# .items() replaces the Python-2-only .iteritems(); it works on Python 2 and 3.
for m_technique_name, m_selected_features in merged_features_lists.items():
    m_selected_features = list(m_selected_features)

    if len(m_selected_features) == 0:
        # Nothing to assess: record a zero score so the technique still shows on the chart.
        print("[warning] %s technique was ignored because it contains 0 features" % m_technique_name)
        assessed_lists.append((m_technique_name, m_selected_features, 0, 0))
        continue

    sa = SubsetAssessor(m_selected_features, ds, k=5)

    score, std = sa.score, sa.std
    print("[%s] median score: %.2f" % (m_technique_name, score))

    assessed_lists.append((m_technique_name, m_selected_features, score, std))

#### Compare the merged techniques against k random features and against all the features

Compare against random lists

In [None]:
import random

score_std = []
random_lists = []  # the features drawn at each repetition, parallel to score_std
N = 8 # number of random lists to assess
k = 100 # length of the random lists
for _ in range(N):
    random_features = random.sample(range(N_FEATURES), k)
    sa = SubsetAssessor(random_features, ds, k=5)
    score_std.append((sa.score, sa.std))
    random_lists.append(random_features)


# Get the median of the scores. Warning: this is not the real median.
# The real one would take the mean between the n/2 and (n/2)+1 elements if n is even.
# Sorting the indices (instead of the pairs) tells us which repetition holds that score.
median_idx = sorted(range(len(score_std)), key=lambda i: score_std[i][0])[len(score_std)//2]
score, std = score_std[median_idx]
# Report the features that actually produced this score, not the last list drawn.
random_features = random_lists[median_idx]
print("Random features scores: %.2f" % score)

assessed_lists.append(("%d random features" % k, random_features, score, std))

Compare using all the features

In [None]:
# Baseline: assess the classifier on every feature of the dataset.
# Build a real list, like the other lists handed to SubsetAssessor: on
# Python 3, range() alone is a lazy range object, not a list.
all_features = list(range(N_FEATURES))
sa = SubsetAssessor(all_features, ds, k=5)
score, std = sa.score, sa.std

print("Using all features scores: %.2f" % score)

assessed_lists.append(("All features", all_features, score, std))

### Plot a bar chart with the median score for the merging methods

In [None]:
def show_barchart_merging_methods(labels, scores, stds):
    """Show a bar chart with one bar per merging method.

    Parameters
    ----------
    labels : sequence of str
        Tick label written under each bar.
    scores : sequence of float
        Height of each bar; also written as text next to the bar top.
    stds : sequence of float
        Error-bar sizes, passed as `yerr` to the bar plot.
    """
    bar_positions = np.arange(len(labels))

    fig = plt.figure(figsize=(16,4))
    ax = fig.add_subplot(111)

    bar_style = dict(align='center', alpha=0.8, width=0.3,
                     color="turquoise", edgecolor="turquoise", ecolor="black")
    ax.bar(bar_positions, scores, yerr=stds, **bar_style)

    plt.xticks(bar_positions, labels)

    # write each score next to the top of its bar
    for bar_idx, bar_score in enumerate(scores):
        ax.text(bar_idx, bar_score, " %.2f" % bar_score, ha='left', va='bottom')

    ax.set_ylabel('Score')
    ax.set_ylim(0.0, 1.1)
    ax.set_title('Median score between several merging methods')
    # horizontal grid lines only
    ax.yaxis.grid(True)
    plt.tight_layout()

    plt.show()

# rank the assessed lists from the best score to the worst
assessed_lists = sorted(assessed_lists, key=lambda entry: entry[score_index], reverse=True)
names, selected_features, scores, stds = zip(*assessed_lists)

# tick label of each bar: the list name followed by its number of features
labels = []
for name, feats in zip(names, selected_features):
    labels.append("%s\n(#%d)" % (name, len(feats)))

show_barchart_merging_methods(labels, scores, stds)