# Random features scores reference

The purpose of this notebook is to gain insight into the scores obtained by randomly selected features. We want to compare these scores across different subset sizes.

In [None]:
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Dataset selection for this run.
#   DATASET    : choose between "Golub" and "MILE"
#   GROUP_NAME : name of the dataset group to work on
#
# Other Golub groups : "golub_19122016", "golub_16012017"
# To switch to MILE  : GROUP_NAME = "MILE_21012017" and DATASET = "MILE"
DATASET = "Golub"
GROUP_NAME = "golub_06022017"

In [None]:
import pickle

# Load the pickled dataset object stored in "<GROUP_NAME>.pkl".
# The file is opened in a context manager so the handle is closed as soon as
# unpickling finishes, including when pickle.load raises.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load .pkl files that come from a trusted source.
with open("%s.pkl" % GROUP_NAME, "rb") as dataset_file:
    ds = pickle.load(dataset_file)

In [None]:
# Pull the complete data set and its train/test partitions out of the dataset object.
X, y = ds.get_X(), ds.get_y()
X_train, y_train = ds.get_X_train(), ds.get_y_train()
X_test, y_test = ds.get_X_test(), ds.get_y_test()

# One integer per distinct value found in the targets.
class_names = range(len(set(ds.get_y())))

# The feature count is read from the length of the first training sample.
N_FEATURES = len(X_train[0])

# Short summary of the loaded data, one item per line.
print("\n".join([
    "Number of genes: %d" % N_FEATURES,
    "Dataset samples: %d" % len(y),
    "Train set size %d" % len(X_train),
    "Test set size %d" % len(X_test),
]))

In [None]:
# The score function in use is the F1-score, whose computation can lead to a 0/0 division.
# According to the original author's reading of the scikit-learn source code, every such
# 0/0 division is set to 0, so the result is still usable; the filter below only hides the
# warnings raised about those divisions.
import warnings
from sklearn.exceptions import UndefinedMetricWarning

# No message or module pattern is needed: every warning of this category is ignored.
warnings.simplefilter("ignore", UndefinedMetricWarning)

In [None]:
# Position of a score inside an assessment result.
# NOTE(review): not referenced by the visible cells -- presumably used by other
# notebooks sharing this setup; confirm before removing.
score_index = 2

# Accumulator for the assessed feature subsets; later cells append to it.
assessed_lists = []

In [None]:
from merge.SubsetAssessor import SubsetAssessor
from itertools import chain
import random


# Number of random subsets drawn and assessed for each subset size.
N = 5

# Subset sizes to assess: a few small fixed sizes, then sizes from 200 upwards in steps
# of 15% of the feature count.
# The step is clamped to at least 1 because range() rejects a zero step, and sizes larger
# than the feature count are dropped because random.sample() cannot draw them.
K = [k for k in chain([1, 5, 10, 50, 100],
                      range(200, N_FEATURES, max(1, int(0.15 * N_FEATURES))))
     if k <= N_FEATURES]

# NOTE: this cell appends to assessed_lists; re-running it without first re-running the
# cell that initialises assessed_lists duplicates the entries.
# NOTE(review): no random seed is set in the visible cells, so results vary between runs.
for k in K:
    # Reset the results for every subset size. Keeping a single list across sizes made the
    # "median" of size k include the scores of all previously assessed sizes.
    score_std = []
    drawn_features = []
    for _ in range(N):
        random_features = random.sample(range(N_FEATURES), k)
        # k=5 is passed to SubsetAssessor unchanged -- presumably the number of
        # cross-validation folds; TODO confirm against SubsetAssessor.
        sa = SubsetAssessor(random_features, ds, k=5)
        score_std.append((sa.score, sa.std))
        drawn_features.append(random_features)

    # Pick the draw holding the median score. Warning: this is the upper median; for an
    # even number of draws the real median would average the two middle elements.
    median_idx = sorted(range(len(score_std)), key=lambda i: score_std[i][0])[len(score_std) // 2]
    score, std = score_std[median_idx]
    # Report the features that actually produced the median score, not the last draw.
    random_features = drawn_features[median_idx]
    print("Random features scores: %.2f" % score)

    assessed_lists.append(("%d random f." % k, random_features, score, std))

In [None]:
def show_barchart_merging_methods(labels, scores, stds):
    """Display a bar chart with one bar per label and an error bar on each.

    Parameters
    ----------
    labels : sequence
        Tick labels, one per bar; its length fixes the number of bar positions.
    scores : sequence of numbers
        Bar heights. Each value is also written next to the top of its bar,
        formatted with two decimals.
    stds : sequence of numbers
        Error-bar sizes, passed to the ``yerr`` argument of ``bar``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    positions = np.arange(len(labels))

    fig, ax = plt.subplots(figsize=(16, 4))
    ax.bar(
        positions,
        scores,
        width=0.3,
        align='center',
        yerr=stds,
        alpha=0.8,
        color="turquoise",
        edgecolor="turquoise",
        ecolor="black",
    )

    ax.set_xticks(positions)
    ax.set_xticklabels(labels)

    # Write each score just above its bar.
    for position, score in enumerate(scores):
        ax.text(position, score, " %.2f" % score, ha='left', va='bottom')

    ax.set_ylabel('Score')
    # Upper limit is 1.1, leaving headroom above a score of 1.0 for the value labels.
    ax.set_ylim(0.0, 1.1)
    ax.set_title('Median score between several merging methods')
    ax.yaxis.grid(True)
    fig.tight_layout()

    plt.show()

# Order the assessed subsets from the shortest to the longest feature list.
assessed_lists = sorted(assessed_lists, key=lambda entry: len(entry[1]))

# Split the (name, features, score, std) entries into parallel sequences.
names, selected_features, scores, stds = zip(*assessed_lists)

# Tick labels: subset name on the first line, number of features on the second.
labels = ["%s\n(#%d)" % (entry[0], len(entry[1])) for entry in assessed_lists]
show_barchart_merging_methods(labels, scores, stds)