# BIO-SELECT - Marigliano

In [None]:
from sklearn import neighbors, datasets
import pandas as pd
import os
from matplotlib import pyplot as plt
import numpy as np
from sklearn import preprocessing

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

%matplotlib inline

# Print NumPy floats with 2 digits of precision. This only changes how arrays
# are displayed; the stored values keep their full precision.
np.set_printoptions(precision=2)

In [None]:
def min_max_norm(X):
    """Rescale ``X`` linearly so that its minimum maps to 0 and its maximum to 1.

    The minimum and maximum are taken over *all* elements of ``X`` (no axis
    is passed to ``np.min`` / ``np.max``), so a 2-D input is normalised
    globally, not per column.

    Parameters
    ----------
    X : array-like of numbers
        Values to normalise.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``X``. When every element of ``X`` is
        equal the result is all zeros instead of NaN.
    """
    X = np.asarray(X)
    min_x = np.min(X)
    value_range = np.max(X) - min_x
    if value_range == 0:
        # Constant input: (X - min) / 0 would be 0/0 = NaN for every element.
        return np.zeros(X.shape, dtype=float)
    return (X - min_x) / value_range

def plot_scores_per_features(scores_per_features, N=50, title=""):
    """Draw a bar chart of the first ``N`` (feature, score) pairs.

    Each bar's height is the score and the feature identifier is written
    vertically above the bar. The pairs are plotted in the order given, so
    the caller is responsible for sorting them beforehand.

    Parameters
    ----------
    scores_per_features : sequence of (feature, score) pairs
        Must support slicing (e.g. a list as returned by ``sorted``).
    N : int, optional
        Maximum number of pairs to plot (default 50).
    title : str, optional
        Label shown in square brackets in front of the chart title.
    """
    # Slicing copes with sequences shorter than N on its own, and guarantees
    # that at most N bars are drawn (the previous length test let N + 1 bars
    # through when the input had exactly N + 1 entries).
    top_pairs = scores_per_features[:N]
    if top_pairs:
        features, scores = zip(*top_pairs)
    else:
        # Nothing to plot: still show an empty chart instead of failing on
        # the tuple unpacking.
        features, scores = (), ()

    fig = plt.figure(figsize=(12, 6))
    ax = fig.add_subplot(111)

    xs = range(len(scores))
    ax.bar(xs, scores, align='center', width=0.8, alpha=0.3)

    ax.set_ylabel('Score')
    ax.set_xlabel('Features')

    # Only the left limit is set, leaving room before the first bar.
    ax.set_xlim(-1)

    # Write each feature identifier above its bar.
    for x, (feature, score) in enumerate(zip(features, scores)):
        ax.text(x, score, str(feature), ha='center', va='bottom', rotation=90)

    if title != "":
        title = "[" + title + "]"
    ax.set_title("%s Scores per features" % title)
    plt.show()

## Load Datasets

In [None]:
from sklearn.model_selection import train_test_split

from datasets.EGEOD22619.EGEOD22619Dataset import EGEOD22619Dataset
from datasets.MILE.MileDataset import MileDataset
from datasets.Golub99.GolubDataset import GolubDataset

from datasets.DatasetEncoder import DatasetEncoder
from datasets.DatasetSplitter import DatasetSplitter

# Pick the dataset to work on; the alternatives are kept commented out so they
# can be swapped in by hand.
#ds = MileDataset()
#ds = EGEOD22619Dataset()
ds = GolubDataset()

# encode Dataset string classes into numbers
ds_encoder = DatasetEncoder(ds)
ds = ds_encoder.encode()
# NOTE(review): test_size=0.4 presumably holds out 40% of the samples for
# testing - confirm against DatasetSplitter.
ds = DatasetSplitter(ds, test_size=0.4)

# Full (unsplit) data; used further down by the F-score and mutual-info cells.
X = ds.get_X()
y = ds.get_y()
print(len(y))

# Train / test partitions shared by all the following cells.
X_train = ds.get_X_train()
y_train = ds.get_y_train()
X_test = ds.get_X_test()
y_test = ds.get_y_test()

# Length of the first training row (presumably the number of features).
print(len(X_train[0]))

## Classification using dumb KNN (all features)
Used as a reference baseline for the feature-selection experiments below.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: 4-nearest-neighbours classifier trained on every feature, with no
# feature selection at all.
classifier = KNeighborsClassifier(n_neighbors=4, algorithm="auto")
classifier = classifier.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
score = classifier.score(X_test, y_test)
print("score :", score)

## Classification using ExtraTrees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# One-vs-rest wrapper around a 100-tree ExtraTrees ensemble. Both the wrapper
# and the inner ensemble are given n_jobs=-1.
# NOTE: this rebinds the `classifier` global previously used for the KNN
# baseline.
classifier = OneVsRestClassifier(
    ExtraTreesClassifier(n_jobs=-1, n_estimators=100), n_jobs=-1)
classifier = classifier.fit(X_train, y_train)

# Mean accuracy on the held-out test split.
print("Score %.3f" % classifier.score(X_test, y_test))

In [None]:
# Get the feature importances when using OneVs[Rest|One]Classifier: the
# wrapper itself exposes none, they live on each fitted sub-estimator.
# NOTE(review): only estimators_[0] is read here; iterate over estimators_[i]
# to look at the other sub-estimators, if there are any.
feat_importances = classifier.estimators_[0].feature_importances_
# (feature index, importance) pairs, most important first.
feat_importances_sorted = sorted(enumerate(feat_importances), key=lambda x: x[1], reverse=True)

# Line plot of the N largest importances, in decreasing order.
N = 1000
plt.plot([feat[1] for feat in feat_importances_sorted[:N]])
# Only the left x limit is set, leaving a margin before the first point.
plt.xlim(-100)
plt.title("Best %d features importances for ExtraTrees" % N)

In [None]:
# Bar chart of the leading ExtraTrees importances, labelled by feature index.
plot_scores_per_features(feat_importances_sorted, title="ExtraTrees")

## Pipeline example

SelectKBest + f-classif + LinearSVC

In [None]:
# Univariate filter (keep the k=1000 features with the highest ANOVA F-score)
# feeding a one-vs-rest linear SVM.
clf = Pipeline([('f_classif', SelectKBest(f_classif, k=1000)),
                ('svm', OneVsRestClassifier(LinearSVC()))])

clf.fit(X_train, y_train)

# NOTE(review): these are predictions on the *training* split and are not used
# by any code visible in this notebook.
predictions = clf.predict(X_train)

# Mean accuracy on the held-out test split.
score = clf.score(X_test, y_test)
print("Score %.3f " % score)

# F-score of every input feature, as computed by the selection step.
s = clf.named_steps["f_classif"].scores_
print(s[:10])

# Boolean mask over the input features: True where the feature was kept.
support = clf.named_steps['f_classif'].get_support()

# Indices of the kept features (plain truthiness test instead of `== True`).
used_features = [index for index, is_kept in enumerate(support) if is_kept]
print("Nb used features : %d " % len(used_features))
print("5 first used features indices: %s" % used_features[:5])

# Plot scores per features
scores_per_features = sorted(enumerate(s), key=lambda x: x[1], reverse=True)
plot_scores_per_features(scores_per_features, title="SelectKBest + f_classif + LinearSVC")

## ReliefF

In [None]:
from skfeature.function.similarity_based import reliefF

# One ReliefF score per feature, computed on the training split.
scores = reliefF.reliefF(X_train, np.array(y_train))

# (feature index, score) pairs, highest score first.
scores_per_features = sorted(enumerate(scores), key=lambda p:p[1], reverse=True)

# print the best 5 features with their score
N = 5
print("Best %d features and their ranking" % N)
# Iterating over a slice (rather than indexing 0..N-1) cannot raise an
# IndexError when fewer than N features are available.
for feat_and_score in scores_per_features[:N]:
    print("\tfeat: %d, ranking: %.2f" % feat_and_score)

# get best features indices
indices = reliefF.feature_ranking(scores)
print(indices)

# Plot scores per features
plot_scores_per_features(scores_per_features, title="ReliefF")

### Fisher Score

In [None]:
from skfeature.function.similarity_based import fisher_score

# One Fisher score per feature, computed on the training split.
# NOTE: this rebinds the `score` global that earlier cells used for an accuracy.
score = fisher_score.fisher_score(X_train, y_train)

# Distinct class labels present in the training split.
print("Classes: %s" % list(set(y_train)))

# print the best 5 features with their score
# (despite its name, n_best_features holds *all* (index, score) pairs, sorted
# by decreasing score; only the first 5 are printed)
n_best_features = sorted(enumerate(score), key=lambda p:p[1], reverse=True)
print(n_best_features[:5])

# get best features indices
indices = fisher_score.feature_ranking(score)
print(indices)

In [None]:
# Split the sorted (index, score) pairs into two parallel lists.
# `f` (the feature indices) is not used by the rest of this cell.
f = [f[0] for f in n_best_features]
s = [f[1] for f in n_best_features]
# Fisher scores in decreasing order, one point per feature.
plt.plot(s)

In [None]:
# Bar chart of the leading Fisher scores, labelled by feature index.
plot_scores_per_features(n_best_features, title="Fisher Score")

### Test with http://featureselection.asu.edu/tutorial.php

Takes the best N features from a random subset of M features, repeated P times,

with N in [1, M]. The description assumes M = 1000 and P = 10; the cell below sets M = 20 (`max_features`) and P = 3.

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score
import random

# Accuracy of every subset evaluated so far; get_best_features_subset appends
# to this list on each evaluation, across all calls.
acc = []

# Size of each randomly drawn candidate feature set.
max_features = 20

def get_best_features_subset(features_indices):
    """Find the prefix of ``features_indices`` that gives the best accuracy.

    For N = 1 .. len(features_indices), a fresh ``LinearSVC`` is trained on
    the first N listed columns of the global ``X_train`` and scored on the
    same columns of the global ``X_test``. Every accuracy is also appended
    to the global list ``acc``.

    NOTE(review): the subset is chosen using the test split, so the returned
    accuracy is an optimistic estimate.

    Parameters
    ----------
    features_indices : sequence of int
        Candidate column indices, in the order in which they are added.
        Assumes X_train / X_test accept ``[:, list_of_indices]`` indexing.

    Returns
    -------
    (best_features, max_acc)
        The shortest prefix reaching the highest accuracy, and that accuracy.
        For an empty ``features_indices`` this is an empty prefix and -1.0.
    """
    max_acc = -1.0
    # Defined up front so that an empty candidate list returns cleanly
    # instead of hitting an unbound local at the return statement.
    best_features = features_indices[:0]

    # The upper bound is inclusive: the full candidate list is evaluated too.
    for N in range(1, len(features_indices) + 1):
        subset = features_indices[:N]
        selected_features_train = X_train[:, subset]
        selected_features_test = X_test[:, subset]

        clf = svm.LinearSVC()

        clf.fit(selected_features_train, y_train)
        y_predict = clf.predict(selected_features_test)

        last_acc = accuracy_score(y_test, y_predict)
        acc.append(last_acc)

        # Strict comparison: on ties the smaller subset is kept.
        if last_acc > max_acc:
            max_acc = last_acc
            best_features = subset

    return best_features, max_acc


# Union of the best subsets found over all rounds, and the summed size of
# those subsets (to show how much they overlap).
best_of_best_features = set()
total_of_best_features = 0
# Number of random rounds.
P = 3
for _ in range(P):
    # Draw max_features distinct column indices; the range starts at 0 so that
    # the first feature can be sampled as well.
    random_features_indices = random.sample(range(len(X[0])), max_features)
    best_features, max_acc = get_best_features_subset(random_features_indices)

    print("max score %s with %s features" % (max_acc, len(best_features)))
    #print("Best features are %s" % best_features)
    print("")
    
    best_of_best_features.update(best_features)
    total_of_best_features += len(best_features)

print("%s uniques features over a total of %s" % (len(best_of_best_features), total_of_best_features))
print("best of best: %s" % best_of_best_features)

In [None]:
#from skfeature.function.wrapper import svm_backward
#
#print(X_train)
#print(y_train)
#score = svm_backward.svm_backward(X_train, np.array(y_train), n_selected_features=3)
#
## print the best 3 features with their score
#n_best_features = sorted(enumerate(score), key=lambda p:p[1], reverse=True)
#print(n_best_features[:3])
#
## get best features indice
#idx = svm_backward.feature_ranking(score)
#print(idx)

In [None]:
#from skfeature.function.statistical_based import CFS
#
#F = CFS.cfs(X_train, y_train)
#print(F)

## [debug] Test with f_classif to understand F and pvalues

In [None]:
from sklearn.feature_selection import f_classif
import math

# basic example where only the 1st feature is important
totoX = [[1,2], [-1,3], [-1,-2], [-1,23], [1,-2], [1,2]]
totoY = [1, -1, -1, -1, 1, 1]
F, pvalues = f_classif(totoX, totoY)

print(F)
print(pvalues)
# we see that

In [None]:
from sklearn.feature_selection import f_classif

# ANOVA F statistic and p-value of every feature. Computed on the full
# dataset (X, y), not only on the training split.
F, pvalues = f_classif(X, y)
# (feature index, F) pairs, highest F first.
F_sorted = sorted(enumerate(F), key=lambda x: x[1], reverse=True)

# Number of top features shown in the line plot.
N = 1000

print("Best features according to F score: ")
for x in F_sorted[:4]:
    print("%d : %0.3f" % (x[0], x[1]))

best_X_F = F_sorted[:N]

# Keep only the scores, still in decreasing order.
F_scores = [x[1] for x in best_X_F]
 
plt.plot(F_scores)
plt.ylabel('F score')

plt.title('Best %s features according to F score' % N)
plt.show()

In [None]:
# Bar chart of the leading F scores, labelled by feature index.
plot_scores_per_features(F_sorted, title="F Score")

## Mutual Information classifier

In [None]:
from sklearn.feature_selection import mutual_info_classif
import math

# Estimated mutual information between each feature and the class label,
# computed on the full dataset (X, y) with 10 neighbours.
# NOTE(review): no random_state is passed, so the estimates may differ slightly
# from one run to the next - confirm against the scikit-learn documentation.
X_mi = mutual_info_classif(X, y, n_neighbors=10)

In [None]:
# (feature index, mutual information) pairs, highest value first.
X_mi_sorted = sorted(enumerate(X_mi), key=lambda x: x[1], reverse=True)

# Number of top features shown in the line plot.
N = 1000

print("Best features according to MI score: ")
for x in X_mi_sorted[:4]:
    print("%d : %0.3f" % (x[0], x[1]))

best_X_mi = X_mi_sorted[:N]

# Keep only the scores, still in decreasing order.
mi_scores = [x[1] for x in best_X_mi]
 
plt.plot(mi_scores)
plt.ylabel('Estimated Mutual Info')

plt.title('Best %s features according to mutual info' % N)
 
plt.show()

In [None]:
# Bar chart of the leading mutual-information scores, labelled by feature index.
plot_scores_per_features(X_mi_sorted, title="Mutual Info")