# Example: using ilastik-feature-selection with the vigra random forest

This notebook demonstrates how to use `ilastik-feature-selection`.
It is more or less the same as the one using the sklearn random forest, but is kept separate so as not to confuse users with the peculiarities of vigra.

In [None]:
import ilastik_feature_selection
import seaborn as sns
import sklearn.ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from vigra.learning import RandomForest as VigraRF
from sklearn.metrics import accuracy_score

In [None]:
# Load seaborn's "penguins" example dataset and store the species label
# as a pandas categorical column.
penguins = sns.load_dataset("penguins").assign(
    species=lambda frame: frame["species"].astype("category")
)

# Discard the "island" and "sex" columns, then remove every remaining row
# that contains a NaN value.
penguins_filtered = penguins.drop(columns=["island", "sex"]).dropna()

# Feature table: all remaining columns except the label.
penguins_features = penguins_filtered.drop(columns=["species"])

# Targets: integer codes of the species labels (pd.factorize returns a
# (codes, uniques) pair; only the codes are kept).
targets = pd.factorize(penguins_filtered["species"])[0]

In [None]:
# vigra needs specific data types: convert the features to a float32 array
# and the labels to uint32. The label vector additionally gets a trailing
# axis, turning it into a column of shape (n_samples, 1).
penguins_features_np = penguins_features.to_numpy().astype(np.float32)
targets_np = targets.astype(np.uint32)[:, np.newaxis]

In [None]:
# Hold out 20% of the samples for testing. The split is shuffled with a
# fixed seed and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    penguins_features_np,
    targets_np,
    test_size=0.2,
    random_state=0,
    shuffle=True,
    stratify=targets_np,
)

In [None]:
# add an extra class that exposes the same interface on a vigra RF as a sklearn RF
class VigraSklearnRF(VigraRF):
    """
    Adaptor around the vigra random forest that offers ``fit``, ``score`` and
    ``predict`` methods in the style of
    sklearn.ensemble.RandomForestClassifier, the interface expected in
    wrapper_selection.

    With this class the vigra RF is more or less a drop-in replacement.
    """

    def __init__(self, *args, n_estimators=100, random_state=None, **kwargs):
        """
        Args:
            *args: forwarded unchanged to the vigra RandomForest constructor.
            n_estimators: number of trees; handed to vigra as ``treeCount``.
            random_state: seed used later by ``fit``; ``None`` falls back to 0.
            **kwargs: forwarded unchanged to the vigra RandomForest constructor.
        """
        super().__init__(*args, treeCount=n_estimators, **kwargs)
        self._random_state = random_state

    def fit(self, X, y):
        """
        Train the forest on features ``X`` and labels ``y``; returns ``self``
        so that calls can be chained.
        """
        # Any falsy random_state (None or 0) results in seed 0.
        # NOTE(review): confirm how vigra's learnRF interprets a seed of 0.
        seed = self._random_state or 0
        self.learnRF(X, y, seed)
        return self

    def score(self, X, y, sample_weight=None) -> float:
        """
        Predict labels for ``X`` and return the mean accuracy with respect to
        ``y``, optionally weighting samples by ``sample_weight``.
        """
        predicted = self.predictLabels(X)
        return accuracy_score(y, predicted, sample_weight=sample_weight)

    def predict(self, X):
        """Return the labels vigra predicts for ``X``."""
        return self.predictLabels(X)

In [None]:
# Baseline: fit a random forest on the training split using the full set of
# four features, then measure its accuracy on the test split.
rf_full = VigraSklearnRF(n_estimators=100, random_state=42)
rf_full.fit(X_train, y_train)
score_rf_full = rf_full.score(X_test, y_test)
print(
    "Random forest trained with all four features achieves an accuracy of "
    f"{score_rf_full:.3f} on the test set."
)

## Run the feature selection to reduce the feature set
### Wrapper method feature selection

In [None]:
# Prepare and run the wrapper feature selection.
# Higher penalty values result in fewer selected features, lower values in more.
size_penalty = 0.2
classifier = VigraSklearnRF(n_estimators=100, random_state=42)

# Short alias for the submodule that provides the wrapper selection classes.
wrapper_selection = ilastik_feature_selection.wrapper_feature_selection

# The evaluation function scores candidate feature sets with the classifier,
# using the complexity penalty configured above.
evaluation_function = wrapper_selection.EvaluationFunction(
    classifier,
    complexity_penalty=size_penalty,
)
feat_selector = wrapper_selection.WrapperFeatureSelection(
    X_train,
    y_train,
    evaluation_function.evaluate_feature_set_size_penalty,
    method="BFS",
)
result = feat_selector.run(do_advanced_search=True)
print(f"selected feature indices {result[0]}")

In [None]:
# Train and evaluate a random forest on the reduced feature set.
# result[0] holds the selected feature indices; they are used to pick the
# matching columns from both the training and the test features.
selected_features = result[0]
rf_wrapper = VigraSklearnRF(random_state=42, n_estimators=100)
rf_wrapper.fit(X_train[:, selected_features], y_train)
score_rf_wrapper = rf_wrapper.score(X_test[:, selected_features], y_test)
# Message wording fixed ("trained with features") so it reads correctly and
# parallels the message printed for the full-feature forest.
print(
    "Random forest trained with features from wrapper selection method achieves an accuracy "
    f"of {score_rf_wrapper:.3f} on the test set."
)