# Feature selection

 - In this notebook forward and backward selection methods are implemented.
 - The methods are run on models with already optimized hyperparameters.
 - The union of feature sets returned by each of these methods is considered to be the optimal selection.

In [None]:
# dependencies

import pandas as pd
import numpy as np

__Restore the split data__

In [None]:
%store -r X_train
%store -r y_train

In [None]:
%store -r X_val
%store -r y_val

__Metrics function__

In [None]:
# r2 and rmse and AARD returning function
from sklearn.metrics import r2_score, mean_squared_error

def result_stats(actual, predicted):
    """
    Returns r_2, rmse and AARD value for two arrays of equal length.

    actual    -- observed target values (list, numpy array or pandas object)
    predicted -- model predictions, one per observed value

    Returns the tuple (r2, rmse, aard), where aard is the average absolute
    relative deviation in percent: 100 / n * sum(|actual - predicted| / actual).
    AARD divides by the actual values, so a zero in actual gives inf/nan.
    """

    r2 = r2_score(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))

    # Compare the values strictly by position: plain lists do not support
    # element-wise arithmetic, and two pandas Series would be aligned on their
    # index labels instead of their order. Reshaping both inputs to
    # (n_samples, n_outputs) also keeps a column vector and a flat vector from
    # being broadcast against each other.
    n_samples = len(actual)
    actual_values = np.asarray(actual, dtype=float).reshape(n_samples, -1)
    predicted_values = np.asarray(predicted, dtype=float).reshape(n_samples, -1)
    relative_deviation = np.abs((actual_values - predicted_values) / actual_values)
    aard = (100 / n_samples) * np.sum(relative_deviation)

    return r2, rmse, aard

__Restore optimal hyperparameters for SVM and random forest models__

In [None]:
%store -r svm_params

In [None]:
%store -r rf_params

__Import the models__

In [None]:
from sklearn.svm import SVR as SVM
# Support vector regressor built from the restored svm_params.
# Passing gamma here as well means svm_params must not contain a "gamma" key
# (a duplicated keyword argument raises TypeError).
# NOTE(review): which gamma value is the library default depends on the scikit-learn version - confirm.
svm = SVM(gamma = "auto", **svm_params)    # gamma value is left at default; explicitly set to suppress warnings

In [None]:
from sklearn.ensemble import RandomForestRegressor as RF
# Random forest regressor built from the restored rf_params with the forest size fixed here.
# rf_params must not contain an "n_estimators" key (a duplicated keyword argument raises TypeError).
rf = RF(n_estimators = 20, **rf_params)    # use 20 estimators

## Feature selection functions definition

In [None]:
# use cross validation to evaluate model performance (the selection functions below use 3 folds, cv=3)
from sklearn.model_selection import cross_val_predict

In [None]:
def forward_feature_selection(left, right, best, cv=3):
    """
    This function performs greedy forward feature selection.
    It expects global variables for data: X_train and y_train.
    The evaluated model must be assigned to global variable model.

    Input parameters:
        left  -- features selected so far (empty list to start); not modified
        right -- candidate features (list of all features to start);
                 every selected feature is removed from this list IN PLACE,
                 so the caller can read the selection off what is missing
        best  -- RMSE that a candidate set has to beat (10000, i.e. a large
                 enough number, to start)
        cv    -- number of cross validation folds used to score a feature set

    In every round each remaining candidate is added to the current selection
    in turn and scored with the cross-validated RMSE; the candidate with the
    lowest RMSE is kept if it beats the best RMSE so far. Ties keep the
    candidate that comes first in right.

    Output: Prints RMSE for all accepted feature sets, until RMSE stops
    improving or no candidates are left, then prints the final set once more.
    Returns the tuple (selected features, best RMSE).
    """
    selected = list(left)

    while right:
        # currently most helpful feature (None: no candidate improved the result)
        best_f = None

        # look for next best feature
        for f in right:
            fs = selected + [f]

            prediction = cross_val_predict(model, X_train[fs], y_train, cv=cv)
            # observed values go first, predictions second
            _, rmse, _ = result_stats(y_train, prediction)

            # check if adding feature f improves the current best result
            if rmse < best:
                best = rmse
                best_f = f

        # no additional features can be added
        if best_f is None:
            break

        # feature addition helped, add feature and search again on current feature set
        selected = selected + [best_f]
        right.remove(best_f)
        print("RMSE: {0:.2f}; Features: {1}".format(best, selected))

    # final result, printed in the same format whether the search stopped
    # improving or simply ran out of candidates
    print("RMSE: {0:.2f}; Features: {1}".format(best, selected))
    return selected, best

In [None]:
def backward_feature_selection(features, best, cv=3):
    """
    This function performs greedy backward feature selection.
    It expects global variables for data: X_train and y_train.
    The evaluated model must be assigned to global variable model.

    Input parameters:
        features -- list of features to start from (list of all features);
                    eliminated features are removed from this list IN PLACE,
                    so after the call it holds the selected features
        best     -- RMSE that a reduced set has to beat (10000, i.e. a large
                    enough number, to start)
        cv       -- number of cross validation folds used to score a feature set

    In every round each feature is left out in turn and the reduced set is
    scored with the cross-validated RMSE; the feature whose removal gives the
    lowest RMSE is dropped if that RMSE beats the best so far. The full
    starting set itself is never scored, so with a very large initial best the
    first removal is always accepted. Pass the RMSE of the full set as best if
    the first removal should have to improve on it.

    Output: Prints RMSE for all accepted feature sets, until RMSE stops
    improving, then prints the final set once more.
    Returns the tuple (features, best RMSE); nothing is printed for an empty
    starting list.
    """
    # a model cannot be fitted on zero columns, so never try to drop the last feature
    while len(features) > 1:
        # currently least helpful feature (None: no removal improved the result)
        worst_f = None

        # look for next worst feature
        for f in features:
            fs = features.copy()
            fs.remove(f)

            prediction = cross_val_predict(model, X_train[fs], y_train, cv=cv)
            # observed values go first, predictions second
            _, rmse, _ = result_stats(y_train, prediction)

            # check if removing feature f improves the current best result
            if rmse < best:
                best = rmse
                worst_f = f

        # no additional features can be removed
        if worst_f is None:
            break

        # feature removal helped, remove feature and search again on current feature set
        features.remove(worst_f)
        print("RMSE: {0:.2f}; Features: {1}".format(best, features))

    if features:
        print("RMSE: {0:.2f}; Features: {1}".format(best, features))
    return features, best

## Feature selection on SVM model

In [None]:
# set model variable
# (the feature selection functions read the estimator from the global name model,
#  so it has to be assigned before they are called)
model = svm
# last expression of the cell: shows the configured estimator as cell output
svm

### Forward feature selection

In [None]:
# candidate pool: a fresh list of all column names; the selection call below
# removes every feature it picks from this list in place
all_features = list(X_train.columns)
forward_feature_selection([], all_features, 10000)
# save the result: whatever is no longer in the pool has been selected
forward = set(X_train.columns).difference(all_features)

### Backward feature selection

In [None]:
# start from a fresh list of all column names; the call below prunes it in place
all_features = list(X_train.columns)
backward_feature_selection(all_features, 10000)
# save the result: the features that survived the pruning
backward = {*all_features}

### SVM feature set

In [None]:
# save the optimized feature set
# final feature set is union of feature sets obtained through backward and forward selection
svm_features = list(forward & backward)
print("The optimized feature set for SVM: {0}".format(svm_features))
%store svm_features

## Feature selection on random forest model

In [None]:
# set model variable
model = rf
# must reset the max_features hyperparameter from the optimized value back to all features;
# None makes the regressor consider every feature at each split, which is what "auto" meant
# for a random forest regressor, and unlike "auto" it is accepted by all scikit-learn
# releases ("auto" was deprecated in 1.1 and removed in 1.3)
rf.set_params(max_features = None)

### Forward feature selection

In [None]:
# candidate pool: a fresh list of all column names; the selection call below
# removes every feature it picks from this list in place
all_features = list(X_train.columns)
forward_feature_selection([], all_features, 10000)
# save the result: whatever is no longer in the pool has been selected
forward = set(X_train.columns).difference(all_features)

### Backward feature selection

In [None]:
# start from a fresh list of all column names; the call below prunes it in place
all_features = list(X_train.columns)
backward_feature_selection(all_features, 10000)
# save the result: the features that survived the pruning
backward = {*all_features}

__Note:__ Optimizing the max_features hyperparameter gives better results than feature selection; as such, feature selection is not used for the random forest model.