# Feature selection - Feature Importance with Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_wine

# Load the wine dataset and pull out the feature matrix and class labels
dataObj = load_wine()
X, y = dataObj.data, dataObj.target

# Hold out 30% of the samples as a test set (fixed seed for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Standardization: fit the scaler on the training split only, then apply the
# same fitted scaler to both splits
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Configure a 500-tree random forest with a fixed seed; n_jobs=-1 requests
# parallel training
forest = RandomForestClassifier(
    n_estimators=500,
    random_state=0,
    n_jobs=-1,
)

# Train on the (unscaled) training split
forest.fit(X_train, y_train)

In [None]:
# Feature names as an array so they can be fancy-indexed
cols = np.array(dataObj.feature_names)

# Importance score of every feature, taken from the fitted forest
importances = forest.feature_importances_

# Index order that ranks the features from most to least important
idxs = np.flip(np.argsort(importances))

# Apply that one ordering to both the scores and the names
importances, cols2 = importances[idxs], cols[idxs]

# Print the ranking as "rank) name  score"
for count, (col, importance) in enumerate(zip(cols2, importances)):
    print(f"{count+1:2d}) {col:30s} \t{importance:5.3f}")

In [None]:
# Plot every feature's importance as a horizontal bar chart.
#
# Labels and values are reordered with the SAME index array so that each bar
# keeps its own feature name. (Reversing the unsorted `cols` next to the
# sorted `importances` would attach the wrong name to each bar.)
# The ordering is recomputed from the fitted forest, so this cell produces the
# same chart however many times it is re-run.
forest_importances = forest.feature_importances_

# Ascending order: barh draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart.
plot_order = np.argsort(forest_importances)
cols2 = cols[plot_order]
importances2 = forest_importances[plot_order]

#Plotting
fig, ax = plt.subplots(figsize=(5,8))
ax.barh(cols2, importances2, color='lightblue')
ax.set_title('Importances by features')
plt.show()

### Using the `SelectFromModel` feature-selection object in scikit-learn

In [None]:
from sklearn.feature_selection import SelectFromModel

# Build the selector around the forest and fit it on the training split;
# features whose importance reaches the 0.1 threshold are kept
sfm = SelectFromModel(estimator=forest, threshold=0.1).fit(X_train, y_train)

# Boolean mask over the original columns: True where the feature was chosen
cols_bool = sfm.get_support()

# Names and importance scores of the chosen features only; the scores come
# from the estimator fitted inside the selector
cols_reduced = cols[cols_bool]
importances = sfm.estimator_.feature_importances_[cols_bool]

In [None]:
# Rank the selected features from most to least important
idxs = np.flip(np.argsort(importances))
importances, cols_reduced = importances[idxs], cols_reduced[idxs]

# Print the ranking as "rank) name  score"
for count, (col, importance) in enumerate(zip(cols_reduced, importances)):
    print(f"{count+1:2d}) {col:30s} \t{importance:5.3f}")

In [None]:
# Reduce the training matrix to the columns kept by the selector
X_selected = sfm.transform(X_train)

# Report (n_samples, n_selected_features)
print(np.shape(X_selected))