# Feature Selection: Most Important Features

In [1]:
import numpy as np
import pandas as pd


# Load the professor ratings and discard the `eval` column in one chained
# expression, then preview the first rows.
df = pd.read_csv("prof_ratings.csv").drop(columns=["eval"])
df.head()

Unnamed: 0,age,gender,minority,native_english,lower_division,attractive,high_eval
0,36,1,1,1,0,1,1
1,59,0,0,1,0,0,1
2,51,0,0,1,0,0,1
3,40,1,0,1,0,0,1
4,31,1,0,1,0,1,1


### Train/Test Split and Standardize

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Every column except the last is a feature; the last column is the target.
y = df.iloc[:, -1].values
X = df.iloc[:, :-1].values

# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

# A Random Forest will identify feature importances
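Feature importance measures how much each feature contributes to the Random Forest's predictions.

scikit-learn normalizes the importances so that they sum to 1 (unless no tree in the forest makes a split, in which case they are all 0).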

## Train a Random Forest model

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed: a random forest is stochastic (bootstrap samples and random
# feature subsets), so without random_state the importances reported below
# would change on every run of the notebook.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_std, y_train);

## Display the proportional importance of each feature

In [4]:
# One importance score per feature column, in the same column order as X.
# The six values shown below add up to 1.

rf.feature_importances_

array([0.67925085, 0.09420199, 0.04635367, 0.04006509, 0.08565384,
       0.05447457])

## Get the indices of the most important features sorted by importance

In [5]:
# argsort() returns the positions that would sort the importances in ascending
# order; reversing that result lists the positions from the most important
# feature to the least important one.

index_of_best_features_in_order = np.argsort(rf.feature_importances_)[::-1]
index_of_best_features_in_order

array([0, 1, 4, 5, 2, 3])

## Order the features by importance using the sorted indices above

In [6]:
# Column labels of df: the feature columns first, with the target (`high_eval`) last.
df.columns

Index(['age', 'gender', 'minority', 'native_english', 'lower_division',
       'attractive', 'high_eval'],
      dtype='object')

In [7]:
# Display the best features in order

# Take the feature names from the same columns that were used to build X
# (every column except the last one, which is the target). Position i in
# `features` is then the name of column i of X, so the importance indices
# line up without relying on the target happening to be the last column.
features = df.columns[:-1].to_numpy()

list_of_best_features_in_order = features[index_of_best_features_in_order]

print("The best features in order:")
print(list_of_best_features_in_order)

The best features in order:
['age' 'gender' 'lower_division' 'attractive' 'minority' 'native_english']


## Sort the importance values, largest to smallest

In [8]:
# Reuse the ranking stored in `index_of_best_features_in_order` instead of
# sorting the values a second time, so the values are guaranteed to be in the
# same order as the feature names selected with those indices.
importances_ordered = list(rf.feature_importances_[index_of_best_features_in_order])
importances_ordered

[0.6792508454825127,
 0.09420198625717288,
 0.08565383783036938,
 0.054474569505412365,
 0.04635367473611009,
 0.0400650861884226]

# View the best features as a DataFrame

In [9]:
# Lay the ordered importances out as a single row (shape 1 x n_features) and
# label that row "importance", with one column per feature name.
importance_row = np.asarray(importances_ordered).reshape(1, -1)
most_important_features_df = pd.DataFrame(
    importance_row,
    index=["importance"],
    columns=list_of_best_features_in_order,
)
most_important_features_df

Unnamed: 0,age,gender,lower_division,attractive,minority,native_english
importance,0.679251,0.094202,0.085654,0.054475,0.046354,0.040065


---