# Feature Selection

### Introduction

### Loading our data

In [1]:
import pandas as pd
df_train = pd.read_feather('./bnb_train.feather')
df_X_train = df_train.drop(columns = ['price'])
y_train = df_train.price

df_val = pd.read_feather('./bnb_val.feather')
df_X_val = df_val.drop(columns = ['price'])
y_val = df_val.price

> Then we train our model.

In [2]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=40, max_features='log2')
rfr.fit(df_X_train, y_train)
rfr.score(df_X_val, y_val)

0.7597783476979612

### Feature Selection

In [3]:
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(rfr).fit(df_X_val, y_val)

In [14]:
import eli5
eli5.explain_weights_df(perm, feature_names = df_X_val.columns.to_list())[:20]

Unnamed: 0,feature,weight,std
0,calculated_host_listings_count,0.083433,0.001301
1,host_listings_count,0.078769,0.000443
2,availability_60,0.077384,0.004574
3,summary_is_na,0.064149,0.00814
4,host_sinceYear,0.052009,0.006225
5,host_total_listings_count,0.042911,0.001421
6,host_sinceElapsed,0.040649,0.004633
7,property_type_other,0.038506,0.000565
8,index,0.037237,0.004978
9,host_sinceMonth_5.0,0.036698,0.005514


* Select from model 

In [19]:
df_X_val.iloc[:, first_selection.get_support()].shape

(4736, 41)

In [15]:
from sklearn.feature_selection import SelectFromModel
first_selection = SelectFromModel(perm, threshold=0.01, prefit=True)
# first_selection.fit()
X_val_first_select = first_selection.transform(df_X_val)
X_train_first_select = first_selection.transform(df_X_train)

In [16]:
X_val_first_select.shape

(4736, 41)

In [20]:
rfr_first_select = RandomForestRegressor(n_estimators=40, max_features='log2')
rfr_first_select.fit(X_train_first_select, y_train)
rfr_first_select.score(X_val_first_select, y_val)

0.8201422741753963

* get support 

In [11]:
first_selection.get_support()[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
       False])

In [23]:
first_select_cols = df_X_train.columns[first_selection.get_support()]
first_select_cols.shape

(41,)

In [None]:
first_select_cols.shape

### Round two

In [24]:
from eli5.sklearn import PermutationImportance
second_pmi = PermutationImportance(rfr_first_select).fit(X_val_first_select, y_val)

* Notice that the weights change 

In [25]:
eli5.explain_weights_df(second_pmi, top=5, feature_names = first_select_cols.to_list())

Unnamed: 0,feature,weight,std
0,host_total_listings_count,0.10234,0.004797
1,index,0.095265,0.001258
2,calculated_host_listings_count,0.093673,0.001728
3,id,0.086402,0.000944
4,availability_365,0.066779,0.00199


In [None]:
eli5.explain_weights_df(second_pmi, feature_names = first_select_cols.to_list()).tail(5)

So let's select from our model again.

In [26]:
second_select = SelectFromModel(second_pmi, threshold=0.01, prefit=True)
X_train_second_select = second_select.transform(X_train_first_select)
X_val_second_select = second_select.transform(X_val_first_select)

rfr_second_select = RandomForestRegressor(n_estimators=40, max_features='log2')
rfr_second_select.fit(X_train_second_select, y_train)
rfr_second_select.score(X_val_second_select, y_val)

0.8302590772399254

In [27]:
X_val_second_select.shape

(4736, 24)

And again.

In [28]:
third_pmi = PermutationImportance(rfr_second_select).fit(X_val_second_select, y_val)
third_select = SelectFromModel(third_pmi, threshold=0.01, prefit=True)

X_train_third_select = third_select.transform(X_train_second_select)
X_val_third_select = third_select.transform(X_val_second_select)

rfr = RandomForestRegressor(n_estimators=40, max_features='log2')
rfr.fit(X_train_third_select, y_train)
rfr.score(X_val_third_select, y_val)

0.8050601208426202

In [29]:
second_select_cols = first_select_cols[second_select.get_support()]
third_select_cols = second_select_cols[third_select.get_support()]
third_select_cols.shape

(20,)

In [31]:
eli5.explain_weights_df(third_pmi, feature_names = second_select_cols.to_list())

Unnamed: 0,feature,weight,std
0,index,0.135917,0.004076
1,host_id,0.126213,0.004824
2,availability_365,0.113033,0.006911
3,calculated_host_listings_count,0.111507,0.006655
4,host_sinceElapsed,0.106859,0.005292
5,id,0.093245,0.000795
6,summary_is_na,0.089774,0.009151
7,"host_verifications_['email', 'phone']",0.083766,0.008828
8,license_is_na,0.078529,0.003952
9,host_listings_count,0.06801,0.003992


So from here, we can begin to identify the most important features.

In [None]:
selected_X_train = pd.DataFrame(X_train_third_select, columns=third_select_cols)
selected_X_train.loc[:, 'price'] = y_train

selected_X_val = pd.DataFrame(X_val_third_select, columns=third_select_cols)
selected_X_val.loc[:, 'price'] = y_val

In [125]:
selected_X_train.to_feather('./selected_train.feather')

In [126]:
selected_X_val.to_feather('./selected_val.feather')