# Feature Pruning and Selection

In [3]:
import pandas as pd

# Load the dataset stored in 'removed_zeros.feather' (path is relative to the
# notebook's working directory).
# NOTE(review): the file name suggests rows with zero values (presumably zero
# prices) were removed upstream — confirm against the notebook that wrote it.
df_removed_zeros = pd.read_feather('removed_zeros.feather')

## View the selected features

Now let's select features with recursive feature elimination (RFE) and see which ones were kept.

In [11]:
import numpy as np
# Feature matrix: every column except the target.
X_removed = df_removed_zeros.drop('price', axis=1)
# Target: the price column on the natural-log scale.
y_removed_logged = np.log(df_removed_zeros['price'])

In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# Base estimator handed to RFE; its coefficients drive the elimination.
model = LinearRegression()

# Recursive feature elimination down to 35 features against the logged target.
# NOTE(review): the selector is fit on the full X_removed / y_removed_logged,
# i.e. before the train/test split performed later in this notebook, so the
# held-out rows also influence which features are kept. Consider fitting RFE
# on the training split only to avoid an optimistic test score.
rfe_logged_model = RFE(model, n_features_to_select=35)
rfe_logged_model.fit(X_removed, y_removed_logged)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
  n_features_to_select=35, step=1, verbose=0)

In [16]:
# Names of the columns that RFE kept, and a sanity check on how many there are.
selected_cols = X_removed.columns[rfe_logged_model.support_]
len(selected_cols)
# 35

# Restrict the feature matrix to the retained columns (original order preserved).
pruned_X_removed = X_removed.iloc[:, rfe_logged_model.support_]

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the seed is fixed so the split is repeatable.
(X_train_pruned,
 X_test_pruned,
 y_train_pruned,
 y_test_pruned) = train_test_split(
    pruned_X_removed,
    y_removed_logged,
    test_size=0.33,
    random_state=20,
)

In [19]:
# Refit the plain linear model on the training split of the selected columns,
# then evaluate it on the held-out split. For scikit-learn regressors `score`
# returns the coefficient of determination (R^2); the target here is the
# logged price, so the score is on the log scale.
model.fit(X_train_pruned, y_train_pruned)
model.score(X_test_pruned, y_test_pruned)

0.4628732594292488

So we see that the model trained on only the 35 selected features gives similar results.

## Rank the Features

Now let's rank the features.

In [20]:
def feature_importances(df, estimator, df_cols = None):
    """Rank features by the absolute size of a fitted model's coefficients.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns supply the feature names when ``df_cols`` is
        not given.
    estimator : object
        Fitted estimator exposing ``coef_`` with one entry per feature.
    df_cols : pandas.Index or sequence of str, optional
        Explicit feature names, in the same order as ``estimator.coef_``.
        Defaults to ``df.columns``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_features, 3)``. Each row is
        ``[name, coefficient, abs(coefficient)]`` and rows are ordered from
        the largest absolute coefficient to the smallest.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients.
    """
    # An explicit None check is required: `df_cols or df.columns` evaluates the
    # truth value of df_cols, which raises for a pandas Index / numpy array.
    if df_cols is None:
        df_cols = df.columns
    # dtype=object accepts an Index, array or plain list alike and stops numpy
    # from coercing the numeric columns to strings when stacking.
    names = np.asarray(df_cols, dtype=object).reshape(-1, 1)
    coefs = np.asarray(estimator.coef_).reshape(-1, 1)
    if names.shape[0] != coefs.shape[0]:
        raise ValueError(
            "got %d feature names but %d coefficients"
            % (names.shape[0], coefs.shape[0])
        )
    abs_coefs = np.abs(coefs)
    paired = np.hstack((names, coefs, abs_coefs))
    # Sort on the numeric magnitudes (ascending), then flip to descending.
    descending = abs_coefs.ravel().argsort()[::-1]
    ordered_cols = paired[descending]
    return ordered_cols

In [23]:
# Rank the selected features by the absolute coefficient of the refit linear
# model and display the 20 largest.
feature_scores = feature_importances(X_train_pruned,model)
feature_scores[:20]

array([['room_type_Shared room', -0.9325864146096644, 0.9325864146096644],
       ['requires_license', 0.7512186371717212, 0.7512186371717212],
       ['longitude', -0.699624113760337, 0.699624113760337],
       ['street_is_berlin_berlin_germany', -0.6286935860639178,
        0.6286935860639178],
       ['state_is_berlin', 0.5753531284572152, 0.5753531284572152],
       ['room_type_Private room', -0.5248380248393115,
        0.5248380248393115],
       ['beds_is_na', 0.48648631738296705, 0.48648631738296705],
       ['cancellation_policy_super_strict_60', 0.47830219491266435,
        0.47830219491266435],
       ['neighbourhood_cleansed_Moabit West', -0.3989526703401287,
        0.3989526703401287],
       ['property_type_Loft', 0.37294290980223554, 0.37294290980223554],
       ['neighbourhood_Wilmersdorf', 0.36930563451518983,
        0.36930563451518983],
       ['property_type_other', 0.36020576555137046, 0.36020576555137046],
       ['host_neighbourhood_Wilmersdorf', -0.31711660861

In [22]:
# First column of the ranking: feature names ordered from largest to smallest
# absolute coefficient.
feature_scores[:, 0]

array(['room_type_Shared room', 'requires_license', 'longitude',
       'street_is_berlin_berlin_germany', 'state_is_berlin',
       'room_type_Private room', 'beds_is_na',
       'cancellation_policy_super_strict_60',
       'neighbourhood_cleansed_Moabit West', 'property_type_Loft',
       'neighbourhood_Wilmersdorf', 'property_type_other',
       'host_neighbourhood_Wilmersdorf', 'host_sinceIs_year_end',
       'bedrooms_is_na', 'bathrooms_is_na',
       'neighbourhood_cleansed_Moabit Ost', 'bedrooms',
       'neighbourhood_Wedding',
       'neighbourhood_group_cleansed_Reinickendorf',
       'neighbourhood_group_cleansed_other', 'zip_dists_1335',
       'license_is_na', 'zip_dists_other', 'last_reviewMonth_3.0',
       'neighbourhood_group_cleansed_Mitte', 'last_reviewMonth_2.0',
       'neighbourhood_cleansed_Parkviertel',
       'neighbourhood_group_cleansed_Neukölln', 'summary_is_na',
       'zipcode_is_na', 'last_reviewMonth_12.0', 'zip_dists_1082',
       'host_response_time_o

### Potential features to Combine

Now with these 35 columns, we can start to group them: 

* Location
    * longitude
    * street_is_berlin_berlin_germany
    * state_is_berlin
    * neighbourhood_Wilmersdorf
    * host_neighbourhood_Wilmersdorf
    * neighbourhood_cleansed_Moabit West
    * neighbourhood_cleansed_Moabit Ost
    * neighbourhood_Wedding
    * neighbourhood_group_cleansed_Mitte
    * neighbourhood_cleansed_Parkviertel
    * neighbourhood_group_cleansed_Neukölln
    * neighbourhood_group_cleansed_Reinickendorf
    * neighbourhood_group_cleansed_other
    * zip_dists_1335
    * zip_dists_1082
    * zip_dists_other
    * zipcode_is_na

### Recombine Features that appear Linear

### From here we can dedicate time to feature engineering

### Resources

* Feature selection in theory
    * We want features that are correlated (e.g. Pearson-correlated) with the output
        * Could compare both techniques

In [None]:
# https://towardsdatascience.com/why-how-and-when-to-apply-feature-selection-e9c69adfabf2