In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import eli5
from eli5.sklearn import PermutationImportance

In [2]:
# read a 5000-row sample of the taxi fare training data
raw_sample = pd.read_csv('PermutationImportance/taxi_fare_train.csv', nrows=5000)

# keep only trips whose pickup/dropoff coordinates lie strictly inside a small
# lat/lon box and whose fare is positive (drops outlier coords and neg fares)
trip_filters = [
    'pickup_latitude > 40.7', 'pickup_latitude < 40.8',
    'dropoff_latitude > 40.7', 'dropoff_latitude < 40.8',
    'pickup_longitude > -74', 'pickup_longitude < -73.9',
    'dropoff_longitude > -74', 'dropoff_longitude < -73.9',
    'fare_amount > 0',
]
data = raw_sample.query(' and '.join(trip_filters))

# prediction target: the fare of each trip
y = data['fare_amount']

# we will use Permutation Importance to find the importance of these features
base_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

X = data[base_features]

In [3]:
# split features and target into train / validation sets
# (no test_size is passed, so the library default proportion is used; seed fixed for reproducibility)
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=1,
)

In [4]:
# step 1 - fit a random forest regressor on the training split
forest_params = dict(n_estimators=25, random_state=1)
rf_model = RandomForestRegressor(**forest_params).fit(train_X, train_y)

In [5]:
# step 2 - compute permutation importance of the fitted model on the validation split
permuter = PermutationImportance(rf_model, random_state=1)
p = permuter.fit(val_X, val_y)

# render the importance table, labelling rows with the validation column names
eli5.show_weights(p, feature_names=list(val_X.columns))

Weight,Feature
0.9049  ± 0.1122,dropoff_latitude
0.7884  ± 0.0722,pickup_latitude
0.5931  ± 0.0454,dropoff_longitude
0.5849  ± 0.0828,pickup_longitude
0.0025  ± 0.0044,passenger_count


In [6]:
# observation
# the latitude features seem to be more important than the longitude features for predicting fare
# it may be that trips in this sample cover more distance in latitude than in longitude,
# so shuffling longitude does not change the predictions as much


In [7]:
# add two new features: absolute latitude / longitude distance of each trip.
# `assign` builds a new frame instead of writing columns in place into `data`,
# which is the result of an earlier `query()` filter; in-place column writes on
# such a frame can trigger pandas' SettingWithCopyWarning. It also keeps this
# cell safe to re-run.
data = data.assign(
    abs_lat_distance=(data.dropoff_latitude - data.pickup_latitude).abs(),
    abs_lon_distance=(data.dropoff_longitude - data.pickup_longitude).abs(),
)

# find importance with these added features
# (note: passenger_count is not part of this feature set)
features_new = ['pickup_longitude',
                'pickup_latitude',
                'dropoff_longitude',
                'dropoff_latitude',
                'abs_lat_distance',
                'abs_lon_distance']

X = data[features_new]

# same split seed and model settings as the base-feature run, so the two
# importance tables differ only in the feature set
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)
rf_model = RandomForestRegressor(n_estimators=25, random_state=1).fit(new_train_X, new_train_y)

# permutation importance on the new validation split; the last expression renders the table
p = PermutationImportance(rf_model, random_state=1).fit(new_val_X, new_val_y)
eli5.show_weights(p, feature_names=new_val_X.columns.tolist())


Weight,Feature
0.8158  ± 0.0949,abs_lat_distance
0.4738  ± 0.0949,abs_lon_distance
0.0445  ± 0.0307,pickup_latitude
0.0267  ± 0.0347,dropoff_latitude
-0.0141  ± 0.0119,dropoff_longitude
-0.0143  ± 0.0111,pickup_longitude


In [8]:
# observation
# the engineered distance features are far more important than the raw lat & lon coordinates

In [9]:
# Although abs_lat_distance & abs_lon_distance have a much smaller scale than the other features,
# feature scale does not affect permutation importance directly; it matters only indirectly, if scaling affects the trained model.
# Random forests are not affected by feature scale, so scaling is not needed for PI in this case.