In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt

from pdpbox import pdp

In [2]:
# load the first 5000 rows of the taxi fare training data
data = pd.read_csv('PartialDependencyPlot/taxi_fare_train.csv', nrows=5000)

# keep only rides whose coordinates lie strictly inside the bounding box below
# and whose fare is positive (drops outlier coords and non-positive fares);
# the conditions are AND-ed together into a single query expression
row_filters = [
    'pickup_latitude > 40.7', 'pickup_latitude < 40.8',
    'dropoff_latitude > 40.7', 'dropoff_latitude < 40.8',
    'pickup_longitude > -74', 'pickup_longitude < -73.9',
    'dropoff_longitude > -74', 'dropoff_longitude < -73.9',
    'fare_amount > 0',
]
data = data.query(' and '.join(row_filters))

# target: the fare of each remaining ride
y = data['fare_amount']

# location features whose effect on the fare is examined with partial dependence plots
base_features = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

X = data[base_features]


In [3]:
# split features and target into train / validation parts with a fixed seed
split_parts = train_test_split(X, y, random_state=1)
train_X, val_X, train_y, val_y = split_parts

# preview the first rows of the filtered data
data.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1


In [4]:
# summary statistics (count, mean, std, min, quartiles, max) of the filtered data
data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,3085.0,3085.0,3085.0,3085.0,3085.0,3085.0
mean,8.452768,-73.977032,40.756615,-73.975552,40.757332,1.610373
std,4.411415,0.01496,0.018406,0.015636,0.018595,1.247523
min,0.01,-73.999998,40.700013,-73.999995,40.701282,0.0
25%,5.5,-73.98859,40.744035,-73.987361,40.745731,1.0
50%,7.5,-73.980242,40.75779,-73.978403,40.758677,1.0
75%,10.1,-73.967927,40.76911,-73.966457,40.77006,2.0
max,52.0,-73.901047,40.799952,-73.90179,40.799937,6.0


In [5]:
# step 1 - build a seeded 25-tree random forest, then fit it on the training split
rf_model = RandomForestRegressor(n_estimators=25, random_state=1)
rf_model = rf_model.fit(train_X, train_y)

In [6]:
# step 2 - partial dependence of the trained model, computed on the validation split

# arguments shared by every isolated PDP below
isolate_kwargs = dict(model=rf_model, dataset=val_X, model_features=base_features)

# one isolated PDP (and one figure) per location feature
for feature_name in base_features:
    pdp_dist = pdp.pdp_isolate(feature=feature_name, **isolate_kwargs)
    pdp.pdp_plot(pdp_dist, feature_name)
    plt.show()


In [7]:
# observation
# the plots suggest that fares are lower when a location is near the middle of its range,
# presumably because a location in the middle tends to mean a shorter trip distance

In [8]:
# fare depends more on distance than on any single coordinate,
# so look at how the pickup and dropoff longitudes interact
lon_pair = ('pickup_longitude', 'dropoff_longitude')

inter = pdp.pdp_interact(
    model=rf_model,
    dataset=val_X,
    model_features=base_features,
    features=list(lon_pair),
)
pdp.pdp_interact_plot(
    pdp_interact_out=inter,
    feature_names=list(lon_pair),
    plot_type='contour',
)
plt.show()

In [9]:
# observation
# when the pickup and dropoff longitudes differ more (i.e. a greater distance), the fare is higher

In [10]:
# create the 2 distance features, retrain, and check the pdp for one location feature again
# .assign builds a new frame instead of writing columns into the filtered frame in place,
# so this cell can be re-run safely (and avoids a possible SettingWithCopyWarning)
data = data.assign(
    abs_lat_distance=(data.dropoff_latitude - data.pickup_latitude).abs(),
    abs_lon_distance=(data.dropoff_longitude - data.pickup_longitude).abs(),
)

# the original location features plus the two added distance features
features_new = ['pickup_longitude',
                'pickup_latitude',
                'dropoff_longitude',
                'dropoff_latitude',
                'abs_lat_distance',
                'abs_lon_distance']

X = data[features_new]

# same seed as the earlier split; rf_model is refit on the extended feature set
new_train_X, new_val_X, new_train_y, new_val_y = train_test_split(X, y, random_state=1)
rf_model = RandomForestRegressor(n_estimators=25, random_state=1).fit(new_train_X, new_train_y)

# name the feature explicitly instead of relying on a loop variable left over from
# an earlier cell, so the plot does not depend on which cells were executed before
lon_feature = 'pickup_longitude'
pdp_lon = pdp.pdp_isolate(model=rf_model, dataset=new_val_X, model_features=features_new, feature=lon_feature)
pdp.pdp_plot(pdp_lon, lon_feature)
plt.show()


In [11]:
# observation
# the effect of pickup longitude is smaller now that the more important distance features have been added

In [12]:
# a larger pdp effect does not necessarily mean greater permutation importance
# for example, a feature may take the same value for most samples but have one outlier sample;
# it can show a large pdp effect, yet it is not an important feature because most samples share the same value