In [1]:
import pandas as pd
import pandas_profiling as profiling

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, r2_score

import warnings
warnings.filterwarnings("ignore")

# Load the raw training set from the local data directory.
df = pd.read_csv(filepath_or_buffer="./data/train.csv")
# Optional quick look at the summary statistics:
# df.describe()

# Build a profiling report for the raw frame.
profile = profiling.ProfileReport(df)
# Optional: make `profile` the last expression of the cell to display it.
# profile

In [2]:
# Keep only trips whose pickup and dropoff both fall inside a small bounding
# box (longitude -74..-73.9, latitude 40.7..40.8) and whose fare is positive.
# The trailing .copy() makes `data` an independent frame, so columns added to
# it in later cells do not trigger pandas' SettingWithCopyWarning.
data = df.query(
    'pickup_longitude > -74 and pickup_longitude < -73.9 and '
    'pickup_latitude > 40.7 and pickup_latitude < 40.8 and '
    'dropoff_longitude > -74 and dropoff_longitude < -73.9 and '
    'dropoff_latitude > 40.7 and dropoff_latitude < 40.8 and '
    'fare_amount > 0'
).copy()

# Baseline feature set: raw coordinates plus passenger count.
base_features = ['pickup_longitude',
                 'pickup_latitude',
                 'dropoff_longitude',
                 'dropoff_latitude',
                 'passenger_count']

X = data[base_features]
y = data.fare_amount

# Fixed random_state keeps the train/validation split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=1)

# Baseline model: random forest with 50 trees and a fixed seed.
first_model = RandomForestRegressor(n_estimators=50, random_state=1).fit(X_train, y_train)

y_pred = first_model.predict(X_val)

# R^2 on the validation split; left as the last expression so the cell displays it.
score = r2_score(y_val, y_pred)
score

0.41968601515439785

In [6]:
# Summary statistics of the baseline training features.
X_train.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,23466.0,23466.0,23466.0,23466.0,23466.0
mean,-73.976827,40.756931,-73.975359,40.757434,1.66232
std,0.014625,0.018206,0.01593,0.018659,1.290729
min,-73.999999,40.700013,-73.999999,40.70002,0.0
25%,-73.987964,40.744901,-73.987143,40.745756,1.0
50%,-73.979629,40.758076,-73.978588,40.758542,1.0
75%,-73.967797,40.769602,-73.966459,40.770406,2.0
max,-73.900062,40.799952,-73.900062,40.799999,6.0


In [None]:
# Summary statistics of the training target (fare_amount).
y_train.describe()

In [None]:
# Pairwise correlations between the columns of the filtered frame.
# NOTE(review): on pandas >= 2.0, corr() raises when the frame contains
# non-numeric columns unless numeric_only=True is passed — confirm whether
# train.csv carries any such columns before relying on this cell.
data.corr()

In [3]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the baseline model, computed on the validation split.
perm = PermutationImportance(first_model, random_state=1)
perm.fit(X_val, y_val)

# Display the importance table, labelling rows with the baseline feature names.
eli5.show_weights(perm, feature_names=base_features)

Weight,Feature
0.8387  ± 0.0168,dropoff_latitude
0.8326  ± 0.0212,pickup_latitude
0.5947  ± 0.0432,pickup_longitude
0.5326  ± 0.0275,dropoff_longitude
-0.0022  ± 0.0014,passenger_count


In [4]:
# Engineered features: absolute longitude / latitude displacement of each trip.
# assign() returns a new frame instead of writing columns into the filtered
# frame in place, so no chained-assignment warning is raised and the object
# produced by the earlier filtering cell is left unmodified.
data = data.assign(
    abs_lon_change=(data.dropoff_longitude - data.pickup_longitude).abs(),
    abs_lat_change=(data.dropoff_latitude - data.pickup_latitude).abs(),
)

# Baseline coordinates and passenger count, plus the two displacement features.
new_features = ['pickup_longitude',
                'pickup_latitude',
                'dropoff_longitude',
                'dropoff_latitude',
                'abs_lon_change',
                'abs_lat_change',
                'passenger_count']

new_X = data[new_features]
new_y = data.fare_amount

# Same random_state as the baseline split, so both models are evaluated on the same rows.
new_X_train, new_X_val, new_y_train, new_y_val = train_test_split(new_X, new_y, random_state=1)

# Same model configuration as the baseline (50 trees, fixed seed) for a like-for-like comparison.
second_model = RandomForestRegressor(n_estimators=50, random_state=1).fit(new_X_train, new_y_train)

new_y_pred = second_model.predict(new_X_val)

# R^2 on the validation split; left as the last expression so the cell displays it.
score_2 = r2_score(new_y_val, new_y_pred)
score_2

0.4489856910907549

In [7]:
# Summary statistics of the training features, including the two engineered displacement columns.
new_X_train.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,abs_lon_change,abs_lat_change,passenger_count
count,23466.0,23466.0,23466.0,23466.0,23466.0,23466.0,23466.0
mean,-73.976827,40.756931,-73.975359,40.757434,0.013042,0.014827,1.66232
std,0.014625,0.018206,0.01593,0.018659,0.011674,0.012141,1.290729
min,-73.999999,40.700013,-73.999999,40.70002,0.0,0.0,0.0
25%,-73.987964,40.744901,-73.987143,40.745756,0.004943,0.00603,1.0
50%,-73.979629,40.758076,-73.978588,40.758542,0.010022,0.01165,1.0
75%,-73.967797,40.769602,-73.966459,40.770406,0.017673,0.02041,2.0
max,-73.900062,40.799952,-73.900062,40.799999,0.094065,0.094655,6.0


In [8]:
# Permutation importance of the second model (with displacement features),
# computed on its validation split.
new_perm = PermutationImportance(second_model, random_state=1)
new_perm.fit(new_X_val, new_y_val)

# Display the importance table, labelling rows with the extended feature names.
eli5.show_weights(new_perm, feature_names=new_features)

Weight,Feature
0.5672  ± 0.0248,abs_lat_change
0.4320  ± 0.0413,abs_lon_change
0.0609  ± 0.0196,dropoff_longitude
0.0600  ± 0.0167,pickup_longitude
0.0560  ± 0.0189,pickup_latitude
0.0492  ± 0.0084,dropoff_latitude
-0.0031  ± 0.0019,passenger_count
