In [1]:
import pickle

import pandas as pd
import xgboost as xg
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split

**price model:**

In [2]:
# Load the price-model dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
price_df = pd.read_csv('price_model_data.zip').drop(columns=['Unnamed: 0'])
price_df.head()

Unnamed: 0,pickup_day,pickup_hour,pickup_minute,passenger_count,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,fare_amount
0,-0.712856,-0.707044,-1.617379,1.855659,1.860723,0.04755,-0.306325,2.83543,0.403426,1.620933
1,-0.712856,-0.707044,-1.617379,1.855659,2.997441,-0.762775,-1.266622,0.414554,1.332761,2.948632
2,-0.712856,-0.707044,-1.617379,-0.538892,-0.68364,0.087911,-0.241193,-0.760951,0.083656,-0.27578
3,-0.712856,-0.707044,-1.617379,-0.538892,0.500196,-1.012038,-1.103952,-0.935617,-2.195185,0.198398
4,-0.712856,-0.707044,-1.617379,-0.538892,-0.925119,-1.465767,-0.228484,-0.841647,-0.351103,-0.844794


In [3]:
# Every column except the regression target is used as a model feature.
target_p = 'fare_amount'
features_p = price_df.columns.tolist()
features_p.remove(target_p)  # raises ValueError if the target column is missing

In [4]:
# Separate the feature matrix from the target vector.
X_p, y_p = price_df[features_p], price_df[target_p]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
(X_p_train, X_p_test,
 y_p_train, y_p_test) = train_test_split(X_p, y_p, test_size=0.2, random_state=42)

# Display the shapes of the four partitions as a sanity check.
tuple(part.shape for part in (X_p_train, X_p_test, y_p_train, y_p_test))

((53481, 9), (13371, 9), (53481,), (13371,))

In [5]:
# Baseline for the price target: ordinary least-squares linear regression.
baseline_p = LinearRegression()
baseline_p.fit(X_p_train, y_p_train)
baseline_p_pred = baseline_p.predict(X_p_test)

# Score both metrics in a single cross-validation pass over the training
# split. Two separate cross_val_score calls would refit the estimator on the
# same folds twice just to read a different metric.
baseline_p_cv = cross_validate(
    baseline_p, X_p_train, y_p_train,
    scoring=('r2', 'neg_root_mean_squared_error'),
)
baseline_p_r2_scores = baseline_p_cv['test_r2']
# The scorer reports negated RMSE (higher is better); flip the sign back.
baseline_p_rmse_scores = -baseline_p_cv['test_neg_root_mean_squared_error']

# Average the per-fold scores.
baseline_p_r2 = baseline_p_r2_scores.mean()
baseline_p_rmse = baseline_p_rmse_scores.mean()

In [6]:
# Gradient-boosted regressor for the price target.
price_regr = xg.XGBRegressor(objective='reg:squarederror', n_estimators=100)
# Cross-validation below fits clones of the estimator, so this explicit fit is
# what leaves `price_regr` itself trained on the training split.
price_regr.fit(X_p_train, y_p_train)

# Score both metrics in a single cross-validation pass. Two separate
# cross_val_score calls would train every fold's booster twice on identical
# folds just to read a different metric.
price_cv = cross_validate(
    price_regr, X_p_train, y_p_train,
    scoring=('r2', 'neg_root_mean_squared_error'),
)
price_r2_scores = price_cv['test_r2']
# The scorer reports negated RMSE (higher is better); flip the sign back.
price_rmse_scores = -price_cv['test_neg_root_mean_squared_error']

# Average the per-fold scores.
price_r2 = price_r2_scores.mean()
price_rmse = price_rmse_scores.mean()

In [7]:
# Report the mean cross-validated scores of the baseline and the boosted model.
# A single print with sep='\n' emits exactly what two consecutive prints would.
print(
    f'price baseline model scores:\n\nr2: {baseline_p_r2:.3f}\trmse:{baseline_p_rmse:.3f}\n\n',
    f'price model scores:\n\nr2: {price_r2:.3f}\trmse:{price_rmse:.3f}',
    sep='\n',
)

price baseline model scores:

r2: 0.920	rmse:0.282


price model scores:

r2: 0.949	rmse:0.226


**duration model:**

In [8]:
# Load the duration-model dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv -- TODO confirm).
duration_df = pd.read_csv('duration_model_data.zip').drop(columns=['Unnamed: 0'])
duration_df.head()

Unnamed: 0,pickup_day,pickup_hour,pickup_minute,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,duration
0,-0.712856,-0.707044,-1.617379,1.860723,0.04755,-0.306325,2.83543,0.403426,1.08766
1,-0.712856,-0.707044,-1.617379,2.997441,-0.762775,-1.266622,0.414554,1.332761,2.301171
2,-0.712856,-0.707044,-1.617379,-0.68364,0.087911,-0.241193,-0.760951,0.083656,0.116852
3,-0.712856,-0.707044,-1.617379,0.500196,-1.012038,-1.103952,-0.935617,-2.195185,-0.000822
4,-0.712856,-0.707044,-1.617379,-0.925119,-1.465767,-0.228484,-0.841647,-0.351103,-0.711767


In [9]:
# Every column except the regression target is used as a model feature.
target_d = 'duration'
features_d = duration_df.columns.tolist()
features_d.remove(target_d)  # raises ValueError if the target column is missing

In [10]:
# Separate the feature matrix from the target vector.
X_d, y_d = duration_df[features_d], duration_df[target_d]

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
(X_d_train, X_d_test,
 y_d_train, y_d_test) = train_test_split(X_d, y_d, test_size=0.2, random_state=42)

# Display the shapes of the four partitions as a sanity check.
tuple(part.shape for part in (X_d_train, X_d_test, y_d_train, y_d_test))

((53481, 8), (13371, 8), (53481,), (13371,))

In [11]:
# Baseline for the duration target: ordinary least-squares linear regression.
baseline_d = LinearRegression()
baseline_d.fit(X_d_train, y_d_train)
baseline_d_pred = baseline_d.predict(X_d_test)

# Score both metrics in a single cross-validation pass over the training
# split. Two separate cross_val_score calls would refit the estimator on the
# same folds twice just to read a different metric.
baseline_d_cv = cross_validate(
    baseline_d, X_d_train, y_d_train,
    scoring=('r2', 'neg_root_mean_squared_error'),
)
baseline_d_r2_scores = baseline_d_cv['test_r2']
# The scorer reports negated RMSE (higher is better); flip the sign back.
baseline_d_rmse_scores = -baseline_d_cv['test_neg_root_mean_squared_error']

# Average the per-fold scores.
baseline_d_r2 = baseline_d_r2_scores.mean()
baseline_d_rmse = baseline_d_rmse_scores.mean()

In [12]:
# Gradient-boosted regressor for the duration target.
duration_regr = xg.XGBRegressor(objective='reg:squarederror', n_estimators=100)
# Cross-validation below fits clones of the estimator, so this explicit fit is
# what leaves `duration_regr` itself trained on the training split.
duration_regr.fit(X_d_train, y_d_train)

# Score both metrics in a single cross-validation pass. Two separate
# cross_val_score calls would train every fold's booster twice on identical
# folds just to read a different metric.
duration_cv = cross_validate(
    duration_regr, X_d_train, y_d_train,
    scoring=('r2', 'neg_root_mean_squared_error'),
)
duration_r2_scores = duration_cv['test_r2']
# The scorer reports negated RMSE (higher is better); flip the sign back.
duration_rmse_scores = -duration_cv['test_neg_root_mean_squared_error']

# Average the per-fold scores.
duration_r2 = duration_r2_scores.mean()
duration_rmse = duration_rmse_scores.mean()

In [13]:
# Report the mean cross-validated scores of the baseline and the boosted model.
# A single print with sep='\n' emits exactly what two consecutive prints would.
print(
    f'duration baseline model scores:\n\nr2: {baseline_d_r2:.3f}\trmse:{baseline_d_rmse:.3f}\n\n',
    f'duration model scores:\n\nr2: {duration_r2:.3f}\trmse:{duration_rmse:.3f}',
    sep='\n',
)

duration baseline model scores:

r2: 0.687	rmse:0.558


duration model scores:

r2: 0.833	rmse:0.408


**refitting the final models on all data and saving them:**

In [14]:
# Refit the price model on the full dataset (train and test rows together)
# before persisting it.
price_regr.fit(X_p, y_p)

filename = 'price_model.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() passed to pickle.dump is never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(price_regr, model_file)

In [15]:
# Refit the duration model on the full dataset (train and test rows together)
# before persisting it.
duration_regr.fit(X_d, y_d)

filename = 'duration_model.pkl'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() passed to pickle.dump is never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(duration_regr, model_file)