# Predicting Uber Ride Fares

## Austin Nguyen & Hilary Le

## Dataset: Uber Fares Datasets from Kaggle &mdash; www.kaggle.com/datasets/yasserh/uber-fares-dataset 

In [7]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

## Training & Testing Various Models & using MSE, MAE, RMSE, & R2 Score to evaluate models

### Import Dataset & Preprocess Data

In [8]:
data = pd.read_csv('uber.csv')
data

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [9]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], errors='coerce')

data['pickup_year'] = data['pickup_datetime'].dt.year
data['pickup_month'] = data['pickup_datetime'].dt.month
data['pickup_day'] = data['pickup_datetime'].dt.day
data['pickup_hour'] = data['pickup_datetime'].dt.hour
data['pickup_minute'] = data['pickup_datetime'].dt.minute
data['pickup_second'] = data['pickup_datetime'].dt.second

data.drop(["pickup_datetime"], axis=1, inplace=True)
data

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,24238194,2015-05-07 19:52:06.0000003,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,27835199,2009-07-17 20:04:56.0000002,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,44984355,2009-08-24 21:45:00.00000061,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,25894730,2009-06-26 08:22:21.0000001,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,17610152,2014-08-28 17:47:00.000000188,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,16382965,2014-03-14 01:09:00.0000008,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,27804658,2009-06-29 00:42:00.00000078,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,20259894,2015-05-20 14:56:25.0000004,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [10]:
data.drop(["Unnamed: 0", "key"], axis=1, inplace=True)

In [11]:
data

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [12]:
# validate latitude and longitude
# latitude: [-90, 90]
# longitude: [-180, 180]

data = data[
    (data.pickup_latitude < 90) & (data.dropoff_latitude > -90) &
    (data.pickup_latitude > -90) & (data.dropoff_latitude < 90) &
    (data.pickup_longitude > -180) & (data.pickup_longitude < 180)&
    (data.dropoff_longitude > -180) & (data.dropoff_longitude < 180)
]

In [13]:
data

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199987 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        199987 non-null  float64
 1   pickup_longitude   199987 non-null  float64
 2   pickup_latitude    199987 non-null  float64
 3   dropoff_longitude  199987 non-null  float64
 4   dropoff_latitude   199987 non-null  float64
 5   passenger_count    199987 non-null  int64  
 6   pickup_year        199987 non-null  int32  
 7   pickup_month       199987 non-null  int32  
 8   pickup_day         199987 non-null  int32  
 9   pickup_hour        199987 non-null  int32  
 10  pickup_minute      199987 non-null  int32  
 11  pickup_second      199987 non-null  int32  
dtypes: float64(5), int32(6), int64(1)
memory usage: 15.3 MB


In [15]:
from sklearn.preprocessing import StandardScaler

X = data.drop("fare_amount", axis=1)
y = data["fare_amount"]

### Linear Regression

In [16]:
from sklearn.model_selection import train_test_split

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

lin_reg_model = LinearRegression()

lin_reg_model.fit(X_train, y_train)

y_pred = lin_reg_model.predict(X_test)

linear_mse = mean_squared_error(y_test, y_pred)
linear_rmse = np.sqrt(linear_mse)
linear_mae = mean_absolute_error(y_test, y_pred)

print("linear regression")

print(f"mse: {linear_mse}")
print(f"rmse: {linear_rmse}")
print(f"mae: {linear_mae}")
print(f"R2: {r2_score(y_test, y_pred)} ")

linear regression
mse: 95.35109378541839
rmse: 9.764788465984216
mae: 5.980208070927663
R2: 0.016491404474989313 


### Ridge Regression w/ Cross Validation

In [20]:
from sklearn.linear_model import RidgeCV

k = 5

alphas = np.logspace(-3, 3, 10)

ridge_cv = RidgeCV(alphas=alphas, cv=k)
ridge_cv.fit(X_train, y_train)

y_pred = ridge_cv.predict(X_test)

ridge_mse = mean_squared_error(y_test, y_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_mae = mean_absolute_error(y_test, y_pred)

print("ridge regression")

print(f"mse: {ridge_mse}")
print(f"rmse: {ridge_rmse}")
print(f"mae: {ridge_mae}")
print(f"R2: {ridge_cv.score(X_test, y_test)}")

ridge regression
mse: 95.35105523075077
rmse: 9.764786491815926
mae: 5.98019564404649
R2: 0.016491802151046175


### Polynomial Regression

In [21]:
from sklearn.model_selection import cross_val_score
for d in range(1, 4):
    poly = PolynomialFeatures(degree=d, include_bias=False)
    X_poly = poly.fit_transform(X)
    
    model = LinearRegression()
    scores = cross_val_score(model, X_poly, y, cv=5, scoring='neg_mean_squared_error')
    mean_score = -np.mean(scores)
    print(f"deg {d}, mean mse: {mean_score:.2f}")

deg 1, mean mse: 96.48
deg 2, mean mse: 170.14
deg 3, mean mse: 773848.59


In [22]:
poly2 = PolynomialFeatures(degree=2, include_bias=False)
X_poly2_train = poly2.fit_transform(X_train)

poly2_model = LinearRegression()
poly2_model.fit(X_poly2_train, y_train)

In [23]:
poly2_pred = poly2_model.predict(poly2.transform(X_test))

poly2_mse = mean_squared_error(y_test, poly2_pred)
poly2_rmse = np.sqrt(poly2_mse)
poly2_mae = mean_absolute_error(y_test, poly2_pred)

print("polynomial regression")

print(f"mse: {poly2_mse}")
print(f"rmse: {poly2_rmse}")
print(f"mae: {poly2_mae}")
print(f"R2: {poly2_model.score(poly2.transform(X_test), y_test)}")

polynomial regression
mse: 349.05191432956184
rmse: 18.682931095777285
mae: 5.679087501661748
R2: -2.600331620737867


### Decision Tree Regression

In [24]:
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(random_state=42)

dt.fit(X_train, y_train)

In [25]:
dt_y_pred = dt.predict(X_test)

dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_rmse = np.sqrt(dt_mse)
dt_mae = mean_absolute_error(y_test, dt_y_pred)

print("decision tree regression")

print(f"mse: {dt_mse}")
print(f"rmse: {dt_rmse}")
print(f"mae: {dt_mae}")
print(f"R2: {dt.score(X_test, y_test)}")

decision tree regression
mse: 44.27722347450706
rmse: 6.6541132748479015
mae: 2.9285177592212945
R2: 0.5432980562219946


### Random Forest Regression

In [26]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42)

rf.fit(X_train, y_train)

In [27]:
rf_y_pred = rf.predict(X_test)

rf_mse = mean_squared_error(y_test, rf_y_pred)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, rf_y_pred)

print("random forest regression")

print(f"mse: {rf_mse}")
print(f"rmse: {rf_rmse}")
print(f"mae: {rf_mae}")
print(f"R2: {rf.score(X_test, y_test)}")

random forest regression
mse: 20.311349953682686
rmse: 4.506811506340451
mae: 2.0382314849075787
R2: 0.7904965064048552


### Extra Tree Regression

In [63]:
from sklearn.ensemble import ExtraTreesRegressor

n = {25, 50, 100, 200}
best_n = 0
best_score = 0

for i in n:
    et = ExtraTreesRegressor(n_estimators=i, max_depth=None, random_state=42)
    et.fit(X_train, y_train)

    et_y_pred = et.predict(X_test)

    if et.score(X_test, y_test) > best_score or best_n == 0:
        best_n = i
        best_score = et.score(X_test, y_test)
        et_model = et

print(f"best n: {best_n}")

best n: 200


In [64]:
y_pred = et_model.predict(X_test)

et_mse = mean_squared_error(y_test, y_pred)
et_rmse = np.sqrt(et_mse)
et_mae = mean_absolute_error(y_test, y_pred)

print("extra trees regression")

print(f"mse: {et_mse}")
print(f"rmse: {et_rmse}")
print(f"mae: {et_mae}")
print(f"R2: {et_model.score(X_test, y_test)}")

extra trees regression
mse: 30.339128040769914
rmse: 5.508096589636924
mae: 2.933619136790173
R2: 0.6870639651393888


### Gradient Boosting Machines (GBM)

In [29]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    n_estimators=100,       
    learning_rate=0.1,      
    max_depth=6,         
    subsample=0.8,          
    colsample_bytree=0.8     
)

xgb_model.fit(X_train, y_train)

In [32]:
y_pred = xgb_model.predict(X_test)

xgb_mse = mean_squared_error(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_mae = mean_absolute_error(y_test, y_pred)

print("lightgbm regression")

print(f"mse: {xgb_mse}")
print(f"rmse: {xgb_rmse}")
print(f"mae: {xgb_mae}")
print(f"R2: {xgb_model.score(X_test, y_test)}")

lightgbm regression
mse: 21.91111331542891
rmse: 4.680930817201736
mae: 2.344890541616419
R2: 0.773995583818438


### Adaboost Regression

In [75]:
from sklearn.tree import DecisionTreeRegressor

base_estimator = DecisionTreeRegressor(max_depth=4)

n={5, 7, 9, 10, 25, 50, 100}

for i in n:
    curr_adaboost_model = AdaBoostRegressor(
        estimator=base_estimator,  
        n_estimators=i,              
        learning_rate=0.1,             
        loss='square',                
        random_state=42                
    )

    curr_adaboost_model.fit(X_train, y_train)
    y_pred = curr_adaboost_model.predict(X_test)

    score = curr_adaboost_model.score(X_test, y_test)
    print(f"n_estimators: {i}, R2: {score}")

    if curr_adaboost_model.score(X_test, y_test) > best_score or best_n == 0:
        best_n = i
        best_score = curr_adaboost_model.score(X_test, y_test)
        adaboost_model = curr_adaboost_model

print(f"best n: {best_n}")

n_estimators: 50, R2: 0.508519020015948
n_estimators: 100, R2: 0.4977872759832621
n_estimators: 5, R2: 0.5747323394459198
n_estimators: 7, R2: 0.5765104221992268
n_estimators: 9, R2: 0.5705371587427734
n_estimators: 10, R2: 0.5698477462054738
n_estimators: 25, R2: 0.5498553414010761
best n: 7


In [76]:
y_pred = adaboost_model.predict(X_test)

adaboost_mse = mean_squared_error(y_test, y_pred)
adaboost_rmse = np.sqrt(adaboost_mse)
adaboost_mae = mean_absolute_error(y_test, y_pred)

print("adaboost regression")

print(f"mse: {adaboost_mse}")
print(f"rmse: {adaboost_rmse}")
print(f"mae: {adaboost_mae}")
print(f"R2: {adaboost_model.score(X_test, y_test)}")

adaboost regression
mse: 41.05728677284539
rmse: 6.407596021351954
mae: 4.11359860924032
R2: 0.5765104221992268


### ALL MODELS & METRICS

In [None]:
print("linear regression")

print(f"mse: {linear_mse}")
print(f"rmse: {linear_rmse}")
print(f"mae: {linear_mae}")
print(f"R2: {lin_reg_model.score(X_test, y_test)}")
print("\n")

print("ridge regression")

print(f"mse: {ridge_mse}")
print(f"rmse: {ridge_rmse}")
print(f"mae: {ridge_mae}")
print(f"R2: {ridge_cv.score(X_test, y_test)}")
print("\n")

print("polynomial regression")

print(f"mse: {poly2_mse}")
print(f"rmse: {poly2_rmse}")
print(f"mae: {poly2_mae}")
print(f"R2: {poly2_model.score(poly2.transform(X_test), y_test)}")
print("\n")

print("decision tree regression")

print(f"mse: {dt_mse}")
print(f"rmse: {dt_rmse}")
print(f"mae: {dt_mae}")
print(f"R2: {dt.score(X_test, y_test)}")
print("\n")

print("random forest regression")

print(f"mse: {rf_mse}")
print(f"rmse: {rf_rmse}")
print(f"mae: {rf_mae}")
print(f"R2: {rf.score(X_test, y_test)}")
print("\n")

print("extra trees regression")

print(f"mse: {et_mse}")
print(f"rmse: {et_rmse}")
print(f"mae: {et_mae}")
print(f"R2: {et_model.score(X_test, y_test)}")  
print("\n")

print("lightgbm regression")

print(f"mse: {xgb_mse}")
print(f"rmse: {xgb_rmse}")
print(f"mae: {xgb_mae}")
print(f"R2: {xgb_model.score(X_test, y_test)}")
print("\n")

print("adaboost regression")

print(f"mse: {adaboost_mse}")
print(f"rmse: {adaboost_rmse}")
print(f"mae: {adaboost_mae}")
print(f"R2: {adaboost_model.score(X_test, y_test)}")

print("\n")


linear regression
mse: 95.35109378541839
rmse: 9.764788465984216
mae: 5.980208070927663
R2: 0.016491404474989313


ridge regression
mse: 95.35105523075077
rmse: 9.764786491815926
mae: 5.98019564404649
R2: 0.016491802151046175


polynomial regression
mse: 349.05191432956184
rmse: 18.682931095777285
mae: 5.679087501661748
R2: -2.600331620737867


decision tree regression
mse: 44.27722347450706
rmse: 6.6541132748479015
mae: 2.9285177592212945
R2: 0.5432980562219946


random forest regression
mse: 20.311349953682686
rmse: 4.506811506340451
mae: 2.0382314849075787
R2: 0.7904965064048552


extra trees regression
mse: 30.339128040769914
rmse: 5.508096589636924
mae: 2.933619136790173
R2: 0.6870639651393888


lightgbm regression
mse: 21.91111331542891
rmse: 4.680930817201736
mae: 2.344890541616419
R2: 0.773995583818438


adaboost regression
mse: 41.05728677284539
rmse: 6.407596021351954
mae: 4.11359860924032
R2: 0.5765104221992268


In [78]:
best_model = rf
y_pred = best_model.predict(X_test)

best_mse = mean_squared_error(y_test, y_pred)
best_rmse = np.sqrt(best_mse)
best_mae = mean_absolute_error(y_test, y_pred)

print("best model: random forest regression")

print(f"mse: {best_mse}")
print(f"rmse: {best_rmse}")
print(f"mae: {best_mae}")
print(f"R2: {best_model.score(X_test, y_test)}")

best model: random forest regression
mse: 20.311349953682686
rmse: 4.506811506340451
mae: 2.0382314849075787
R2: 0.7904965064048552
