# Predicting Uber Ride Fares

## Austin Nguyen & Hilary Le

## Dataset: Uber Fares Dataset from Kaggle &mdash; https://www.kaggle.com/datasets/yasserh/uber-fares-dataset

In [45]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Import Dataset & Preprocess Data

In [34]:
# Read the raw fares CSV; the path is resolved relative to the notebook's
# working directory.
RAW_CSV_PATH = 'uber.csv'
data = pd.read_csv(RAW_CSV_PATH)

# Bare last expression -> rich (truncated) display of the loaded frame.
data

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [35]:
# Parse the pickup timestamp strings; errors='coerce' maps values that cannot
# be parsed to NaT instead of raising.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'], errors='coerce')

# Expand the timestamp into one numeric column per component
# (pickup_year, pickup_month, ..., pickup_second), in this order.
pickup_dt = data['pickup_datetime'].dt
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[f'pickup_{component}'] = getattr(pickup_dt, component)

# The timestamp column itself is removed once its components are extracted.
data.drop(columns=["pickup_datetime"], inplace=True)
data

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,24238194,2015-05-07 19:52:06.0000003,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,27835199,2009-07-17 20:04:56.0000002,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,44984355,2009-08-24 21:45:00.00000061,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,25894730,2009-06-26 08:22:21.0000001,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,17610152,2014-08-28 17:47:00.000000188,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,16382965,2014-03-14 01:09:00.0000008,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,27804658,2009-06-29 00:42:00.00000078,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,20259894,2015-05-20 14:56:25.0000004,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [36]:
# Remove the two leading non-feature columns ("Unnamed: 0" and "key") in place.
# NOTE(review): from the displayed rows these look like a row id and a
# timestamp-like record key — confirm neither is needed downstream.
data.drop(columns=["Unnamed: 0", "key"], inplace=True)

In [37]:
# Display the frame after the "Unnamed: 0" and "key" columns were dropped.
data

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [38]:
# validate latitude and longitude
# latitude: [-90, 90]
# longitude: [-180, 180]
#
# Series.between is inclusive on both ends by default, so coordinates lying
# exactly on a boundary (e.g. latitude 90) are kept, matching the closed
# intervals documented above. The previous strict </> comparisons dropped them.
# Rows with a missing (NaN) coordinate still evaluate to False and are removed.

valid_coords = (
    data.pickup_latitude.between(-90, 90)
    & data.dropoff_latitude.between(-90, 90)
    & data.pickup_longitude.between(-180, 180)
    & data.dropoff_longitude.between(-180, 180)
)

# Keep only the rows where all four coordinates are inside their valid range.
data = data[valid_coords]

In [39]:
# Display the frame after the latitude/longitude range filter.
data

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,19,52,6
1,7.7,-73.994355,40.728225,-73.994710,40.750325,1,2009,7,17,20,4,56
2,12.9,-74.005043,40.740770,-73.962565,40.772647,1,2009,8,24,21,45,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,8,22,21
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,17,47,0
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,3.0,-73.987042,40.739367,-73.986525,40.740297,1,2012,10,28,10,49,0
199996,7.5,-73.984722,40.736837,-74.006672,40.739620,1,2014,3,14,1,9,0
199997,30.9,-73.986017,40.756487,-73.858957,40.692588,2,2009,6,29,0,42,0
199998,14.5,-73.997124,40.725452,-73.983215,40.695415,1,2015,5,20,14,56,25


In [40]:
# Print each column's dtype and non-null count, plus memory usage, for the filtered frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 199987 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   fare_amount        199987 non-null  float64
 1   pickup_longitude   199987 non-null  float64
 2   pickup_latitude    199987 non-null  float64
 3   dropoff_longitude  199987 non-null  float64
 4   dropoff_latitude   199987 non-null  float64
 5   passenger_count    199987 non-null  int64  
 6   pickup_year        199987 non-null  int32  
 7   pickup_month       199987 non-null  int32  
 8   pickup_day         199987 non-null  int32  
 9   pickup_hour        199987 non-null  int32  
 10  pickup_minute      199987 non-null  int32  
 11  pickup_second      199987 non-null  int32  
dtypes: float64(5), int32(6), int64(1)
memory usage: 15.3 MB


In [43]:
# Target: the fare to predict.
y = data["fare_amount"]

# Features: every remaining column (coordinates, passenger count, pickup time parts).
X = data.drop(columns="fare_amount")

### Linear Regression

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
TEST_FRACTION = 0.3
SPLIT_SEED = 1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [48]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit() returns the estimator),
# then predict fares for the held-out rows.
lin_reg_model = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg_model.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE, and R2 comes
# from the estimator's own score() on the same test data.
linear_mse = mean_squared_error(y_test, y_pred)
linear_rmse = np.sqrt(linear_mse)
linear_mae = mean_absolute_error(y_test, y_pred)
linear_r2 = lin_reg_model.score(X_test, y_test)

print("linear regression")

# One "<label>: <value>" line per metric, in a fixed order.
for label, value in (
    ("mse", linear_mse),
    ("rmse", linear_rmse),
    ("mae", linear_mae),
    ("R2", linear_r2),
):
    print(f"{label}: {value}")

linear regression
mse: 95.35109378541839
rmse: 9.764788465984216
mae: 5.980208070927663
R2: 0.016491404474989313


### Ridge Regression

### Polynomial Regression

### Support Vector Regression

### Decision Tree Regression

### Random Forest Regression