# Project 3
**Jay Kynerd —** 
Uber Fares Dataset

# Importing the Data

In [174]:
import pandas as pd

with open('data/uber.csv') as f:
    uber = pd.DataFrame(pd.read_csv(f))

uber

Unnamed: 0,id,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


# Cleaning the Data

In [175]:
uber.set_index('id', inplace=True)

uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 24238194 to 11951496
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                200000 non-null  object 
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


**It looks line only one row is missing entries in the `dropoff_longitude` and `dropoff_latitude` columns. Let's get rid of that now:**

In [176]:
uber.dropna(axis=0, inplace=True)
uber.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 24238194 to 11951496
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                199999 non-null  object 
 1   fare_amount        199999 non-null  float64
 2   pickup_datetime    199999 non-null  object 
 3   pickup_longitude   199999 non-null  float64
 4   pickup_latitude    199999 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    199999 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


# Custom Transformers
**Looking forward to our model, we care more about the time of day than the month, year, etc. Let's change the `pickup_datetime` column to be the time of day:**

In [177]:
import numpy as np
from datetime import time
from sklearn.base import BaseEstimator, TransformerMixin

class DatetimeParser(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        
        time_objects = []

        for row in X[:,2]:
            time_string = row.split(' ')[1].split(':')
            hr, min, sec = int(time_string[0]), int(time_string[1]), int(time_string[2])
            time_object = time(hr, min, sec)

            time_objects.append(time_object)

        return np.c_[time_objects]

dtp = DatetimeParser()
times = dtp.transform(uber.values)

uber['pickup_time'] = times
uber = uber.drop('pickup_datetime', axis=1)

uber

Unnamed: 0_level_0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_time
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
24238194,2015-05-07 19:52:06.0000003,7.5,-73.999817,40.738354,-73.999512,40.723217,1,19:52:06
27835199,2009-07-17 20:04:56.0000002,7.7,-73.994355,40.728225,-73.994710,40.750325,1,20:04:56
44984355,2009-08-24 21:45:00.00000061,12.9,-74.005043,40.740770,-73.962565,40.772647,1,21:45:00
25894730,2009-06-26 08:22:21.0000001,5.3,-73.976124,40.790844,-73.965316,40.803349,3,08:22:21
17610152,2014-08-28 17:47:00.000000188,16.0,-73.925023,40.744085,-73.973082,40.761247,5,17:47:00
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,-73.987042,40.739367,-73.986525,40.740297,1,10:49:00
16382965,2014-03-14 01:09:00.0000008,7.5,-73.984722,40.736837,-74.006672,40.739620,1,01:09:00
27804658,2009-06-29 00:42:00.00000078,30.9,-73.986017,40.756487,-73.858957,40.692588,2,00:42:00
20259894,2015-05-20 14:56:25.0000004,14.5,-73.997124,40.725452,-73.983215,40.695415,1,14:56:25


**The attributes `pickup_longitude`, `pickup_latitude`, `dropoff_longitude`, and `dropoff_latitude` don't help us much on their own... let's calculate the distance using the coordinates and create a new column in the dataframe:**

In [178]:
import haversine as hs
from haversine import Unit


class DistanceParser(BaseEstimator, TransformerMixin):
    def __init__(self, units):
        self.units = units

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        distances = []
        
        for pickup1, pickup2, dropoff1, dropoff2 in zip(X[:,2], X[:,3], X[:,4], X[:,5]):
            distance = hs.haversine( (pickup1, pickup2), (dropoff1, dropoff2), unit=self.units )
            distances.append(distance)
        

        return np.c_[distances]


dp = DistanceParser(Unit.MILES)
distances = dp.transform(uber.values)

uber['distance'] = distances
uber.drop(columns=['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'], inplace=True)

uber

Unnamed: 0_level_0,key,fare_amount,passenger_count,pickup_time,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24238194,2015-05-07 19:52:06.0000003,7.5,1,19:52:06,0.289051
27835199,2009-07-17 20:04:56.0000002,7.7,1,20:04:56,0.421742
44984355,2009-08-24 21:45:00.00000061,12.9,1,21:45:00,2.997201
25894730,2009-06-26 08:22:21.0000001,5.3,3,08:22:21,0.783947
17610152,2014-08-28 17:47:00.000000188,16.0,5,17:47:00,3.336707
...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,1,10:49:00,0.039878
16382965,2014-03-14 01:09:00.0000008,7.5,1,01:09:00,1.517527
27804658,2009-06-29 00:42:00.00000078,30.9,2,00:42:00,8.863743
20259894,2015-05-20 14:56:25.0000004,14.5,1,14:56:25,1.118528


In [179]:
uber.describe()

Unnamed: 0,fare_amount,passenger_count,distance
count,199999.0,199999.0,199999.0
mean,11.359892,1.684543,12.736598
std,9.90176,1.385995,241.979518
min,-52.0,0.0,0.0
25%,6.0,1.0,0.512261
50%,8.5,1.0,0.948325
75%,12.5,2.0,1.744637
max,499.0,208.0,10141.618345


# Min/Max Scaler (SKLearn Preprocessing)

In [180]:
from sklearn import preprocessing as pp

uber['distance'] = pp.minmax_scale(uber['distance'])
uber

Unnamed: 0_level_0,key,fare_amount,passenger_count,pickup_time,distance
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24238194,2015-05-07 19:52:06.0000003,7.5,1,19:52:06,0.000029
27835199,2009-07-17 20:04:56.0000002,7.7,1,20:04:56,0.000042
44984355,2009-08-24 21:45:00.00000061,12.9,1,21:45:00,0.000296
25894730,2009-06-26 08:22:21.0000001,5.3,3,08:22:21,0.000077
17610152,2014-08-28 17:47:00.000000188,16.0,5,17:47:00,0.000329
...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,1,10:49:00,0.000004
16382965,2014-03-14 01:09:00.0000008,7.5,1,01:09:00,0.000150
27804658,2009-06-29 00:42:00.00000078,30.9,2,00:42:00,0.000874
20259894,2015-05-20 14:56:25.0000004,14.5,1,14:56:25,0.000110


# SGD Classifier

In [181]:
# formatting times for model
formatted_times = []

for pickup_time in uber['pickup_time']:
    formatted_times.append(int(pickup_time.strftime("%H%M%S")))

uber['pickup_time'] = formatted_times

In [182]:
# from sklearn.linear_model import SGDClassifier

# x, y = uber[['passenger_count', 'pickup_time', 'distance']], uber[['fare_amount']]

# x_train, x_test, y_train, y_test = x[:180000], x[19999:], y[:180000], y[19999:]

# test = y_train.values.ravel()

# sgd_clf = SGDClassifier(random_state=42)
# sgd_clf.fit(x_train.values, test)




# Linear Regression

In [183]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)

LinearRegression()

# Decision Tree Regressor

In [184]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(x_train, y_train)

DecisionTreeRegressor()

# K-fold Cross Validation

In [185]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, x_train, y_train,
                         scoring="neg_mean_squared_error", cv=10)

tree_rmse_scores = np.sqrt(-scores)

print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard deviation:", scores.std())

Scores: [-66.96497554 -66.49540764 -63.15813137 -58.29422338 -61.54971386
 -59.84340203 -56.76634985 -62.53716823 -64.78206415 -73.76331951]
Mean: -63.415475556098144
Standard deviation: 4.674899922554436


# Evaluate Using Test Data

In [186]:
lin_reg_results = lin_reg.predict(x_test)

y_test['lin_reg'] = lin_reg_results
y_test['difference'] = y_test['lin_reg'] - y_test['fare_amount']

display(y_test.head())
y_test['difference'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['lin_reg'] = lin_reg_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['difference'] = y_test['lin_reg'] - y_test['fare_amount']


Unnamed: 0_level_0,fare_amount,lin_reg,difference
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14990730,7.0,11.715727,4.715727
32968831,29.5,11.317135,-18.182865
22247475,8.5,11.688471,3.188471
48411131,9.0,11.650239,2.650239
32985180,12.1,11.41362,-0.68638


count    180000.000000
mean          0.008805
std           9.856986
min        -487.401095
25%          -1.276610
50%           2.945450
75%           5.370123
max          68.592263
Name: difference, dtype: float64

In [187]:
tree_reg_results = tree_reg.predict(x_test)

y_test.drop(columns=['lin_reg', 'difference'], inplace=True)

y_test['tree_reg'] = tree_reg_results
y_test['difference'] = y_test['tree_reg'] - y_test['fare_amount']

display(y_test.head())
y_test['difference'].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['tree_reg'] = tree_reg_results
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['difference'] = y_test['tree_reg'] - y_test['fare_amount']


Unnamed: 0_level_0,fare_amount,tree_reg,difference
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14990730,7.0,7.0,0.0
32968831,29.5,29.5,0.0
22247475,8.5,8.5,0.0
48411131,9.0,9.0,0.0
32985180,12.1,12.1,0.0


count    180000.000000
mean          0.022504
std           2.946714
min        -243.700000
25%           0.000000
50%           0.000000
75%           0.000000
max         143.930000
Name: difference, dtype: float64