In [42]:
# load some default Python modules
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.style.use('seaborn-whitegrid')

import warnings
warnings.filterwarnings('ignore')



In [54]:
new_test = pd.read_csv("../data/price_test.csv")
new_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [55]:
train = pd.read_csv("../data/price_train.csv", nrows = 5000000)
test = pd.read_csv("../data/price_test.csv")
print("shape of train data", train.shape)
train.head()

shape of train data (5000000, 8)


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
train.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5000000.0,5000000.0,5000000.0,4999964.0,4999964.0,5000000.0
mean,11.3408,-72.50678,39.91974,-72.50652,39.91725,1.684695
std,9.820175,12.8097,8.963509,12.84777,9.486767,1.331854
min,-100.0,-3426.609,-3488.08,-3412.653,-3488.08,0.0
25%,6.0,-73.99206,40.73491,-73.99139,40.73404,1.0
50%,8.5,-73.98181,40.75263,-73.98016,40.75315,1.0
75%,12.5,-73.96711,40.76712,-73.96367,40.76811,2.0
max,1273.31,3439.426,3310.364,3457.622,3345.917,208.0


In [4]:
#The minimum fare for a taxi is 2.5 so drop rows with fares lower than that
train = train[train.fare_amount >=2.5]

In [5]:
#check dtypes
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [6]:
# check missing data
train.isnull().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    36
dropoff_latitude     36
passenger_count       0
dtype: int64

In [7]:
train = train.dropna(how='any', axis=0)

In [8]:
train.shape

(4999554, 8)

In [9]:
#Get max and min of coordinates from test and delete those from train

In [10]:
min_drop_long = min(test.pickup_longitude.min(),test.dropoff_longitude.min())
max_drop_lat = max(test.pickup_longitude.max(),test.dropoff_longitude.max())
print(min_drop_long, max_drop_lat)

-74.263242 -72.986532


In [11]:
min_pick_long = min(test.pickup_latitude.min(),test.dropoff_latitude.min())
min_pick_lat =max(test.pickup_latitude.max(),test.dropoff_latitude.max())
print(min_pick_long, min_pick_lat)

40.568973 41.709555


In [12]:
def narrow_area(df, boundary):
    return (df.pickup_longitude >= boundary[0]) & (df.pickup_longitude <= boundary[1]) & \
           (df.pickup_latitude >= boundary[2]) & (df.pickup_latitude <= boundary[3]) & \
           (df.dropoff_longitude >= boundary[0]) & (df.dropoff_longitude <= boundary[1]) & \
           (df.dropoff_latitude >= boundary[2]) & (df.dropoff_latitude <= boundary[3])

In [14]:
boundary = (min_drop_long, max_drop_lat, min_pick_long, min_pick_lat)
train = train[narrow_area(train, boundary)]

In [16]:
def prepare_time_features(df):
    df['pickup_datetime'] = df['pickup_datetime'].str.slice(0, 16)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    df['hour_of_day'] = df.pickup_datetime.dt.hour
    df['month'] = df.pickup_datetime.dt.month
    df["year"] = df.pickup_datetime.dt.year
    df["weekday"] = df.pickup_datetime.dt.weekday
    return df

In [17]:
train = prepare_time_features(train)
test = prepare_time_features(test)

In [18]:
# calculate-distance-between-two-latitude-longitude-points-haversine-formula 
# Returns distance in miles
def distance(lat1, lon1, lat2, lon2):
    p = 0.017453292519943295 # Pi/180
    a = 0.5 - np.cos((lat2 - lat1) * p)/2 + np.cos(lat1 * p) * np.cos(lat2 * p) * (1 - np.cos((lon2 - lon1) * p)) / 2
    return 0.6213712 * 12742 * np.arcsin(np.sqrt(a))   # 2*R*asin...

In [19]:
train['distance_miles'] = distance(train.pickup_latitude, train.pickup_longitude, \
                                      train.dropoff_latitude, train.dropoff_longitude)

In [20]:
test['distance_miles'] = distance(test.pickup_latitude, test.pickup_longitude, \
                                      test.dropoff_latitude, test.dropoff_longitude)

In [21]:
def transform(data):
    # Distances to nearby airports, 
    jfk = (-73.7781, 40.6413)
    ewr = (-74.1745, 40.6895)
    lgr = (-73.8740, 40.7769)

    data['distance_to_jfk'] = distance(jfk[1], jfk[0],
                                         data['pickup_latitude'], data['pickup_longitude'])
    data['distance_to_ewr'] = distance(ewr[1], ewr[0], 
                                          data['pickup_latitude'], data['pickup_longitude'])
    data['distance_to_lgr'] = distance(lgr[1], lgr[0],
                                          data['pickup_latitude'], data['pickup_longitude'])
    
    return data

train = transform(train)
test = transform(test)

In [23]:
train.shape

(4892491, 16)

In [30]:
train.columns

Index(['key', 'fare_amount', 'pickup_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'hour_of_day', 'month', 'year', 'weekday',
       'distance_miles', 'distance_to_jfk', 'distance_to_ewr',
       'distance_to_lgr'],
      dtype='object')

In [32]:
X = train.drop(columns= ['key','pickup_datetime'], axis= 1).copy()
Y = test.drop(columns= ['key','pickup_datetime'], axis= 1).copy()
print(df_train.shape)
print(df_test.shape)

(4892491, 14)
(9914, 13)


In [33]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('fare_amount', axis=1), df_train['fare_amount'], test_size=0.2, random_state = 42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3913992, 13)
(978499, 13)
(3913992,)
(978499,)


In [43]:
from sklearn.metrics import mean_squared_error as MSE

In [46]:
%%time

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs=-1)
rf.fit(X_train, y_train)
print(rf.score(X_train, y_train), rf.score(X_test, y_test))
print(np.sqrt(MSE(y_test, rf.predict(X_test))))

0.971840706106586 0.8476830540753778
3.7677752840332093
CPU times: user 56min 31s, sys: 18.3 s, total: 56min 49s
Wall time: 4min 59s


In [50]:
predictions = rf.predict(Y)

In [51]:
preds = pd.DataFrame({'id': test.key, 'fare_amount': predictions})
preds.head(10)

Unnamed: 0,id,fare_amount
0,2015-01-27 13:08:24.0000002,10.05
1,2015-01-27 13:08:24.0000003,12.1
2,2011-10-08 11:53:44.0000002,4.62
3,2012-12-01 21:12:12.0000002,9.25
4,2012-12-01 21:12:12.0000003,16.68
5,2012-12-01 21:12:12.0000005,9.05
6,2011-10-06 12:10:20.0000001,5.62
7,2011-10-06 12:10:20.0000003,48.84
8,2011-10-06 12:10:20.0000002,10.54
9,2014-02-18 15:22:20.0000002,6.25


In [135]:
Y.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,distance_to_jfk,distance_to_ewr,distance_to_lgr
0,-73.97332,40.763805,-73.98143,40.743835,1,13,1,2015,1,1.443607,13.274326,11.718491,5.275249
1,-73.986862,40.719383,-73.998886,40.739201,1,13,1,2015,1,1.507044,12.196699,10.042731,7.119935
2,-73.982524,40.75126,-73.979654,40.746139,1,11,10,2011,5,0.384398,13.13008,10.921174,5.949126
3,-73.98116,40.767807,-73.990448,40.751635,1,21,12,2012,5,1.218529,13.766807,11.478334,5.642243
4,-73.966046,40.789775,-73.988565,40.744427,1,21,12,2012,5,3.347514,14.216819,12.926314,4.896995


**Export model**

In [58]:
import pickle
filename = 'prize_model.sav'
pickle.dump(rf, open(filename, 'wb'))

In [60]:
loaded_model = pickle.load(open("prize_model.sav", 'rb'))


In [158]:
year = "2020"
month = "01"
day = "01"
hour = "13"
minute = "12"
second = "11"

timestamp = year + "-" +  month + "-" + day + " " + hour + ":" + minute + ":" + second
timestamp

'2020-01-01 13:12:11'

In [159]:
d = {'pickup_datetime': [timestamp], 'passenger_count': [1],  'pickup_latitude' : [40.763805], 'dropoff_longitude' : [-73.981430], 'dropoff_latitude' : [40.743835],'pickup_longitude' : [-73.973320]}
manual_df = pd.DataFrame(data=d)

In [160]:
manual_df

Unnamed: 0,pickup_datetime,passenger_count,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_longitude
0,2020-01-01 13:12:11,1,40.763805,-73.98143,40.743835,-73.97332


In [161]:
prepare_time_features(manual_df)
transform(manual_df)
manual_df['distance_miles'] = distance(manual_df.pickup_latitude, manual_df.pickup_longitude, \
                                      manual_df.dropoff_latitude, manual_df.dropoff_longitude)
manual_df.drop(columns= ['pickup_datetime'], axis= 1, inplace=True)

In [162]:
manual_df

Unnamed: 0,passenger_count,pickup_latitude,dropoff_longitude,dropoff_latitude,pickup_longitude,hour_of_day,month,year,weekday,distance_to_jfk,distance_to_ewr,distance_to_lgr,distance_miles
0,1,40.763805,-73.98143,40.743835,-73.97332,13,1,2020,2,13.274308,11.718479,5.275253,1.44361


In [164]:
manual_predictions = loaded_model.predict(manual_df)
manual_predictions

array([10.05])

In [107]:
manual_df.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'hour_of_day', 'month', 'year',
       'weekday', 'distance_to_jfk', 'distance_to_ewr', 'distance_to_lgr'],
      dtype='object')

In [106]:
Y.columns

Index(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count', 'hour_of_day', 'month', 'year',
       'weekday', 'distance_miles', 'distance_to_jfk', 'distance_to_ewr',
       'distance_to_lgr'],
      dtype='object')

In [163]:
manual_df = manual_df[Y.columns]

In [136]:
new_test = Y.head(1)

In [137]:
new_test

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour_of_day,month,year,weekday,distance_miles,distance_to_jfk,distance_to_ewr,distance_to_lgr
0,-73.97332,40.763805,-73.98143,40.743835,1,13,1,2015,1,1.443607,13.274326,11.718491,5.275249


In [148]:
loaded_model.predict(new_test)

array([10.05])

In [140]:
manual_df

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,hour_of_day,month,year,weekday,distance_to_jfk,distance_to_ewr,distance_to_lgr,distance_miles
0,1,-73.97332,40.763805,-73.98143,40.743835,13,1,2015,3,13.274308,11.718479,5.275253,1.44361


In [165]:
def get_price_pipeline(timestamp, n_passengers, pickup_latitude, dropoff_longitude, dropoff_latitude, pickup_longitude):
    d = {'pickup_datetime': [timestamp], 'passenger_count': [n_passengers],  'pickup_latitude' : [pickup_latitude], 'dropoff_longitude' : [dropoff_longitude], 'dropoff_latitude' : [dropoff_latitude],'pickup_longitude' : [pickup_longitude]}
    manual_df = pd.DataFrame(data=d)
    col_order = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count', 'hour_of_day', 'month', 'year',
       'weekday', 'distance_miles', 'distance_to_jfk', 'distance_to_ewr',
       'distance_to_lgr']
    
    prepare_time_features(manual_df)
    transform(manual_df)
    manual_df['distance_miles'] = distance(manual_df.pickup_latitude, manual_df.pickup_longitude, \
                                      manual_df.dropoff_latitude, manual_df.dropoff_longitude)
    manual_df.drop(columns= ['pickup_datetime'], axis= 1, inplace=True)

    manual_df = manual_df[Y.columns]
    
    manual_predictions = loaded_model.predict(manual_df)
    
    return manual_predictions

In [166]:
d = {'pickup_datetime': [timestamp], 'passenger_count': [1],  'pickup_latitude' : [40.763805], 'dropoff_longitude' : [-73.981430], 'dropoff_latitude' : [40.743835],'pickup_longitude' : [-73.973320]}

In [171]:
get_price_pipeline(timestamp, 1, 40.763805, -73.981430, 40.743835, -73.97332)

array([10.05])

In [168]:
d.values()

dict_values([['2020-01-01 13:12:11'], [1], [40.763805], [-73.98143], [40.743835], [-73.97332]])