In [1]:
%matplotlib inline
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import mean_squared_error
from math import radians, cos, sin, asin, sqrt
import seaborn as sns
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [2]:
plt.rcParams['figure.figsize']  = [20,10]

In [3]:
train = pd.read_csv('input/new-york-city-taxi-with-osrm/train.csv')
test = pd.read_csv('input/new-york-city-taxi-with-osrm/test.csv')

In [4]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [5]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
train.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.49227
std,0.49878,1.31424,0.0709,0.03288,0.07064,0.03589,5237.43172
min,1.0,0.0,-121.93334,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null object
dropoff_datetime      1458644 non-null object
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [7]:
m = np.mean(train['trip_duration'])
s = np.std(train['trip_duration'])

# Restricting the training to within two standard deviations.
train_medium_time = train[train['trip_duration'] >= m - 2*s]
train_medium_time = train[train['trip_duration'] <= m + 2*s]
m

959.4922729603659

In [8]:
train_medium_time = train[train['pickup_longitude'] <= -73.75]
train_medium_time = train[train['pickup_longitude'] >= -74.03]
train_medium_time = train[train['pickup_latitude'] <= 40.85]
train_medium_time = train[train['pickup_latitude'] >= 40.63]
train_medium_time = train[train['dropoff_longitude'] <= -73.75]
train_medium_time = train[train['dropoff_longitude'] >= -74.03]
train_medium_time = train[train['dropoff_latitude'] <= 40.85]
train_medium_time = train[train['dropoff_latitude'] >= 40.63]

In [11]:
# Formatting the date variables.
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.date
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime) #Not in Test

In [12]:
train_medium_time.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1454311.0,1454311.0,1454311.0,1454311.0,1454311.0,1454311.0,1454311.0
mean,1.53496,1.66456,-73.97354,40.75109,-73.97332,40.75228,955.89284
std,0.49878,1.31438,0.0396,0.03002,0.03848,0.03216,5241.65782
min,1.0,0.0,-79.56973,39.80393,-80.35543,40.63,1.0
25%,1.0,1.0,-73.99186,40.73745,-73.99131,40.73618,396.0
50%,2.0,1.0,-73.98174,40.75417,-73.97975,40.75463,661.0
75%,2.0,2.0,-73.96737,40.76839,-73.96305,40.76988,1071.0
max,2.0,9.0,-66.97216,51.88108,-69.04802,43.92103,3526282.0
