New York Taxi Trip Duration Prediction

The data was originally published by the NYC Taxi and Limousine Commission (TLC).

File description

train.csv - the training set (contains 1458644 trip records)
test.csv - the testing set (contains 625134 trip records)


Data fields

id - a unique identifier for each trip
vendor_id - a code indicating the provider associated with the trip record
pickup_datetime - date and time when the meter was engaged
dropoff_datetime - date and time when the meter was disengaged
passenger_count - the number of passengers in the vehicle (driver entered value)
pickup_longitude - the longitude where the meter was engaged
pickup_latitude - the latitude where the meter was engaged
dropoff_longitude - the longitude where the meter was disengaged
dropoff_latitude - the latitude where the meter was disengaged
store_and_fwd_flag - This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward trip
trip_duration - duration of the trip in seconds

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
%matplotlib inline

In [None]:
# Read the training and test trip records from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first five rows of the test frame as loaded.
df_test.head(n=5)

Create a function that converts the latitude and longitude of the pickup and drop-off locations into a distance in miles.

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle (haversine) distance in miles between two points.

    All arithmetic is element-wise NumPy, so the arguments may be scalars,
    NumPy arrays or pandas Series of matching shape.

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : longitude and latitude of the second point, in decimal degrees.

    Returns
    -------
    Distance in miles on a sphere of radius 6367 km, with the same shape as
    the inputs.
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    # For (near-)antipodal points floating-point rounding can leave sqrt(a) a
    # hair above 1, which is outside arcsin's domain and would yield NaN.
    # Capping at 1 keeps those distances finite and leaves other inputs unchanged.
    c = 2 * np.arcsin(np.minimum(np.sqrt(a), 1.0))
    km = 6367 * c
    miles = km * 0.621371
    return miles

In [None]:
# Pickup-to-dropoff haversine distance (miles) for every training trip.
df_train['distance'] = haversine(
    lon1=df_train['pickup_longitude'],
    lat1=df_train['pickup_latitude'],
    lon2=df_train['dropoff_longitude'],
    lat2=df_train['dropoff_latitude'],
)

In [None]:
# Preview the training frame, now including the `distance` column.
df_train.head(n=5)

In [None]:
# Scatter of training pickup coordinates, cropped to a fixed
# longitude/latitude window. Everything goes through the Axes object so the
# figure does not depend on pyplot's implicit "current axes" state.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12, 10))
ax.scatter(df_train['pickup_longitude'], df_train['pickup_latitude'], s=0.01, alpha=1)
ax.set_xlim(-74.1, -73.7)
ax.set_ylim(40.6, 40.9)
ax.set_xlabel('pickup longitude (degrees)')
ax.set_ylabel('pickup latitude (degrees)')
# Trailing semicolon suppresses the text repr of the last call in the cell output.
ax.set_title('Training-set pickup locations');

In [None]:
# Preview the test frame before the distance column is added.
df_test.head(n=5)

In [None]:
# Pickup-to-dropoff haversine distance (miles) for every test trip.
df_test['distance'] = haversine(
    lon1=df_test['pickup_longitude'],
    lat1=df_test['pickup_latitude'],
    lon2=df_test['dropoff_longitude'],
    lat2=df_test['dropoff_latitude'],
)

In [None]:
# Indicator columns for the store-and-forward flag. drop_first=True omits the
# first category's column, so a two-valued flag yields a single column.
flag = pd.get_dummies(df_train.store_and_fwd_flag, drop_first=True)

In [None]:
# Append the indicator column(s) to the training frame, aligned on the row index.
df_train = pd.concat([df_train, flag], axis='columns')

In [None]:
# The raw flag column is no longer needed once its indicator has been appended.
df_train.drop(columns=['store_and_fwd_flag'], inplace=True)

In [None]:
# Same indicator encoding of the store-and-forward flag, for the test frame.
flagtest = pd.get_dummies(df_test.store_and_fwd_flag, drop_first=True)

In [None]:
# Append the indicator column(s) to the test frame, aligned on the row index.
df_test = pd.concat([df_test, flagtest], axis='columns')

In [None]:
# Remove the raw flag column from the test frame.
df_test.drop(columns=['store_and_fwd_flag'], inplace=True)

In [None]:
# Parse the pickup timestamps so the `.dt` accessor can be used on them below.
pickup_raw_train = df_train['pickup_datetime']
df_train['pickup_datetime'] = pd.to_datetime(pickup_raw_train)

In [None]:
# Parse the test pickup timestamps the same way as the training ones.
pickup_raw_test = df_test['pickup_datetime']
df_test['pickup_datetime'] = pd.to_datetime(pickup_raw_test)

In [None]:
# Calendar features taken from the pickup timestamp.
# Note that `day` is the day of the year, not the day of the month.
train_pickup = df_train['pickup_datetime'].dt
df_train['pickup_hour'] = train_pickup.hour
df_train['day'] = train_pickup.dayofyear
df_train['weekday'] = train_pickup.dayofweek
df_train['month'] = train_pickup.month

In [None]:
# Calendar date of the pickup as a midnight timestamp. `.dt.normalize()`
# stays vectorised, whereas `.dt.date` builds one Python `date` object per row
# that then has to be parsed back into datetimes.
df_train['date'] = df_train['pickup_datetime'].dt.normalize()

In [None]:
# Preview the training frame with the calendar features and `date` column.
df_train.head(n=5)

In [None]:
# Re-run the datetime conversion on `date`. The column was already assigned
# datetime values when it was created, so this does not change it further.
date_values = df_train['date']
df_train['date'] = pd.to_datetime(date_values, format="%Y-%m-%d")

In [None]:
# Show the Python type of the `date` value stored under index label 0.
type(df_train['date'].loc[0])

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression model with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# The test frame must carry the same pickup-time features as the training
# frame *before* the design matrices are built. Deriving them here (the
# assignments are idempotent) keeps this cell correct on a fresh
# top-to-bottom run instead of depending on a cell further down the notebook.
test_pickup = df_test['pickup_datetime'].dt
df_test['pickup_hour'] = test_pickup.hour
df_test['day'] = test_pickup.dayofyear
df_test['weekday'] = test_pickup.dayofweek
df_test['month'] = test_pickup.month

# Columns kept out of the model: the target, the raw and derived timestamps,
# and `id` -- a string identifier (other cells use `.str` on it), not a
# numeric feature LinearRegression can fit on.
non_feature_cols = ['id', 'trip_duration', 'dropoff_datetime', 'pickup_datetime', 'date']

X_train = df_train.drop(columns=non_feature_cols)
y_train = df_train['trip_duration']
# Select the test columns by the training column list so both matrices have
# the same features in the same order; a missing column raises KeyError here
# rather than producing misaligned predictions later.
X_test = df_test[X_train.columns]



In [None]:
# Fit the linear model on the training features and trip durations.
lm.fit(X=X_train, y=y_train)

In [None]:
# Keep each id from its third character onward.
# NOTE(review): presumably this removes a two-letter prefix -- confirm against the data.
# Not idempotent: re-running the cell removes two more characters.
df_train['id'] = df_train['id'].str.slice(start=2)

In [None]:
# Keep each test id from its third character onward (same treatment as the training ids).
# Not idempotent: re-running the cell removes two more characters.
df_test['id'] = df_test['id'].str.slice(start=2)

In [None]:
# Preview the training frame after the id column was shortened.
df_train.head(n=5)

In [None]:
# Calendar features for the test frame, matching the training frame's columns.
# Note that `day` is the day of the year, not the day of the month.
test_pickup_dt = df_test['pickup_datetime'].dt
df_test['pickup_hour'] = test_pickup_dt.hour
df_test['day'] = test_pickup_dt.dayofyear
df_test['weekday'] = test_pickup_dt.dayofweek
df_test['month'] = test_pickup_dt.month

In [None]:
# Preview the test frame with its calendar features.
df_test.head(n=5)

In [None]:
# Predict a trip duration for every row of the test feature matrix.
predict_linear = lm.predict(X=X_test)

In [None]:
# Display the array of predictions returned by lm.predict.
predict_linear

In [None]:
# `df` is never defined in this notebook, so the original `out=df` raised
# NameError and halted a top-to-bottom run.
# NOTE(review): presumably the test frame was meant -- confirm. No later cell
# shown here reads `out`.
out = df_test

In [None]:
# Preview the test frame once more before the submission is assembled.
df_test.head(n=5)

In [None]:
# Read the test file again into a separate frame.
# NOTE(review): presumably done to get the ids back unmodified, since `df_test['id']` was sliced earlier -- confirm.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Attach the predictions, cast to integers, to the freshly loaded test frame.
# This relies on `test` having the same row order as the frame the
# predictions were made from.
predicted_duration = predict_linear.astype(int)
test['trip_duration'] = predicted_duration

# Keep only the identifier and the predicted duration for the submission.
output = test.loc[:, ['id', 'trip_duration']]

In [None]:
# True if any cell of the submission frame is missing.
output.isna().to_numpy().any()

In [None]:
# Write the submission without the row index.
# NOTE(review): the target file name carries no `.csv` extension -- confirm that is intended.
output.to_csv(path_or_buf='prediction_linearRegression', index=False)