In [1]:
import time
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as datetime
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [2]:
import geopy.distance
import multiprocessing
from multiprocessing import Pool

In [3]:
from tqdm import tqdm
tqdm.pandas()

## Helper functions

In [4]:
def cal_km(geo_points):
    """
    Calculate the geodesic distance, in kilometers, between two geo-points.

    Args:
      geo_points (tuple): Geo co-ordinates in the following format
                          (point1_lat, point1_long, point2_lat, point2_long)

    Returns:
      Distance in kilometers, or numpy.nan when the distance computation
      raises for the given co-ordinates.
    """

    coord_1 = (geo_points[0], geo_points[1])
    coord_2 = (geo_points[2], geo_points[3])

    try:
        return geopy.distance.geodesic(coord_1, coord_2).km
    except Exception:
        # Deliberate best-effort: one bad record must not abort the whole
        # batch, so it is reported as a missing value instead.
        # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
        return np.nan

In [5]:
def cal_distance_feature(df, n_processes=6):
    """
    Calculate the distance between the pickup and dropoff points of each trip record.
    (Uses multiprocessing to speed up the computation.)

    Args:
      df (dataframe): data for which the distance needs to be calculated. It must
                      expose pickup_latitude, pickup_longitude, dropoff_latitude
                      and dropoff_longitude columns.
      n_processes (int): number of worker processes handed to multiprocessing.Pool.
                         Defaults to 6, the value that used to be hard-coded.

    Returns:
      The same dataframe (modified in place) with a new 'distance' column
      holding the cal_km result for each record.
    """

    # One (lat, long, lat, long) tuple per trip, in the order cal_km expects
    geo_data = list(zip(df.pickup_latitude,
                        df.pickup_longitude,
                        df.dropoff_latitude,
                        df.dropoff_longitude))

    start_time = time.time()
    with Pool(n_processes) as p:
        distance = p.map(cal_km, geo_data)

    print("Distance feature generation time : {:.2f} seconds".format(time.time() - start_time))

    df['distance'] = distance

    return df

In [6]:
def train_location_clusters(data, n, random_state=None):
    """
    Train a K-means model on the provided location co-ordinates.

    Args:
      data (dataframe): dataframe with location co-ordinates
      n (int): Number of clusters to form
      random_state (int, optional): seed forwarded to KMeans so that the
                                    clustering can be reproduced. Defaults to
                                    None (unseeded).

    Return:
      Trained kmeans model
    """

    # `n` is honoured here (it used to be ignored in favour of a literal 6).
    # `n_jobs` is intentionally not passed: scikit-learn deprecated it for
    # KMeans in 0.23 and later removed it, so passing it raises TypeError
    # on current versions.
    kmeans_model = KMeans(n_clusters=n, init='k-means++', random_state=random_state)
    kmeans_model.fit(data)

    return kmeans_model


In [7]:
def get_location_cluster(model, data):
    """
    Assign a cluster number to every sample with an already trained model.

    Args:
      model (obj): trained Kmeans model (anything exposing `predict`)
      data (dataframe): location data to label

    Returns:
      Whatever `model.predict(data)` returns: one cluster prediction per sample
    """

    return model.predict(data)

In [8]:
def generate_date_features(df):
    """
    Derive calendar/time features from the pickup timestamp.

    The 'pickup_datetime' column is parsed in place (unparseable values are
    coerced to NaT) and four columns are added: pickup_datetime_month,
    pickup_datetime_dayofweek, pickup_datetime_hour and pickup_datetime_minute.

    Args:
      df (dataframe) : records holding a 'pickup_datetime' column

    Returns:
      The same dataframe with the extra feature columns added to it.
    """

    df['pickup_datetime'] = pd.to_datetime(
        df['pickup_datetime'],
        format = '%Y-%m-%d %H:%M:%S',
        errors = 'coerce',
    )

    # Each datetime component becomes its own 'pickup_datetime_<component>' column
    pickup_dt = df['pickup_datetime'].dt
    for component in ('month', 'dayofweek', 'hour', 'minute'):
        df['pickup_datetime_' + component] = getattr(pickup_dt, component)

    return df

### Load training data

In [9]:
# `date_parser=True` was dropped: the argument expects a callable, has no effect
# unless `parse_dates` is also given, and is deprecated in pandas 2.x.
# 'pickup_datetime' is parsed later by generate_date_features.
train_data = pd.read_csv("/home/nihaln.karne/Personal_practice/Kaggle_nyc_taxi/nyc-taxi-trip-duration/train.csv")
print("Train Shape: " , train_data.shape)
train_data.head()

Train Shape:  (1458644, 11)


Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


### Feature generation

In [10]:
# Pickup -> dropoff distance for every training trip
train_data = cal_distance_feature(train_data)
print("Distance feature calculated")

# Fit a 6-cluster model on the pickup co-ordinates and label each trip with it
data = train_data.loc[:, ['pickup_longitude', 'pickup_latitude']]
pickup_kmeans_model = train_location_clusters(data, 6)
pred_cluster = get_location_cluster(pickup_kmeans_model, data)
train_data['pickup_cluster'] = pred_cluster
print("pickup_cluster feature calculated")

# Same again for the dropoff co-ordinates, with a separate model
data = train_data.loc[:, ['dropoff_longitude', 'dropoff_latitude']]
dropoff_kmeans_model = train_location_clusters(data, 6)
pred_cluster = get_location_cluster(dropoff_kmeans_model, data)
train_data['dropoff_cluster'] = pred_cluster
print("dropoff_cluster feature calculated")

# Month / day-of-week / hour / minute of the pickup timestamp
train_data = generate_date_features(train_data)
print("Pickup date-time feature calculated")

Distance feature generation time : 73.61 seconds
Distance feature calculated
pickup_cluster feature calculated
dropoff_cluster feature calculated
Pickup date-time feature calculated


In [11]:
# Preview the first rows, now including the distance, cluster and pickup date-time columns
train_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance,pickup_cluster,dropoff_cluster,pickup_datetime_month,pickup_datetime_dayofweek,pickup_datetime_hour,pickup_datetime_minute
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455,1.502172,0,1,3,0,17,24
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663,1.80866,3,0,6,6,0,43
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,6.379687,0,0,1,1,11,35
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429,1.483632,3,0,4,2,19,32
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435,1.187038,0,1,3,5,13,30


### Preparing data for model training

In [12]:
# Columns carried into modelling: the trip id, the raw and engineered
# features, and the target ('trip_duration')
selected_columns = [
    'id', 'vendor_id', "passenger_count", 'store_and_fwd_flag',
    "distance", 'pickup_cluster', 'dropoff_cluster',
    "pickup_datetime_month", 'pickup_datetime_dayofweek',
    "pickup_datetime_hour", 'pickup_datetime_minute', "trip_duration",
]

# Categorical feature names
cat_columns = ['vendor_id', 'store_and_fwd_flag', 'pickup_cluster', 'dropoff_cluster']

# One-hot encode the categorical features (one indicator column per level)
train_features = pd.get_dummies(train_data[selected_columns], columns=cat_columns)

print(train_features.columns)
train_features.head()

Index(['id', 'passenger_count', 'distance', 'pickup_datetime_month',
       'pickup_datetime_dayofweek', 'pickup_datetime_hour',
       'pickup_datetime_minute', 'trip_duration', 'vendor_id_1', 'vendor_id_2',
       'store_and_fwd_flag_N', 'store_and_fwd_flag_Y', 'pickup_cluster_0',
       'pickup_cluster_1', 'pickup_cluster_2', 'pickup_cluster_3',
       'pickup_cluster_4', 'pickup_cluster_5', 'dropoff_cluster_0',
       'dropoff_cluster_1', 'dropoff_cluster_2', 'dropoff_cluster_3',
       'dropoff_cluster_4', 'dropoff_cluster_5'],
      dtype='object')


Unnamed: 0,id,passenger_count,distance,pickup_datetime_month,pickup_datetime_dayofweek,pickup_datetime_hour,pickup_datetime_minute,trip_duration,vendor_id_1,vendor_id_2,...,pickup_cluster_2,pickup_cluster_3,pickup_cluster_4,pickup_cluster_5,dropoff_cluster_0,dropoff_cluster_1,dropoff_cluster_2,dropoff_cluster_3,dropoff_cluster_4,dropoff_cluster_5
0,id2875421,1,1.502172,3,0,17,24,455,0,1,...,0,0,0,0,0,1,0,0,0,0
1,id2377394,1,1.80866,6,6,0,43,663,1,0,...,0,1,0,0,1,0,0,0,0,0
2,id3858529,1,6.379687,1,1,11,35,2124,0,1,...,0,0,0,0,1,0,0,0,0,0
3,id3504673,1,1.483632,4,2,19,32,429,0,1,...,0,1,0,0,1,0,0,0,0,0
4,id2181028,1,1.187038,3,5,13,30,435,0,1,...,0,0,0,0,0,1,0,0,0,0


### Model training pipeline

In [13]:
# Independent variables: every column except the trip id and the target
non_feature_mask = train_features.columns.isin(['id', 'trip_duration'])
X = train_features.loc[:, ~non_feature_mask]

# Dependent variable, kept as a single-column dataframe
y = train_features.loc[:, ['trip_duration']]

In [14]:
# Preview the model inputs (every column except 'id' and 'trip_duration')
X.head()

Unnamed: 0,passenger_count,distance,pickup_datetime_month,pickup_datetime_dayofweek,pickup_datetime_hour,pickup_datetime_minute,vendor_id_1,vendor_id_2,store_and_fwd_flag_N,store_and_fwd_flag_Y,...,pickup_cluster_2,pickup_cluster_3,pickup_cluster_4,pickup_cluster_5,dropoff_cluster_0,dropoff_cluster_1,dropoff_cluster_2,dropoff_cluster_3,dropoff_cluster_4,dropoff_cluster_5
0,1,1.502172,3,0,17,24,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1.80866,6,6,0,43,1,0,1,0,...,0,1,0,0,1,0,0,0,0,0
2,1,6.379687,1,1,11,35,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
3,1,1.483632,4,2,19,32,0,1,1,0,...,0,1,0,0,1,0,0,0,0,0
4,1,1.187038,3,5,13,30,0,1,1,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Fit a linear regression model on the training split
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [17]:
# Render the fitted model as a readable equation: intercept first, then one
# "+ feature*(coefficient)" line per input column
terms = [
    "\t\t + {}*({:.2f}) \n".format(name, weight)
    for name, weight in zip(X.columns, reg_model.coef_[0])
]
eq = "trip_duration = {:.2f}\n".format(reg_model.intercept_[0]) + "".join(terms)

print(eq)

trip_duration = -5466.26
		 + passenger_count*(8.47) 
		 + distance*(123.51) 
		 + pickup_datetime_month*(16.13) 
		 + pickup_datetime_dayofweek*(-3.79) 
		 + pickup_datetime_hour*(3.49) 
		 + pickup_datetime_minute*(-0.46) 
		 + vendor_id_1*(-101.36) 
		 + vendor_id_2*(101.36) 
		 + store_and_fwd_flag_N*(-20.34) 
		 + store_and_fwd_flag_Y*(20.34) 
		 + pickup_cluster_0*(-3082.05) 
		 + pickup_cluster_1*(3059.35) 
		 + pickup_cluster_2*(-3094.81) 
		 + pickup_cluster_3*(-3060.30) 
		 + pickup_cluster_4*(-2934.55) 
		 + pickup_cluster_5*(9112.36) 
		 + dropoff_cluster_0*(9014.96) 
		 + dropoff_cluster_1*(8962.57) 
		 + dropoff_cluster_2*(3059.35) 
		 + dropoff_cluster_3*(8955.89) 
		 + dropoff_cluster_4*(-15714.50) 
		 + dropoff_cluster_5*(-14278.28) 



In [18]:
# Validation predictions: predict() returns one single-element row per sample
# (y was fitted as a one-column frame), so flatten to a 1-D array
y_pred = np.array([row[0] for row in reg_model.predict(X_test)])

# A duration cannot be negative, so floor negative predictions at 0
y_pred[y_pred < 0] = 0

In [19]:
# Root mean squared logarithmic error on the validation split.
# (MAE / MSE / RMSE are available from sklearn.metrics in the same way if needed.)
validation_rmsle = np.sqrt(metrics.mean_squared_log_error(y_test, y_pred))
print('Validation RMSLE :', validation_rmsle)

Validation RMSLE : 0.6710086697973838


### Load test data

In [20]:
# `date_parser=True` was dropped: the argument expects a callable, has no effect
# unless `parse_dates` is also given, and is deprecated in pandas 2.x.
# 'pickup_datetime' is parsed later by generate_date_features.
test_data = pd.read_csv("/home/nihaln.karne/Personal_practice/Kaggle_nyc_taxi/nyc-taxi-trip-duration/test.csv")
print("Test Shape: " , test_data.shape)
test_data.head()

Test Shape:  (625134, 9)


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


### Test data feature generation

In [21]:
# Pickup -> dropoff distance for every test trip
test_data = cal_distance_feature(test_data)
print("Distance feature calculated")

# Label pickups with the K-means model fitted on the training pickups
data = test_data.loc[:, ['pickup_longitude', 'pickup_latitude']]
pred_cluster = get_location_cluster(pickup_kmeans_model, data)
test_data['pickup_cluster'] = pred_cluster
print("pickup_cluster feature calculated")

# Label dropoffs with the K-means model fitted on the training dropoffs
data = test_data.loc[:, ['dropoff_longitude', 'dropoff_latitude']]
pred_cluster = get_location_cluster(dropoff_kmeans_model, data)
test_data['dropoff_cluster'] = pred_cluster
print("dropoff_cluster feature calculated")

# Month / day-of-week / hour / minute of the pickup timestamp
test_data = generate_date_features(test_data)
print("Pickup date-time feature calculated")

Distance feature generation time : 29.90 seconds
Distance feature calculated
pickup_cluster feature calculated
dropoff_cluster feature calculated
Pickup date-time feature calculated


In [22]:
# Preview the first rows, now including the distance, cluster and pickup date-time columns
test_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,distance,pickup_cluster,dropoff_cluster,pickup_datetime_month,pickup_datetime_dayofweek,pickup_datetime_hour,pickup_datetime_minute
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N,2.742863,3,0,6,3,23,59
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N,2.755774,3,0,6,3,23,59
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N,1.307112,3,0,6,3,23,59
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N,5.266978,0,0,6,3,23,59
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N,0.961745,0,1,6,3,23,59


### Data transformation for prediction

In [23]:
test_features = test_data[['id', 'vendor_id', "passenger_count", 'store_and_fwd_flag',
                             "distance", 'pickup_cluster', 'dropoff_cluster',
                             "pickup_datetime_month", 'pickup_datetime_dayofweek',
                             "pickup_datetime_hour", 'pickup_datetime_minute']]

# Categorical feature names
cat_columns = ['vendor_id', 'store_and_fwd_flag', 'pickup_cluster', 'dropoff_cluster']

# encoding the categorical features
test_features = pd.get_dummies(test_features, columns=cat_columns)

# The test set is encoded independently of the training set, so a category
# level missing from (or only present in) the test data would change the
# column set. Align to the training design matrix `X`: same columns, same
# order, absent indicator columns filled with 0, unseen levels dropped.
test_features = test_features.reindex(columns=['id'] + list(X.columns), fill_value=0)

print(test_features.columns)
test_features.head()

Index(['id', 'passenger_count', 'distance', 'pickup_datetime_month',
       'pickup_datetime_dayofweek', 'pickup_datetime_hour',
       'pickup_datetime_minute', 'vendor_id_1', 'vendor_id_2',
       'store_and_fwd_flag_N', 'store_and_fwd_flag_Y', 'pickup_cluster_0',
       'pickup_cluster_1', 'pickup_cluster_2', 'pickup_cluster_3',
       'pickup_cluster_4', 'pickup_cluster_5', 'dropoff_cluster_0',
       'dropoff_cluster_1', 'dropoff_cluster_2', 'dropoff_cluster_3',
       'dropoff_cluster_4', 'dropoff_cluster_5'],
      dtype='object')


Unnamed: 0,id,passenger_count,distance,pickup_datetime_month,pickup_datetime_dayofweek,pickup_datetime_hour,pickup_datetime_minute,vendor_id_1,vendor_id_2,store_and_fwd_flag_N,...,pickup_cluster_2,pickup_cluster_3,pickup_cluster_4,pickup_cluster_5,dropoff_cluster_0,dropoff_cluster_1,dropoff_cluster_2,dropoff_cluster_3,dropoff_cluster_4,dropoff_cluster_5
0,id3004672,1,2.742863,6,3,23,59,1,0,1,...,0,1,0,0,1,0,0,0,0,0
1,id3505355,1,2.755774,6,3,23,59,1,0,1,...,0,1,0,0,1,0,0,0,0,0
2,id1217141,1,1.307112,6,3,23,59,1,0,1,...,0,1,0,0,1,0,0,0,0,0
3,id2150126,1,5.266978,6,3,23,59,0,1,1,...,0,0,0,0,1,0,0,0,0,0
4,id1598245,1,0.961745,6,3,23,59,1,0,1,...,0,0,0,0,0,1,0,0,0,0


### Generate predictions for test data

In [24]:
# Model inputs for the test set: everything except the trip id
id_mask = test_features.columns.isin(['id'])
test_records = test_features.loc[:, ~id_mask]

# predict() returns one single-element row per sample; flatten to a 1-D array
predictions = np.array([row[0] for row in reg_model.predict(test_records)])

# A duration cannot be negative, so floor negative predictions at 0
predictions[predictions < 0] = 0

In [25]:
# Submission format: one row per test trip with its id and predicted duration
submission = pd.DataFrame(
    {
        "id": test_features["id"].values,
        "trip_duration": predictions,
    }
)

print("Predictions shape:", submission.shape)
submission.head()

Predictions shape: (625134, 2)


Unnamed: 0,id,trip_duration
0,id3004672,852.32898
1,id3505355,853.923602
2,id1217141,675.000023
3,id2150126,1345.045945
4,id1598245,558.211105
