In [None]:
'''
Project site: https://www.kaggle.com/competitions/nyc-taxi-trip-duration/


In this competition, the challenge is to build a model that predicts the total ride duration of taxi trips in New York City.
Dataset is originally released by the NYC Taxi and Limousine Commission, which includes pickup time, geo-coordinates, number of passengers, and several other variables as follows:


id - a unique identifier for each trip
vendor_id - a code indicating the provider associated with the trip record
pickup_datetime - date and time when the meter was engaged
dropoff_datetime - date and time when the meter was disengaged
passenger_count - the number of passengers in the vehicle (driver entered value)
pickup_longitude - the longitude where the meter was engaged
pickup_latitude - the latitude where the meter was engaged
dropoff_longitude - the longitude where the meter was disengaged
dropoff_latitude - the latitude where the meter was disengaged
store_and_fwd_flag - This flag indicates whether the trip record was held in vehicle memory before sending to the vendor because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward trip
trip_duration - duration of the trip in seconds
'''

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df = pd.read_csv("train.csv")

In [3]:
df = df.sample(20000)

In [4]:
#df = df.sample(30000)

In [5]:
df.T

Unnamed: 0,709309,190396,468662,1387639,881148,1397138,1317594,766471,526859,116719,...,1358688,1203866,76942,828395,689606,816904,1339346,912051,900184,622334
id,id2272843,id3524337,id3222770,id2293972,id2383362,id1424668,id2703391,id2701989,id2563606,id3131624,...,id0931794,id1394129,id2114576,id2511893,id2360484,id2047416,id1779024,id3219651,id2175706,id1813986
vendor_id,2,1,1,2,2,2,1,2,2,2,...,2,2,2,2,2,2,2,1,1,2
pickup_datetime,2016-01-21 15:57:56,2016-02-23 21:57:17,2016-03-19 11:49:58,2016-02-07 23:40:17,2016-02-25 14:26:10,2016-04-25 21:34:52,2016-02-16 22:58:26,2016-01-01 13:53:20,2016-06-17 14:40:22,2016-02-10 09:28:21,...,2016-06-01 21:32:04,2016-01-07 07:36:53,2016-06-15 14:52:04,2016-02-09 20:21:16,2016-02-03 08:46:54,2016-03-24 15:59:43,2016-04-21 23:10:59,2016-03-18 11:10:15,2016-06-07 20:43:32,2016-01-13 17:28:08
dropoff_datetime,2016-01-21 16:02:20,2016-02-23 22:16:46,2016-03-19 12:38:30,2016-02-07 23:50:12,2016-02-25 14:33:54,2016-04-25 21:44:26,2016-02-16 23:30:22,2016-01-01 13:57:58,2016-06-17 15:03:41,2016-02-10 09:44:23,...,2016-06-01 21:38:27,2016-01-07 07:53:16,2016-06-15 15:06:30,2016-02-09 20:40:43,2016-02-03 08:48:58,2016-03-24 16:08:14,2016-04-21 23:28:42,2016-03-18 11:18:19,2016-06-07 20:56:55,2016-01-13 17:31:54
passenger_count,1,1,1,1,1,1,1,2,5,1,...,1,1,3,1,1,1,2,1,1,1
pickup_longitude,-73.985352,-73.961601,-73.992287,-73.973633,-73.985771,-73.983452,-74.000824,-73.952461,-73.989799,-73.988907,...,-73.988419,-73.963303,-73.975655,-73.989525,-73.95446,-73.974274,-74.00267,-73.98774,-73.984734,-73.947678
pickup_latitude,40.744549,40.771378,40.749035,40.788841,40.741272,40.762001,40.742188,40.823551,40.743351,40.736698,...,40.723358,40.771492,40.792019,40.730038,40.731049,40.762676,40.733315,40.738369,40.754181,40.783417
dropoff_longitude,-73.978333,-73.98362,-73.870209,-73.96225,-73.992607,-73.980179,-73.905663,-73.94207,-74.004532,-74.01136,...,-73.994942,-73.963364,-73.956795,-73.954559,-73.955315,-73.959938,-73.986549,-73.985931,-74.010849,-73.953682
dropoff_latitude,40.75383,40.765953,40.683563,40.76326,40.737637,40.746071,40.737911,40.837688,40.75679,40.702068,...,40.73423,40.771973,40.783566,40.778004,40.738155,40.758488,40.670063,40.743565,40.716129,40.777908
store_and_fwd_flag,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N


In [6]:
df['trip_duration'] = df['trip_duration'].astype(int)

In [7]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    # I obtained the haversine_distance function from chatGPT
    # Radius of the Earth in kilometers
    R = 6371.0
    
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    
    # Differences between latitudes and longitudes
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    # Haversine formula
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    # Calculate distance in kilometers
    distance = R * c
    return distance

df['distance'] = df.apply(lambda row: haversine_distance(row['pickup_latitude'], row['pickup_longitude'],  \
                                                      row['dropoff_latitude'], row['dropoff_longitude'] \
                                                      ), axis=1)


In [8]:
# is the taxi coming to or leaving from downtown manhattan
# from domain knowledge we know that  
# there have usualy traffic in downtown manhattan

#lattitute 0 
# zeropoint:

zrp_lat  = 40.701159
zrp_long =-79.014991
zrpl_lat = 40.718303
zrpl_long = -74.016846
zrpr_lat = 40.711339
zrpr_long = -73.977179


# somepoint from midwest and miteast
mwup_lat= 40.762459
mwup_long=-74.004136

meup_lat=40.749023
meup_long= -73.9586601


#triangle 2

zrpl_lat = 40.718303
zrpl_long = -74.016846
zrpr_lat = 40.711339
zrpr_long = -73.977179
mwup_lat= 40.762459
mwup_long=-74.004136

#triangle 3 

mwup_lat= 40.762459
mwup_long=-74.004136
meup_lat=40.749023
meup_long= -73.9586601
zrpr_lat = 40.711339
zrpr_long = -73.977179


def triangle_area(lat1, lon1, lat2, lon2, lat3, lon3):
    # Calculate the distances between the three points
    side1 = haversine_distance(lat1, lon1, lat2, lon2)
    side2 = haversine_distance(lat2, lon2, lat3, lon3)
    side3 = haversine_distance(lat3, lon3, lat1, lon1)

    # Calculate the semi-perimeter
    s = (side1 + side2 + side3) / 2

    # Calculate the area using Heron's formula
    area = math.sqrt(s * (s - side1) * (s - side2) * (s - side3))

    return area

def check_if_in(p_lat, p_long, zrp_lat, zrp_long, zrpl_lat, zrpl_long, zrpr_lat, zrpr_long) :

    S_tot =  triangle_area(zrp_lat, zrp_long, \
                           zrpl_lat, zrpl_long,\
                           zrpr_lat, zrpr_long)
    
    S1 =  triangle_area(p_lat , p_long, \
                        zrpl_lat, zrpl_long,\
                        zrpr_lat, zrpr_long) 
    
    S2 =  triangle_area(zrp_lat, zrp_long, \
                        p_lat, p_long, \
                        zrpr_lat, zrpr_long) 
    
    S3 =  triangle_area(zrp_lat, zrp_long, \
                            zrpl_lat, zrpl_long, \
                            p_lat, p_long)
    
    Sin_tot = S1+S2+S3
    
    if (Sin_tot - S_tot) < 0.1 :
        return True
    else :
        return False
    

# divide the locations points into 3 Trianges and check if the traxies leaves from or come to 
#within trianges
df['from_Manhatan_DT'] = df.apply(lambda row: check_if_in(row['pickup_latitude'], row['pickup_longitude'], \
                       zrp_lat, zrp_long, zrpl_lat, zrpl_long, zrpr_lat, zrpr_long)  , axis=1) 

df['to_Manhatan_DT'] = df.apply(lambda row: check_if_in(row['dropoff_latitude'], row['dropoff_longitude'], \
                       zrp_lat, zrp_long, zrpl_lat, zrpl_long, zrpr_lat, zrpr_long)  , axis=1) 

df['from_Manhatan_2DT'] = df.apply(lambda row: check_if_in(row['pickup_latitude'], row['pickup_longitude'], \
                               zrpl_lat,zrpl_long,zrpr_lat,zrpr_long,mwup_lat,mwup_long), axis=1)  

df['to_Manhatan_2DT'] = df.apply(lambda row: check_if_in(row['dropoff_latitude'], row['dropoff_longitude'], \
                               zrpl_lat,zrpl_long,zrpr_lat,zrpr_long,mwup_lat,mwup_long), axis=1)  


df['from_Manhatan_3DT'] = df.apply(lambda row: check_if_in(row['pickup_latitude'], row['pickup_longitude'], \
                               mwup_lat, mwup_long, meup_lat, meup_long, zrpr_lat, zrpr_long), axis=1)  

df['to_Manhatan_3DT'] = df.apply(lambda row: check_if_in(row['dropoff_latitude'], row['dropoff_latitude'], \
                               mwup_lat, mwup_long, meup_lat, meup_long, zrpr_lat, zrpr_long), axis=1)  


df['in_Manhattan_DT'] = df['from_Manhatan_DT'] | df['to_Manhatan_DT'] | df['from_Manhatan_2DT'] | df['to_Manhatan_2DT'] | df['from_Manhatan_3DT'] | df['to_Manhatan_3DT'] 

DT  = {True:1, False:0}
df['in_Manhattan_DT'] = df['in_Manhattan_DT'].map(DT)

In [9]:
df['in_Manhattan_DT'].unique()

array([1, 0], dtype=int64)

In [10]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [11]:
df['trip_time'] = df['dropoff_datetime'] - df['pickup_datetime']
df['duration_minutes'] = df['trip_time'].dt.total_seconds() /60

In [12]:
# Check if each date is a weekday and create a new column 'is_weekday'
df['is_weekday'] = df['pickup_datetime'].dt.weekday < 5

In [13]:
# Define the rush hour time ranges
rush_hour_ranges = [(8, 9), (15, 19)]  # 8-9 a.m. and 3-7 p.m.
# Check if each datetime falls within the rush hour periods and create a new column 'is_rush_hour'
df['is_rush_hour_s'] = df['pickup_datetime'].apply(lambda x: any((start <= x.hour < end) for start, end in rush_hour_ranges))
df['is_rush_hour_e'] = df['dropoff_datetime'].apply(lambda x: any((start <= x.hour < end) for start, end in rush_hour_ranges))

df['is_rush_hour'] = df['is_rush_hour_s'] | df['is_rush_hour_e']

RH  = {True:1, False:0}
df['is_rush_hour'] = df['is_rush_hour'].map(RH)

In [14]:
WD  = {True:1, False:0}
df['is_weekday'] = df['is_weekday'].map(WD)


In [15]:
# Create the 'is_high_passenger_count' column
df['is_high_passenger_count'] = np.where(df['passenger_count'] > 3, 1, 0)

In [16]:
df['is_weekday']

709309     1
190396     1
468662     0
1387639    0
881148     1
          ..
816904     1
1339346    1
912051     1
900184     1
622334     1
Name: is_weekday, Length: 20000, dtype: int64

In [17]:
# Create the 'is_high_passenger_count' column
df['is_high_passenger_count'] = np.where(df['passenger_count'] > 1, 1, 0)
df['is_high_passenger_count']

709309     0
190396     0
468662     0
1387639    0
881148     0
          ..
816904     0
1339346    1
912051     0
900184     0
622334     0
Name: is_high_passenger_count, Length: 20000, dtype: int32

In [18]:
df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'distance', 'from_Manhatan_DT', 'to_Manhatan_DT',
       'from_Manhatan_2DT', 'to_Manhatan_2DT', 'from_Manhatan_3DT',
       'to_Manhatan_3DT', 'in_Manhattan_DT', 'trip_time', 'duration_minutes',
       'is_weekday', 'is_rush_hour_s', 'is_rush_hour_e', 'is_rush_hour',
       'is_high_passenger_count'],
      dtype='object')

In [19]:
#trip duration less than 20 secs are removed

df = df[df['trip_duration']<22000]
df = df[df['trip_duration']>20]

In [20]:
df['distance'].min()
df['distance_m'] = df['distance']*1000;

In [21]:
df = df[df['distance_m']>20]

In [22]:
#x = df[ ['is_weekday','vendor_id','passenger_count',  'store_and_fwd_flag', 'distance'] ] 
#x = df[ ['is_rush_hour','is_weekday','vendor_id','passenger_count', 'distance'] ] 

x = df[ ['is_rush_hour','is_weekday', 'distance', 'is_high_passenger_count', 'in_Manhattan_DT'] ] 


#x = df[ ['is_rush_hour','is_weekday', 'distance', 'is_high_passenger_count'] ] 
#y = df['trip_duration']
y = df['duration_minutes']

In [23]:
df.T

Unnamed: 0,709309,190396,468662,1387639,881148,1397138,1317594,766471,526859,116719,...,1358688,1203866,76942,828395,689606,816904,1339346,912051,900184,622334
id,id2272843,id3524337,id3222770,id2293972,id2383362,id1424668,id2703391,id2701989,id2563606,id3131624,...,id0931794,id1394129,id2114576,id2511893,id2360484,id2047416,id1779024,id3219651,id2175706,id1813986
vendor_id,2,1,1,2,2,2,1,2,2,2,...,2,2,2,2,2,2,2,1,1,2
pickup_datetime,2016-01-21 15:57:56,2016-02-23 21:57:17,2016-03-19 11:49:58,2016-02-07 23:40:17,2016-02-25 14:26:10,2016-04-25 21:34:52,2016-02-16 22:58:26,2016-01-01 13:53:20,2016-06-17 14:40:22,2016-02-10 09:28:21,...,2016-06-01 21:32:04,2016-01-07 07:36:53,2016-06-15 14:52:04,2016-02-09 20:21:16,2016-02-03 08:46:54,2016-03-24 15:59:43,2016-04-21 23:10:59,2016-03-18 11:10:15,2016-06-07 20:43:32,2016-01-13 17:28:08
dropoff_datetime,2016-01-21 16:02:20,2016-02-23 22:16:46,2016-03-19 12:38:30,2016-02-07 23:50:12,2016-02-25 14:33:54,2016-04-25 21:44:26,2016-02-16 23:30:22,2016-01-01 13:57:58,2016-06-17 15:03:41,2016-02-10 09:44:23,...,2016-06-01 21:38:27,2016-01-07 07:53:16,2016-06-15 15:06:30,2016-02-09 20:40:43,2016-02-03 08:48:58,2016-03-24 16:08:14,2016-04-21 23:28:42,2016-03-18 11:18:19,2016-06-07 20:56:55,2016-01-13 17:31:54
passenger_count,1,1,1,1,1,1,1,2,5,1,...,1,1,3,1,1,1,2,1,1,1
pickup_longitude,-73.985352,-73.961601,-73.992287,-73.973633,-73.985771,-73.983452,-74.000824,-73.952461,-73.989799,-73.988907,...,-73.988419,-73.963303,-73.975655,-73.989525,-73.95446,-73.974274,-74.00267,-73.98774,-73.984734,-73.947678
pickup_latitude,40.744549,40.771378,40.749035,40.788841,40.741272,40.762001,40.742188,40.823551,40.743351,40.736698,...,40.723358,40.771492,40.792019,40.730038,40.731049,40.762676,40.733315,40.738369,40.754181,40.783417
dropoff_longitude,-73.978333,-73.98362,-73.870209,-73.96225,-73.992607,-73.980179,-73.905663,-73.94207,-74.004532,-74.01136,...,-73.994942,-73.963364,-73.956795,-73.954559,-73.955315,-73.959938,-73.986549,-73.985931,-74.010849,-73.953682
dropoff_latitude,40.75383,40.765953,40.683563,40.76326,40.737637,40.746071,40.737911,40.837688,40.75679,40.702068,...,40.73423,40.771973,40.783566,40.778004,40.738155,40.758488,40.670063,40.743565,40.716129,40.777908
store_and_fwd_flag,N,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N


In [24]:
x = pd.get_dummies(x,drop_first=True)

In [25]:
x

Unnamed: 0,is_rush_hour,is_weekday,distance,is_high_passenger_count,in_Manhattan_DT
709309,1,1,1.189397,0,1
190396,0,1,1.949893,0,0
468662,0,0,12.603893,0,1
1387639,0,0,3.001667,0,0
881148,0,1,0.703639,0,1
...,...,...,...,...,...
816904,1,1,1.294118,0,0
1339346,0,1,7.163333,1,1
912051,0,1,0.597473,0,1
900184,0,1,4.769097,0,1


In [26]:
def fnc_all_regression_models(x,y):
    
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import Ridge,Lasso
    from sklearn.linear_model import ElasticNet
    from sklearn.tree import ExtraTreeRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.neighbors import KNeighborsRegressor
    
    from sklearn.model_selection import train_test_split

    from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn import svm
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from lightgbm import LGBMRegressor
    
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

    L=LinearRegression()
    R=Ridge()
    Lass=Lasso()
    E=ElasticNet()
    ExTree=ExtraTreeRegressor()
    GBR=GradientBoostingRegressor()
    KN=KNeighborsRegressor()
    PLR = Pipeline([('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression(fit_intercept=True))])
    SVM = svm.SVR()
    DTR = DecisionTreeRegressor(max_depth=12)
    RFR=RandomForestRegressor(n_estimators=100, max_features=100, max_leaf_nodes=100,random_state=42)
    LGBM=LGBMRegressor()

    algos=[L,R,Lass,E,ExTree,GBR,KN,PLR, SVM, DTR,RFR, LGBM]
    algo_names=['LinearRegression','Ridge','Lasso','ElasticNet','ExtraTreeRegressor',
                'GradientBoostingRegressor','KNeighborsRegressor','Polynomial',
                'Support Vector Regression', 'Decision Tree Regressor','Random Forest Regressor',
                'Light Gradient B. Mdl']
    r_squared=[]
    rmse=[]
    mae=[]
    
    result = pd.DataFrame(columns=['R_Squared','RMSE','MAE'],index=algo_names)
    
    for item in algos:
        item.fit(x_train,y_train)
        item.predict(x_test)
        r_squared.append(r2_score(y_test,item.predict(x_test)))
        rmse.append((mean_squared_error(y_test,item.predict(x_test)))**.5)
        mae.append(mean_absolute_error(y_test,item.predict(x_test)))
        
    result.R_Squared=r_squared
    result.RMSE=rmse
    result.MAE=mae
        
    return result.sort_values('R_Squared',ascending=False)

In [27]:
#!pip install lightgbm

In [28]:
fnc_all_regression_models(x,y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 263
[LightGBM] [Info] Number of data points in the train set: 15864, number of used features: 5
[LightGBM] [Info] Start training from score 13.963122


Unnamed: 0,R_Squared,RMSE,MAE
GradientBoostingRegressor,0.626436,6.781611,4.260617
Light Gradient B. Mdl,0.620463,6.835617,4.280345
Random Forest Regressor,0.618013,6.857641,4.31894
Polynomial,0.613208,6.900637,4.334638
Support Vector Regression,0.588236,7.119916,4.230119
KNeighborsRegressor,0.557111,7.384103,4.717202
LinearRegression,0.544456,7.488857,4.783295
Ridge,0.544454,7.488877,4.783306
ElasticNet,0.520311,7.684766,4.990767
Lasso,0.519888,7.688151,4.978449


In [29]:
#splitting data
from sklearn.model_selection import train_test_split
#x = np.array(df[["Delivery_person_Age", 
#                   "Delivery_person_Ratings", 
#                   "distance"]])
#y = np.array(df[["Time_taken(min)"]])


xtrain, xtest, ytrain, ytest = train_test_split(x, y,  test_size=0.10, random_state=42)

# creating the LSTM neural network model
from keras.models import Sequential
from keras.layers import Dense, LSTM
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape= (xtrain.shape[1], 1)))
model.add(LSTM(64, return_sequences=False ))
model.add(Dense(64))
model.add(Dense(64))
model.add(Dense(25))
model.add(Dense(19))
model.add(Dense(10))
model.add(Dense(1))
model.summary()

# training the model
model.compile(optimizer='adam', loss='mean_squared_error')


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 5, 128)            66560     
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 25)                1625      
                                                                 
 dense_3 (Dense)             (None, 19)                494       
                                                                 
 dense_4 (Dense)             (None, 10)                2

In [30]:
model.fit(xtrain, ytrain, batch_size=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22a143f0ed0>

In [31]:
predictions = model.predict(xtest)



In [32]:
ytest.shape

(1984,)

In [33]:
Accuracy_percentage_v = 1 - np.abs( predictions.flatten() - ytest ) / np.abs( ytest )
#percentage error is
np.mean( Accuracy_percentage_v )

0.5828891197626616