In [193]:
from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 100)


In [194]:
features_df = pd.read_csv(
    "./train.csv", 
     index_col="tripid"
)
labels_df = pd.read_csv(
    "./labels.csv", 
     index_col="tripid"
)

In [195]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(**kwargs):
    R = 6373.0

    lat1 = radians(kwargs['pick_lat'])
    lon1 = radians(kwargs['pick_lon'])
    lat2 = radians(kwargs['drop_lat'])
    lon2 = radians(kwargs['drop_lon'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [196]:
def format_df(dataFrame):
    
    df = dataFrame
    
    #calculatre the distance
    df['distance'] = [calculate_distance(**df[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']].iloc[i].to_dict()) for i in range(df.shape[0])]
    
    #calculate the time difference between pickup and drop time in seconds
    df['time_difference'] = (pd.DatetimeIndex(df['drop_time']) - pd.DatetimeIndex(df['pickup_time'])).seconds
    
    #fill the missing values for the duration  
    df.loc[(pd.isnull(df.duration)), 'duration'] = df.time_difference

    #fill the missing values for additional_fare
    df.loc[(pd.isnull(df.additional_fare)), 'additional_fare'] = 10.5
    
    #fill the missing values for meter_waiting_till_pickup
    df.loc[(pd.isnull(df.meter_waiting_till_pickup)), 'meter_waiting_till_pickup'] = 60
    
    #fill the missing values for meter_waiting
    df.loc[(pd.isnull(df.meter_waiting)), 'meter_waiting'] = 120
    
    #fill the missing values for meter_waiting_fare
    df.loc[(pd.isnull(df.meter_waiting_fare)), 'meter_waiting_fare'] = df.meter_waiting * 1.75/60

    #fill the missing values for fare
    df.loc[(pd.isnull(df.fare)), 'fare'] = df.meter_waiting_fare + df.additional_fare + df.distance

    df = df.drop(['pickup_time','drop_time','pick_lat','pick_lon','drop_lat','drop_lon'], axis = 1) 
    


    
    return df

In [197]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6  

In [198]:
features_df = features_df.drop(['label'], axis = 1) 

numeric_cols = features_df.columns[features_df.dtypes != "object"].values


In [199]:
# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))
])

# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [200]:
estimators = MultiOutputClassifier(
    estimator=LogisticRegression(penalty="l2", C=1)
)

In [201]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [202]:
full_pipeline

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                 

In [203]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [204]:
%%time

# Train model
full_pipeline.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict_proba(X_eval)
preds

Wall time: 301 ms


[array([[0.91281879, 0.08718121],
        [0.91306995, 0.08693005],
        [0.93922644, 0.06077356],
        ...,
        [0.93731483, 0.06268517],
        [0.93171891, 0.06828109],
        [0.63789393, 0.36210607]])]

In [205]:
print("test_probas[0].shape", preds[0].shape)
#print("test_probas[1].shape", preds[1].shape)

test_probas[0].shape (5669, 2)


In [206]:
y_preds = pd.DataFrame(
    {
        "label": preds[0][:, 1],
        #"seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (5669, 1)


Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
194824749,0.087181
206975816,0.08693
195240179,0.060774
205348966,0.067443
200208485,0.171143


In [207]:
test_features = pd.read_csv("./test.csv", 
                               index_col="tripid")

In [208]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
189125358,10.5,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.5,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
213812756,10.5,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
213813930,10.5,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
213815405,10.5,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


In [209]:
#test_features = test_features.drop(['tripid'], axis = 1) 
test_features

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.44860,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.8750,6.77490,79.8840,289.27
213286352,10.5,4249,20,0.00000,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.70
213293973,10.5,1552,255,2.65880,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.00
213294622,10.5,462,16,0.00000,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.36920,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.9130,6.98875,79.8914,147.47
...,...,...,...,...,...,...,...,...,...,...,...,...
222856243,10.5,1723,429,24.83332,3,3/16/2020 21:28,3/16/2020 21:56,6.85103,79.9567,6.85588,79.9214,388.48
222857785,10.5,1378,80,0.00000,125,3/16/2020 21:59,3/16/2020 22:22,6.91293,79.9656,6.92112,79.8980,379.85
222858416,10.5,418,56,3.28440,93,3/16/2020 22:02,3/16/2020 22:09,6.85718,79.9081,6.83868,79.9083,112.79
222858691,10.5,1604,548,31.67440,17,3/16/2020 22:07,3/16/2020 22:34,6.91289,79.8846,6.93159,79.9145,248.46


In [210]:
test_probas = full_pipeline.predict(test_features)
test_probas

array([['correct'],
       ['incorrect'],
       ['correct'],
       ...,
       ['correct'],
       ['correct'],
       ['correct']], dtype=object)

In [211]:
submission_df = pd.read_csv("./sample_submission.csv", 
                            index_col="tripid")

In [212]:
submission_df.head()


Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1


In [217]:
# Make sure we have the rows in the same order
np.testing.assert_array_equal(test_features.index.values, submission_df.index.values)

# Save predictions to submission data frame
submission_df["prediction"] = test_probas[0][:, 1]

submission_df.head()

IndexError: too many indices for array

In [214]:
test_features.index.values

array([213284604, 213286352, 213293973, ..., 222858416, 222858691,
       222860703], dtype=int64)

In [215]:
submission_df.index.values

array([213284604, 213286352, 213293973, ..., 222858416, 222858691,
       222860703], dtype=int64)

In [219]:
submission_df.to_csv('my_new_submission.csv', index=True)


In [218]:
submission_df

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,0
213294622,1
213298687,1
...,...
222856243,1
222857785,0
222858416,0
222858691,0


In [220]:
df

NameError: name 'df' is not defined

In [221]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
189125358,10.5,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.5,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
213812756,10.5,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
213813930,10.5,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
213815405,10.5,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


In [222]:
features_df['additional_fare'].mean()

13.719650642158596

In [223]:
features_df['meter_waiting_till_pickup'].mean()

112.466831624838

In [269]:
features_df['meter_waiting'].describe()

count     16974.000000
mean        629.074231
std        8063.260669
min           0.000000
25%          33.000000
50%         123.000000
75%         351.000000
max      453650.000000
Name: meter_waiting, dtype: float64

In [227]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.5,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
189125358,10.5,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.5,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.5,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
213812756,10.5,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
213813930,10.5,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
213815405,10.5,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


In [228]:
features_df.loc[(pd.isnull(features_df.additional_fare)), 'additional_fare'] = features_df['additional_fare'].mean()


In [229]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


In [234]:
features_df['meter_waiting'].describe()


count     16974.000000
mean        629.074231
std        8063.260669
min           0.000000
25%          33.000000
50%         123.000000
75%         351.000000
max      453650.000000
Name: meter_waiting, dtype: float64

In [273]:
features_df['meter_waiting'].median()

123.0

In [243]:
features_df['meter_waiting'].value_counts().nlargest(n=30)

0.0      729
10.0     386
20.0     305
30.0     251
40.0     202
50.0     151
60.0     134
11.0     120
70.0     111
2.0      110
12.0     107
21.0     105
31.0     105
14.0      99
80.0      99
90.0      99
13.0      96
3.0       94
17.0      93
9.0       93
22.0      93
15.0      92
18.0      90
6.0       89
120.0     88
1.0       87
32.0      87
100.0     86
110.0     86
23.0      84
Name: meter_waiting, dtype: int64

In [244]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(**kwargs):
    R = 6373.0

    lat1 = radians(kwargs['pick_lat'])
    lon1 = radians(kwargs['pick_lon'])
    lat2 = radians(kwargs['drop_lat'])
    lon2 = radians(kwargs['drop_lon'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return R * c

In [246]:
features_df['distance'] = [calculate_distance(**features_df[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']].iloc[i].to_dict()) for i in range(features_df.shape[0])]


In [247]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981


In [248]:
features_df['time_difference'] = (pd.DatetimeIndex(features_df['drop_time']) - pd.DatetimeIndex(features_df['pickup_time'])).seconds


In [249]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840


In [252]:
features_df['fare_per_km'] = ((features_df['fare']) / (features_df['distance']))


In [253]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,53.062512
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,62.431918
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,47.823385
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,95.451642
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,43.972399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,94.138917
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,53.462222
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,72.859628
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,46.288506


In [254]:
features_df['fare_per_km'].describe()


count    1.703900e+04
mean              inf
std               NaN
min      0.000000e+00
25%      4.802738e+01
50%      5.879010e+01
75%      8.801478e+01
max               inf
Name: fare_per_km, dtype: float64

In [264]:
features_df['fare_per_km']=features_df.round({'fare_per_km':0})

In [265]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5


In [267]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5


In [268]:
features_df['meter_waiting'].notna(how = "any")

TypeError: notna() got an unexpected keyword argument 'how'

In [270]:
features_df['waiting_fare_per_s'] = ((features_df['meter_waiting_fare']) / (features_df['meter_waiting']))


In [271]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000


In [272]:
features_df['waiting_fare_per_s'].describe()

count    16245.000000
mean         0.029266
std          0.028498
min          0.000000
25%          0.000000
50%          0.038909
75%          0.057912
max          0.061413
Name: waiting_fare_per_s, dtype: float64

In [274]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000


In [275]:
from math import sin, cos, sqrt, atan2, radians
def calculate_plain_distance(**kwargs):

    lat1 = radians(kwargs['pick_lat'])
    lon1 = radians(kwargs['pick_lon'])
    lat2 = radians(kwargs['drop_lat'])
    lon2 = radians(kwargs['drop_lon'])

    dlon = (lon2 - lon1)**2
    dlat = (lat2 - lat1)**2
   
    d = dlon + dlat
    sd = sqrt(d)
    return sd

In [276]:
features_df['plain_distance'] = [calculate_plain_distance(**features_df[['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon']].iloc[i].to_dict()) for i in range(features_df.shape[0])]


In [277]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455


In [278]:
features_df['time_err'] = features_df['time_difference'] - features_df['duration']

In [279]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0


In [285]:
features_df['pickup_hour'] =pd.to_datetime(features_df['pickup_time']).dt.hour


In [292]:
features_df['pickup_hour'].value_counts().sort_index()

0      184
1       83
2       77
3       45
4       57
5      105
6      317
7      641
8      941
9     1012
10    1076
11    1199
12    1208
13    1156
14    1107
15    1129
16    1229
17    1244
18    1138
19     970
20     858
21     693
22     426
23     281
Name: pickup_hour, dtype: int64

In [299]:
features_df['session']=pd.cut(features_df.pickup_hour,
               [0,8,18,23],
               labels=[0,1,0.0000000000000000000000000000000000001],
               include_lowest=True)

In [300]:
features_df


Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37


In [302]:
features_df["PICKUP_DAY_OF_WEEK"]=pd.to_datetime(features_df["pickup_time"]).dt.dayofweek


In [303]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session,PICKUP_DAY_OF_WEEK
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00,4
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00,4
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00,4
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00,4
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37,4
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37,4
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37,4
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37,4


In [313]:
features_df["PICKUP_DAY_OF_WEEK"]=pd.to_datetime(features_df["pickup_time"]).dt.dayofweek
features_df["day"]=features_df["PICKUP_DAY_OF_WEEK"].map( {0:'MONDAY',1:'TUESDAY',2:'WEDNESDAY', 3:'THURSDAY',4:'FRIDAY', 5:'SATURDAY',6:'SUNDAY'} ).astype(str)

In [314]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session,PICKUP_DAY_OF_WEEK,day
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00,4,FRIDAY
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00,4,FRIDAY
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00,4,FRIDAY
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00,4,FRIDAY
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00,4,FRIDAY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37,4,FRIDAY
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37,4,FRIDAY
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37,4,FRIDAY
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37,4,FRIDAY


In [315]:
#the function gets day of week.return 1 if day is weekend (saturday or sunday) otherwise 0
def isWeekEnd(x):
    if((x=="SATURDAY") | (x=="SUNDAY")):
        return 1
    else:
        return 0

In [316]:
features_df["IS_WEEK_END"]=features_df["day"].apply(lambda x:isWeekEnd(x))


In [317]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session,PICKUP_DAY_OF_WEEK,day,IS_WEEK_END
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00,4,FRIDAY,0
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00,4,FRIDAY,0
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00,4,FRIDAY,0
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00,4,FRIDAY,0
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00,4,FRIDAY,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37,4,FRIDAY,0
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37,4,FRIDAY,0
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37,4,FRIDAY,0
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37,4,FRIDAY,0


In [321]:
features_df['drop_lat'].value_counts().nlargest(n=30)

6.93296    9
6.93299    7
6.93295    7
6.93292    7
6.91279    6
6.91174    6
6.93402    6
7.29304    6
6.91777    6
6.90415    5
6.91717    5
6.91719    5
6.92145    5
6.88879    5
6.84777    5
6.87400    5
6.84021    5
6.89417    5
6.93344    5
7.29309    5
6.93407    5
6.92017    5
6.93302    5
6.93286    5
6.93422    5
6.93642    5
6.93297    5
6.91747    5
6.86939    5
6.89488    5
Name: drop_lat, dtype: int64

In [322]:
features_df['drop_lon'].value_counts().nlargest(n=30)

79.8661    41
79.8648    39
79.8784    39
79.8670    32
79.8774    32
79.8562    31
79.8553    31
79.8606    29
79.8667    29
79.8680    29
79.8930    28
79.8669    28
79.8847    28
79.8619    27
79.8872    27
79.8772    27
79.8664    27
79.8846    27
79.8760    26
79.8778    26
79.8516    26
79.8624    26
79.8668    26
79.8655    25
79.8732    25
79.8674    25
79.8646    25
79.8692    25
79.8918    25
79.8932    25
Name: drop_lon, dtype: int64

In [323]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session,PICKUP_DAY_OF_WEEK,day,IS_WEEK_END
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00,4,FRIDAY,0
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00,4,FRIDAY,0
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00,4,FRIDAY,0
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00,4,FRIDAY,0
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00,4,FRIDAY,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37,4,FRIDAY,0
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37,4,FRIDAY,0
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37,4,FRIDAY,0
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37,4,FRIDAY,0


In [324]:
h = features_df.groupby('pickup_hour').size()

In [325]:
hf = h/len(features_df)

In [326]:
features_df['hour_freq'] = features_df.pickup_hour.map(hf)

In [327]:
features_df

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,distance,time_difference,fare_per_km,waiting_fare_per_s,plain_distance,time_err,pickup_hour,session,PICKUP_DAY_OF_WEEK,day,IS_WEEK_END,hour_freq
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
189123628,10.500000,834.0,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32,5.094369,840,10.5,0.0000,0.000801,6.0,0,0.000000e+00,4,FRIDAY,0,0.010713
189125358,10.500000,791.0,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,3.169052,780,10.5,0.0000,0.000497,-11.0,0,0.000000e+00,4,FRIDAY,0,0.010713
189125719,10.500000,1087.0,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,6.307375,1080,10.5,0.0000,0.000995,-7.0,1,0.000000e+00,4,FRIDAY,0,0.004832
189127273,10.500000,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30,0.862217,600,10.5,0.0578,0.000136,2.0,2,0.000000e+00,4,FRIDAY,0,0.004483
189128020,13.719651,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39,8.150340,1020,13.7197,,0.001287,,3,0.000000e+00,4,FRIDAY,0,0.002620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213803193,10.500000,838.0,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26,2.106037,840,10.5,0.0583,0.000333,2.0,22,1.000000e-37,4,FRIDAY,0,0.024802
213812756,10.500000,2151.0,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23,10.871789,2160,10.5,0.0000,0.001716,9.0,23,1.000000e-37,4,FRIDAY,0,0.016360
213813930,10.500000,263.0,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20,1.045847,240,10.5,0.0000,0.000164,-23.0,23,1.000000e-37,4,FRIDAY,0,0.016360
213815405,10.500000,858.0,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31,2.879981,840,10.5,0.0000,0.000455,-18.0,23,1.000000e-37,4,FRIDAY,0,0.016360
