In [1]:
import warnings 
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import DBSCAN
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten,BatchNormalization
from keras.regularizers import l2
from xgboost import XGBRegressor

try:
    import gmplot
    import googlemaps
except ImportError:
    %pip install gmplot googlemaps 
    import gmplot
    import googlemaps

In [2]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

parent_path = Path().cwd().parent  

#.env
dotenv_path = parent_path / '.env'
load_dotenv(dotenv_path=dotenv_path)

GOOGLE_MAPS_API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')

#src folder 
src_path = parent_path / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Now import from features package
from features.distance import calc_distance
from features.gmaps import extract_gmaps_data,merge_gmaps_data
from features.time import extract_time_features

In [3]:
train_df = pd.read_csv("../data/processed/eda_processed_train.csv")
test_df = pd.read_csv("../data/processed/eda_processed_test.csv")
combine = [train_df,test_df]

print("Train:",train_df.shape)
print("Test:",test_df.shape)
train_df.head()

Train: (140965, 7)
Test: (30000, 6)


Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,duration
0,0,-73.783279,40.644718,-73.963799,40.68132,2015-01-09 00:51:48,1815.0
1,1,-122.422385,37.789408,-122.41989,37.766692,2012-09-14 01:46:00,300.0
2,2,-73.885292,40.77306,-74.000076,40.76136,2015-04-19 20:26:57,2620.0
3,3,-122.401603,37.788975,-122.410958,37.796447,2012-09-03 23:48:00,360.0
4,4,-74.008728,40.714317,-74.006432,40.73172,2015-01-21 12:14:45,582.0


## Calculating Euclidean & Manhattan distance

In [4]:
for df in combine:
    df['manhattan'] = calc_distance(df,method='manhattan')
    df['euclidean'] = calc_distance(df,method='euclidean')

In [5]:
df.sample()

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,manhattan,euclidean
22091,22091,-73.863403,40.770008,-73.981567,40.749844,2015-11-05 23:44:41,15398.602522,10213.203817


## Extract Google Maps data

In [6]:
# start = 0
# interval = 10000

# #process training data in batches
# while start < len(train_df):
#     print(f"Train: Now starting with batch of the {start}'s...")
#     train = train_df.loc[start:start+interval-1]
#     train = extract_gmaps_data(train,GOOGLE_MAPS_API_KEY,test=False)
#     start+=interval

# #process test data in batches
# start = 0
# while start < len(test_df):
#     print(f"Test: Now starting with batch of the {start}'s...")
#     test = test_df.loc[start:start+interval-1]
#     test = extract_gmaps_data(test,GOOGLE_MAPS_API_KEY,test=True)
#     start += interval


In [7]:
#merging multiple gmaps csv into gmaps train and test respectively
# merge_gmaps_data()

In [8]:
#importing gmaps data
gmaps_train_data = pd.read_csv("../data/processed/gmapsdata/gmaps_train_data.csv",index_col='row_id')
gmaps_test_data = pd.read_csv("../data/processed/gmapsdata/gmaps_test_data.csv",index_col='row_id')

train_df['gmaps_distance'] = gmaps_train_data['gmaps_distance']
train_df['gmaps_duration'] = gmaps_train_data['gmaps_duration']
train_df.dropna(axis=0,inplace=True)


test_df['gmaps_distance'] = gmaps_test_data['gmaps_distance']
test_df['gmaps_duration'] = gmaps_test_data['gmaps_duration']
test_df.dropna(axis=0,inplace=True)


In [9]:
train_df.sample(5)

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,duration,manhattan,euclidean,gmaps_distance,gmaps_duration
94137,97462,-73.977875,40.754326,-73.969521,40.756618,2015-10-16 23:12:11,585.0,1185.107299,749.231355,3573,759
100845,104403,-122.422672,37.792772,-122.408137,37.776502,2012-09-12 14:43:00,720.0,3429.196914,2217.105226,1306,237
61069,63203,-73.982315,40.758831,-73.98877,40.745361,2015-04-19 02:08:33,405.0,2218.040854,1595.221485,4685,666
116330,120444,-122.397,37.785782,-122.384598,37.61528,2012-09-03 06:55:00,900.0,20360.780144,19011.60288,1666,440
133423,138174,-122.395358,37.776625,-122.410607,37.779507,2012-09-13 19:11:00,240.0,2018.333688,1379.518971,17744,1576


In [10]:
test_df.sample(5)

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,manhattan,euclidean,gmaps_distance,gmaps_duration
10041,10041,-73.979782,40.737427,-73.981323,40.746655,2015-01-14 14:17:20,1198.799596,1035.445715,1462,377
27243,27243,-73.955978,40.778835,-73.97068,40.765728,2015-04-08 21:24:30,3095.683719,1914.433506,2436,582
4013,4013,-122.410612,37.791822,-122.39961,37.793355,2012-09-05 18:50:00,1395.389817,982.759245,982,277
19805,19805,-73.978523,40.7859,-73.978508,40.757332,2015-11-25 11:08:45,3181.845005,3180.175464,3632,1005
4231,4231,-122.435187,37.800052,-122.419725,37.784968,2012-09-08 02:54:00,3400.365166,2160.922128,3338,679


sometimes there are chances that Google Maps API returns 0 distance for certain routes. Treasure Island in San Francisco has weird quirks where Google Maps fails to calculate the driving distance.

In [11]:
#Tresure Island fix
for df in combine:
    TI_df = df[df['gmaps_distance']==0].loc[df.manhattan>2000]
    #replacing them with manhattan distance
    df.loc[TI_df.index,"gmaps_distance"] = TI_df.manhattan
    #approximating gmaps_duration 
    df.loc[TI_df.index,"gmaps_duration"] = TI_df.manhattan/11.0

## Time Features

In [12]:
#add weekdays,hour,date column and drop datetime
#adding holiday column
extract_time_features(combine)
train_df.head()

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,duration,manhattan,euclidean,gmaps_distance,gmaps_duration,weekday,hour,date,holiday
0,0,-73.783279,40.644718,-73.963799,40.68132,1815.0,24169.91048,15778.616804,19322.0,2567.0,5,0,2015-01-09,0
1,1,-122.422385,37.789408,-122.41989,37.766692,300.0,2806.475682,2538.244354,2855.0,550.0,5,1,2012-09-14,0
2,2,-73.885292,40.77306,-74.000076,40.76136,2620.0,14080.134473,9764.679964,14693.0,1876.0,7,20,2015-04-19,0
3,3,-122.401603,37.788975,-122.410958,37.796447,360.0,1873.173072,1170.082524,2019.0,578.0,1,23,2012-09-03,0
4,4,-74.008728,40.714317,-74.006432,40.73172,582.0,2192.882649,1946.952973,2382.0,608.0,3,12,2015-01-21,0


In [14]:
train_df.columns

Index(['row_id', 'start_lng', 'start_lat', 'end_lng', 'end_lat', 'duration',
       'manhattan', 'euclidean', 'gmaps_distance', 'gmaps_duration', 'weekday',
       'hour', 'date', 'holiday'],
      dtype='object')

In [15]:
train_df.holiday.value_counts()

holiday
0    137357
1      3608
Name: count, dtype: int64