In [1]:
import warnings 
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")
# Also ignore DeprecationWarning explicitly (already covered by the blanket filter above).
warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import datetime
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.cluster import DBSCAN
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


from keras.models import Sequential
from keras.layers import Dense,Activation,Flatten,BatchNormalization
from keras.regularizers import l2
from xgboost import XGBRegressor

try:
    import gmplot
    import googlemaps
except ImportError:
    %pip install gmplot googlemaps 
    import gmplot
    import googlemaps

In [None]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Parent of the current working directory; the .env file and the src/ package
# are both looked up relative to it.
parent_path = Path.cwd().parent

# Load variables from the .env file into the process environment.
dotenv_path = parent_path / '.env'
load_dotenv(dotenv_path=dotenv_path)

# os.environ.get returns None when the key is not set.
GOOGLE_MAPS_API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')

# Make the src/ folder importable, adding it to sys.path at most once.
src_path = parent_path / "src"
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# Now import from features package
from features.distance import calc_distance
from features.gmaps import extract_gmaps_data,merge_gmaps_data

In [3]:
# Read the EDA-processed train and test splits.
train_df = pd.read_csv("../data/processed/eda_processed_train.csv")
test_df = pd.read_csv("../data/processed/eda_processed_test.csv")

# Both frames are kept in one list so later cells can apply the same
# feature engineering to each of them in a single loop.
combine = [train_df,test_df]

# Report the shape of each split, then preview the training rows.
for label, frame in zip(("Train:", "Test:"), combine):
    print(label, frame.shape)
train_df.head()

Train: (140965, 7)
Test: (30000, 6)


Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,duration
0,0,-73.783279,40.644718,-73.963799,40.68132,2015-01-09 00:51:48,1815.0
1,1,-122.422385,37.789408,-122.41989,37.766692,2012-09-14 01:46:00,300.0
2,2,-73.885292,40.77306,-74.000076,40.76136,2015-04-19 20:26:57,2620.0
3,3,-122.401603,37.788975,-122.410958,37.796447,2012-09-03 23:48:00,360.0
4,4,-74.008728,40.714317,-74.006432,40.73172,2015-01-21 12:14:45,582.0


## Calculating Euclidean & Manhattan distance

In [4]:
# Add one distance column per method to both frames; each column is named
# after the method passed to calc_distance.
for df in combine:
    for method in ('manhattan', 'euclidean'):
        df[method] = calc_distance(df,method=method)

In [5]:
# Preview one random row of the test frame. This names test_df explicitly
# instead of relying on the loop variable `df` left over from the previous cell.
test_df.sample()

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,manhattan,euclidean
12473,12473,-73.968735,40.754261,-73.962502,40.763176,2015-11-22 01:35:28,1686.267647,1122.99101


## Extract Google Maps data

In [6]:
# NOTE: this whole cell is intentionally commented out. It is the extraction
# step that passes the train and test frames to extract_gmaps_data in batches
# of `interval` rows, using GOOGLE_MAPS_API_KEY. Later cells load the resulting
# gmaps data from CSV files instead, so this cell presumably only needs to be
# re-enabled to regenerate those files (TODO confirm).

# start = 0
# interval = 10000

# #process training data in batches
# while start < len(train_df):
#     print(f"Train: Now starting with batch of the {start}'s...")
#     train = train_df.loc[start:start+interval-1]
#     train = extract_gmaps_data(train,GOOGLE_MAPS_API_KEY,test=False)
#     start+=interval

# #process test data in batches
# start = 0
# while start < len(test_df):
#     print(f"Test: Now starting with batch of the {start}'s...")
#     test = test_df.loc[start:start+interval-1]
#     test = extract_gmaps_data(test,GOOGLE_MAPS_API_KEY,test=True)
#     start += interval


In [None]:
# Merge the per-batch gmaps CSV files into one train CSV and one test CSV.
# Left commented out; the saved output below is from an earlier run of the merge.
# merge_gmaps_data()

Merging 15 train CSV files...
Saved merged train data: C:\Users\Harshita\Documents\GoPredict\data\processed\gmapsdata\gmaps_train_data.csv
Removed train/ subdirectory
Merging 3 test CSV files...
Saved merged test data: C:\Users\Harshita\Documents\GoPredict\data\processed\gmapsdata\gmaps_test_data.csv
Removed test/ subdirectory
Merge operation completed successfully!


In [11]:
#importing gmaps data
gmaps_train_data = pd.read_csv("../data/processed/gmapsdata/gmaps_train_data.csv",index_col='row_id')
gmaps_test_data = pd.read_csv("../data/processed/gmapsdata/gmaps_test_data.csv",index_col='row_id')

train_df['gmaps_distance'] = gmaps_train_data['gmaps_distance']
train_df['gmaps_duration'] = gmaps_train_data['gmaps_duration']
train_df.dropna(axis=0,inplace=True)


test_df['gmaps_distance'] = gmaps_test_data['gmaps_distance']
test_df['gmaps_duration'] = gmaps_test_data['gmaps_duration']
test_df.dropna(axis=0,inplace=True)


In [18]:
train_df.sample(5)

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,duration,manhattan,euclidean,gmaps_distance,gmaps_duration
106554,110309,-73.989052,40.75848,-73.983994,40.769871,2015-05-09 14:13:45,353.0,1831.094304,1337.831664,2313,585
7370,7656,-73.953362,40.8228,-73.963837,40.808002,2015-08-15 10:35:27,221.0,2813.377491,1868.805148,2291,415
42244,43740,-73.97905,40.787071,-73.986481,40.748661,2015-04-19 12:43:10,873.0,5102.996777,4321.43632,2735,878
87760,90881,-122.41563,37.791145,-122.404225,37.79249,2012-09-03 02:13:00,180.0,1419.323508,1014.401427,1072,312
6261,6503,-73.985504,40.741516,-73.97831,40.751457,2015-10-26 21:00:03,264.0,1907.459475,1262.032532,1567,429


In [19]:
test_df.sample(5)

Unnamed: 0,row_id,start_lng,start_lat,end_lng,end_lat,datetime,manhattan,euclidean,gmaps_distance,gmaps_duration
5368,5368,-73.98159,40.743027,-73.989182,40.738621,2015-09-15 16:34:56,1335.611251,806.593645,1211,341
5102,5102,-73.99305,40.743137,-74.003647,40.745689,2015-06-05 20:47:58,1463.739984,937.80293,1407,346
11357,11357,-73.977722,40.763275,-73.979187,40.781788,2015-04-13 21:42:22,2223.940787,2064.555126,2794,668
12159,12159,-73.980186,40.736511,-73.982292,40.748379,2015-12-13 18:03:53,1555.578564,1333.02664,2036,546
1385,1385,-73.990929,40.727795,-73.982224,40.740067,2015-03-28 03:55:15,2335.148958,1550.947764,1551,283


Sometimes the Google Maps API returns a distance of 0 for certain routes. Treasure Island in San Francisco is one such case: Google Maps fails to calculate the driving distance for trips there, so those rows need a fallback.

In [21]:
# Treasure Island fix
# A gmaps_distance of 0 is only treated as a failed lookup when the trip is
# clearly not a zero-length one, i.e. its Manhattan distance exceeds this
# threshold (presumably metres - same unit as the `manhattan` column).
TI_MIN_MANHATTAN = 2000
# Divisor used to approximate a duration from the fallback distance
# (presumably an average speed in m/s, about 40 km/h - TODO confirm units).
TI_FALLBACK_SPEED = 11.0

for df in combine:
    # Single boolean mask on the frame's own index, instead of filtering the
    # frame and then re-filtering it with a mask built on the unfiltered frame.
    failed_route = (df['gmaps_distance'] == 0) & (df['manhattan'] > TI_MIN_MANHATTAN)
    if not failed_route.any():
        continue

    # The fallback values are floats. Cast the target columns explicitly so the
    # assignment below does not rely on pandas silently upcasting integer
    # columns (deprecated behaviour in recent pandas versions).
    for gmaps_col in ('gmaps_distance', 'gmaps_duration'):
        df[gmaps_col] = df[gmaps_col].astype('float64')

    fallback_distance = df.loc[failed_route, 'manhattan']
    # Replace the failed distances with the Manhattan distance.
    df.loc[failed_route, 'gmaps_distance'] = fallback_distance
    # Approximate gmaps_duration from the fallback distance.
    df.loc[failed_route, 'gmaps_duration'] = fallback_distance / TI_FALLBACK_SPEED

## Time Features