In [2]:
import warnings 
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
try:
    import gmplot
    import googlemaps
except ImportError:
    %pip install gmplot googlemaps 
    import gmplot
    import googlemaps

In [3]:
import sys
import os
from pathlib import Path
from dotenv import load_dotenv

# Parent of the notebook's current working directory; the .env file and the
# src folder are both looked up relative to it.
parent_path = Path.cwd().parent

# Load environment variables from the .env file next to the src folder.
dotenv_path = parent_path / '.env'
load_dotenv(dotenv_path=dotenv_path)

# None when the variable is not set in the environment or the .env file.
GOOGLE_MAPS_API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY')

# Put the src folder at the front of sys.path (once) so that the `features`
# package can be imported below.
src_path = parent_path / "src"
src_dir = str(src_path)
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Now import from features package
from features.distance import calc_distance
from features.gmaps import extract_gmaps_data,merge_gmaps_data
from features.time import extract_time_features
from features.geolocation import clustering
from features.precipitation import extract_precipitation_data

In [None]:
# Read the EDA-processed train/test splits, indexed by row_id.
train_df = pd.read_csv(
    "../data/processed/eda_processed_train.csv",
    index_col="row_id",
)
test_df = pd.read_csv(
    "../data/processed/eda_processed_test.csv",
    index_col="row_id",
)

# `combine` holds references to both frames so later cells can loop over
# them and apply the same feature step to each.
combine = [train_df, test_df]

for label, frame in (("Train:", train_df), ("Test:", test_df)):
    print(label, frame.shape)
train_df.head()

## Calculating Euclidean & Manhattan distance

In [None]:
# Add one column per distance method to both frames; the column name is the
# same string that is passed to calc_distance as `method`.
# The loop variable stays named `df` because the next cell reads it.
for df in combine:
    for method in ('manhattan', 'euclidean'):
        df[method] = calc_distance(df, method=method)

In [None]:
# Show one random row to confirm the distance columns were added.
# The original used `df`, the loop variable left over from the previous cell,
# which is the last element of `combine`, i.e. test_df. Naming the frame
# explicitly keeps the same output without depending on leaked loop state.
test_df.sample()

## Extract Google Maps data

In [None]:
# start = 0
# interval = 10000

# #process training data in batches
# while start < len(train_df):
#     print(f"Train: Now starting with batch of the {start}'s...")
#     train = train_df.loc[start:start+interval-1]
#     train = extract_gmaps_data(train,GOOGLE_MAPS_API_KEY,test=False)
#     start+=interval

# #process test data in batches
# start = 0
# while start < len(test_df):
#     print(f"Test: Now starting with batch of the {start}'s...")
#     test = test_df.loc[start:start+interval-1]
#     test = extract_gmaps_data(test,GOOGLE_MAPS_API_KEY,test=True)
#     start += interval


In [4]:
# Merge the multiple per-batch Google Maps CSV files into one train file and
# one test file. The helper takes no arguments here and prints a completion
# message; the merged files are presumably the ones read in the next cell
# (TODO confirm against features.gmaps).
merge_gmaps_data()

Merge operation completed successfully!


In [None]:
# Import the merged Google Maps results, indexed by row_id like the main frames.
gmaps_train_data = pd.read_csv("../data/processed/gmapsdata/gmaps_train_data.csv", index_col='row_id')
gmaps_test_data = pd.read_csv("../data/processed/gmapsdata/gmaps_test_data.csv", index_col='row_id')

# Copy the two Google Maps columns onto each frame (assignment aligns on the
# row_id index), then drop every row that has a missing value in any column.
# dropna is done in place on purpose: `combine` holds references to these
# exact frame objects, so rebinding the names would leave it stale.
for frame, gmaps_frame in ((train_df, gmaps_train_data), (test_df, gmaps_test_data)):
    for column in ('gmaps_distance', 'gmaps_duration'):
        frame[column] = gmaps_frame[column]
    frame.dropna(axis=0, inplace=True)



In [None]:
# Spot-check five random training rows (unseeded, so the rows differ per run).
train_df.sample(5)

In [None]:
# Spot-check five random test rows (unseeded, so the rows differ per run).
test_df.sample(5)

Sometimes the Google Maps API returns a distance of 0 for certain routes. Treasure Island in San Francisco is one such quirk: Google Maps fails to calculate the driving distance for trips there.

In [None]:
# Treasure Island fix: rows where Google Maps returned a distance of 0 even
# though the Manhattan distance shows the trip is not trivially short.
TI_MIN_MANHATTAN = 2000   # Manhattan distance above which a 0 gmaps distance is treated as an API failure
TI_ASSUMED_SPEED = 11.0   # divisor turning distance into duration; presumably metres per second - TODO confirm units

for df in combine:
    # Build one boolean mask on the full frame instead of filtering first and
    # then applying .loc with a mask that is indexed differently.
    zero_route_mask = (df['gmaps_distance'] == 0) & (df['manhattan'] > TI_MIN_MANHATTAN)
    TI_df = df.loc[zero_route_mask]
    # Replace the missing driving distance with the Manhattan distance.
    df.loc[zero_route_mask, "gmaps_distance"] = TI_df.manhattan
    # Approximate gmaps_duration from the Manhattan distance.
    df.loc[zero_route_mask, "gmaps_duration"] = TI_df.manhattan / TI_ASSUMED_SPEED

## Time Features

In [None]:
# Derive time-based features for both frames held in `combine`.
# Per the original note, the helper adds weekday, hour and date columns, drops
# the raw datetime column and adds a holiday column; the implementation lives
# in features.time and is not visible here.
# The return value is ignored, so the helper presumably modifies the frames
# in place - TODO confirm.
extract_time_features(combine)
train_df.head()

In [None]:
# List the training columns after the time-feature step.
train_df.columns

In [None]:
# Count how many training rows fall under each value of the holiday column.
train_df.holiday.value_counts()

## Geolocations

In [None]:
# Run the geolocation clustering helper on both frames.
# The return value is discarded, so any new columns are presumably written
# onto train_df / test_df in place - TODO confirm against features.geolocation.
clustering(train_df,test_df)

In [None]:
# List the training columns after the clustering step.
train_df.columns

In [None]:
# List the test columns after the clustering step, for comparison with train.
test_df.columns

In [None]:
# Wide-form bar plot: one bar per column, showing that column's mean over the
# training rows (seaborn's default estimator).
# The frame is passed as `data=` because older seaborn releases treat the
# first positional argument as `x`, not as the data frame.
location_columns = ['citycenter', 'airport', 'standalone']
fig, ax = plt.subplots()
sns.barplot(data=train_df[location_columns], ax=ax)
ax.set_title('Mean of citycenter / airport / standalone columns (train)')
ax.set_ylabel('mean');

## Precipitation data

In [None]:
# Add precipitation data for the frames held in `combine`.
# The return value is ignored and the next cell re-reads both frames from
# `combine`, so the helper presumably replaces the list elements with new
# frames - TODO confirm against features.precipitation.
extract_precipitation_data(combine)

In [None]:
# Rebind both names to whatever frames `combine` currently holds, so the
# cells below work on the same objects as the loops over `combine`.
train_df, test_df = combine[0], combine[1]
train_df.head()

## Marking Outliers

In [None]:
# Mark routing errors and short trips with 0.0 / 1.0 flag columns.
# Both flags apply only to rows whose Manhattan distance is below
# SAME_SPOT_MANHATTAN; the Google Maps distance then decides which one is set.
# Units are presumably metres - TODO confirm.
SAME_SPOT_MANHATTAN = 50
ROUTING_ERROR_DISTANCE = 500

for df in combine:
    near_zero_trip = df.manhattan < SAME_SPOT_MANHATTAN

    # Long driving route although pickup and drop-off are almost identical.
    is_routing_error = near_zero_trip & (df.gmaps_distance > ROUTING_ERROR_DISTANCE)
    # `<=` (the original used `<`) so that a distance of exactly
    # ROUTING_ERROR_DISTANCE is classified instead of falling between the flags.
    is_short_trip = near_zero_trip & (df.gmaps_distance <= ROUTING_ERROR_DISTANCE)

    # Stored as floats, the same dtype the original np.zeros columns had.
    df['routing_error'] = is_routing_error.astype(float)
    df['short_trip'] = is_short_trip.astype(float)

In [None]:
# Spot-check five random training rows, now including the routing_error and
# short_trip flags.
train_df.sample(5)

In [None]:
# Spot-check five random test rows, now including the routing_error and
# short_trip flags.
test_df.sample(5)

In [None]:
# Heatmap of pairwise Pearson correlations between the training features.
colormap = plt.cm.viridis

# Restrict to numeric/bool columns before calling corr(): since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead. select_dtypes works on older pandas releases too.
numeric_train_df = train_df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10,10))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(
    numeric_train_df.corr().round(2),
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    linecolor='white',
    annot=True,
);