In [None]:
# Imports
import pandas as pd
import numpy as np

from IPython.display import clear_output

In [None]:
# Load the raw input data from '0_data.csv' (path is relative to the working directory)
df = pd.read_csv('0_data.csv')

In [None]:
# Testing shortcut: continue with the first 10000 rows only
df = df.iloc[:10000]

In [None]:
# Preview the first rows instead of rendering the whole frame
df.head()

In [None]:
# Summary of the columns: dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Radius of the Earth used to scale the haversine result (6371 km)
radius_earth = 6371


def rad(degree):
    """Convert an angle (scalar or array-like) from degrees to radians."""
    scaled = degree * np.pi
    return scaled / 180

# **Function to calculate distance using Haversine Formula**

In [None]:
def dist(lat1, lon1, lat2, lon2, radius=None):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit. Defaults to
        the module-level ``radius_earth``.

    Returns
    -------
    float or array-like
        ``radius`` multiplied by the central angle between the points.
        Array-like inputs are processed element-wise.
    """
    if radius is None:
        radius = radius_earth

    d_lat = np.radians(lat2 - lat1)
    d_lon = np.radians(lon2 - lon1)
    a = (
        np.sin(d_lat / 2) ** 2
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(d_lon / 2) ** 2
    )
    # Floating-point rounding can push ``a`` marginally outside [0, 1] (for
    # example for near-antipodal points), which would make sqrt(1 - a) NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

# **Adding a new column `Distance` between Restaurant & Customer**

In [None]:
# Haversine distance between restaurant (R_*) and customer (C_*) coordinates,
# computed for all rows at once; the assignment creates the column directly.
df["Distance"] = dist(df["R_Lat"], df["R_Lon"], df["C_Lat"], df["C_Lon"])

In [None]:
# Preview the first rows to check the new Distance column
df.head()

In [None]:
# Checkpoint: save the frame with the Distance column (row index is not written)
df.to_csv('1_data_with_distance.csv', index=False)

# **Adding a new column `Time`, i.e. the delivery time in seconds**

In [None]:
# Reload the checkpoint that contains the Distance column
df = pd.read_csv('1_data_with_distance.csv')

In [None]:
def get_time(t1, t2):
    """Return the time elapsed from ``t1`` to ``t2`` in seconds.

    Parameters
    ----------
    t1, t2 : str, datetime-like, list-like or pandas.Series
        Start and end timestamps; anything ``pd.to_datetime`` can parse.

    Returns
    -------
    float, pandas.Index or pandas.Series
        ``t2 - t1`` in seconds. A Series is returned for Series input, an
        Index for other list-like input and a float for scalar input.
    """
    delta = pd.to_datetime(t2) - pd.to_datetime(t1)

    # Only a Series exposes timedelta methods through ``.dt``; a scalar
    # Timedelta and a TimedeltaIndex provide total_seconds() directly.
    if isinstance(delta, pd.Series):
        return delta.dt.total_seconds()
    return delta.total_seconds()

In [None]:
%%timeit
df["Time"] = np.nan

df["Time"] = get_time(df["order_pickedup_time"], df["order_delivered_time"])

In [None]:
# Preview the first rows to check the new Time column
df.head()

In [None]:
# Checkpoint: save the frame with Distance and Time (row index is not written)
df.to_csv('2_data_with_distance_time.csv', index=False)

# **Function to find `State` of the restaurant**

In [None]:
# Reload the checkpoint that contains the Distance and Time columns
df = pd.read_csv('2_data_with_distance_time.csv')

In [None]:
from geopy.geocoders import Nominatim

# Shared Nominatim geocoder client used by get_state();
# user_agent is the application name sent with each request
geolocator = Nominatim(user_agent="GetLoc")

In [None]:
def get_state(latitude, longitude, geocoder=None):
    """Reverse-geocode a coordinate pair and return the name of its state.

    Parameters
    ----------
    latitude, longitude : number
        Coordinates of the point to look up.
    geocoder : object, optional
        Object with a ``reverse(query)`` method whose result has a ``raw``
        dict. Defaults to the module-level ``geolocator``.

    Returns
    -------
    str
        The ``'state'`` entry of the returned address, or ``''`` when the
        lookup yields no result or the address has no state.

    Raises
    ------
    Exception
        Any error raised by the geocoder (for example network failures) is
        propagated to the caller.
    """
    if geocoder is None:
        geocoder = geolocator

    query = ", ".join([str(latitude), str(longitude)])
    location = geocoder.reverse(query)

    # reverse() returns None when nothing is found for the coordinates.
    if location is None:
        return ''

    return location.raw.get('address', {}).get('state', '')

# **Adding a new column `State` i.e. state of the restaurant**

In [None]:
# Create the column only when it is missing, so re-running this cell resumes
# with the rows that are still empty instead of discarding earlier results.
# ``None`` yields an object column, which can hold the state names.
if "State" not in df.columns:
    df["State"] = None

saved = 0

# Label-based access with 0..len(df)-1 relies on the default integer index
# that read_csv produces.
for i in range(len(df)):
    if pd.isna(df.loc[i, "State"]):
        try:
            df.loc[i, "State"] = get_state(df.loc[i, 'R_Lat'], df.loc[i, 'R_Lon'])
        except Exception:
            # ``Exception`` (not a bare except) keeps the loop interruptible
            # with Ctrl-C / "Interrupt kernel".
            # Due to some issues replacing failed rows with previous state;
            # the first row has no predecessor and is left empty.
            if i > 0:
                df.loc[i, "State"] = df.loc[i - 1, "State"]

    # Saving after a while to prevent data loss
    if i % 100 == 0:
        df.to_csv('3_data_with_distance_time_state.csv', index=False)
        saved = i

    clear_output(wait = True)
    print(f"Saved at {saved}")
    print(i)

In [None]:
# Preview the first rows to check the new State column
df.head()

In [None]:
# Final save after the loop, so rows processed since the last periodic save are written too
df.to_csv('3_data_with_distance_time_state.csv', index=False)

# **Label Encoding of `State` column**

In [None]:
# Reload the checkpoint that contains the State column
df = pd.read_csv('3_data_with_distance_time_state.csv')

In [None]:
# Distinct values of the State column before encoding
df["State"].unique()

In [None]:
from sklearn import preprocessing

In [None]:
label_encoder = preprocessing.LabelEncoder()

# Empty state names are read back from the CSV as NaN. Depending on the
# scikit-learn version, mixing NaN with strings either fails or produces a
# NaN class, so give missing states an explicit label and encode strings only.
state_names = df['State'].fillna('Unknown').astype(str)
df['State'] = label_encoder.fit_transform(state_names)

# Integer codes now stored in the column; label_encoder.classes_ maps them
# back to the state names.
df['State'].unique()

In [None]:
# Preview the first rows to check the encoded State column
df.head()

# **Adding a new column `Hour`, i.e. the hour of the day at which the order was picked up**

In [None]:
%%timeit
for i in range(len(df)):
    hour = pd.Timestamp(df.loc[i, "order_pickedup_time"]).hour
    df.loc[i, "Hour"] = hour

In [None]:
# Preview the first rows to check the new Hour column
df.head()

In [None]:
# Checkpoint: save the frame including the Hour column (row index is not written)
df.to_csv('4_data_with_distance_time_state_hour.csv', index=False)

# **Now we split the dataset into train and test sets**

In [None]:
# Read the checkpoint from the same relative path the notebook writes it to,
# rather than from a machine-specific absolute directory.
df = pd.read_csv('4_data_with_distance_time_state_hour.csv')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (Distance, State, Hour) and target vector (Time) as NumPy arrays
x = df[["Distance", "State", "Hour"]].to_numpy(copy=True)
y = df["Time"].to_numpy(copy=True)

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# **Finally, we are going to train the model. For that purpose, we are going to use LSTM**

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM

In [None]:
# Two stacked LSTM layers followed by two dense layers with a single output.
# Each sample is presented as a sequence with one step per feature column
# and one value per step.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    LSTM(64, return_sequences=False),
    Dense(25),
    Dense(1),
])
model.summary()

In [None]:
# Regression set-up: Adam optimiser minimising the mean squared error
model.compile(optimizer='adam', loss='mean_squared_error')
# Train on the training split for 50 epochs with batches of 100 samples
model.fit(X_train, y_train, epochs=50, batch_size=100)

# **Let's take a sample distance (in km), state and hour, and predict the delivery time (in seconds)**

In [None]:
# One sample in the same column order as x: Distance, State (encoded), Hour
model.predict(np.array([[2.242045, 1, 9]]))