In [None]:
# Imports
import pandas as pd
import numpy as np

from IPython.display import clear_output

In [None]:
# Read CSV
df = pd.read_csv('data/0_data.csv')

In [None]:
# For testing purposes only: limit the run to the first SAMPLE_ROWS rows.
# Set SAMPLE_ROWS = None to process the whole file.
SAMPLE_ROWS = 100
df = df if SAMPLE_ROWS is None else df[:SAMPLE_ROWS]

In [None]:
df

In [None]:
df.info()

In [None]:
# Checking NULL values
df.isnull().sum()

# **Calculating `Distance`**

In [None]:
# Radius of the Earth in kilometres (used to scale the haversine result)
radius_earth = 6371


def rad(degree):
    """Convert an angle from degrees to radians.

    Works on anything that supports ``* float`` and ``/ int``: a plain
    number, a NumPy array or a pandas Series (element-wise in the latter
    two cases).
    """
    scaled_by_pi = degree * np.pi
    return scaled_by_pi / 180

In [None]:
# Function to calculate distance using Haversine Formula
def dist(lat1, lon1, lat2, lon2, radius=None):
    """Great-circle distance between two points via the haversine formula.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.
        All four may be scalars, NumPy arrays or pandas Series; the
        computation is element-wise.
    radius : sphere radius. Defaults to the notebook-level ``radius_earth``
        when omitted.

    Returns
    -------
    The distance, in the same unit as ``radius``, with the same shape as
    the inputs.
    """
    if radius is None:
        radius = radius_earth

    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    d_lat = np.radians(lat2 - lat1)
    d_lon = np.radians(lon2 - lon1)

    a = np.sin(d_lat / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [None]:
# Distance from the restaurant coordinates (R_*) to the C_* coordinates
# (presumably the customer's; confirm against the data dictionary).
df["Distance (km)"] = dist(
    lat1=df["R_Lat"],
    lon1=df["R_Lon"],
    lat2=df["C_Lat"],
    lon2=df["C_Lon"],
)

In [None]:
df.head()

In [None]:
df.to_csv('data/1_data_with_distance.csv', index=False)

# **Calculating `Delivery_Time`**

In [None]:
df = pd.read_csv('data/1_data_with_distance.csv')

In [None]:
def get_time(t1, t2):
    """Return the elapsed time from ``t1`` to ``t2`` in seconds.

    Both arguments are parsed with ``pd.to_datetime``; they are expected to
    be pandas Series, since the result is read through the ``.dt`` accessor.
    The returned Series holds ``t2 - t1`` as float seconds.
    """
    end = pd.to_datetime(t2)
    start = pd.to_datetime(t1)
    elapsed = end - start
    return elapsed.dt.total_seconds()

In [None]:
# Seconds between pickup and delivery for every order
df["Delivery_Time (sec)"] = get_time(
    t1=df["order_pickedup_time"],
    t2=df["order_delivered_time"],
)

In [None]:
df.head()

In [None]:
df.to_csv('data/2_data_with_distance_time.csv', index=False)

# **Calculating `Delivery_Hour`** (hour of day of `order_pickedup_time`, e.g. Hour = 2 for all orders picked up between 2:00 AM and 2:59 AM)

In [None]:
df = pd.read_csv('data/2_data_with_distance_time.csv')

In [None]:
# Hour of day (0-23) at which each order was picked up.
# Parsed for the whole column at once, so it does not rely on the frame
# having a 0..n-1 index and avoids a Python-level loop over rows.
df["Delivery_Hour"] = pd.to_datetime(df["order_pickedup_time"]).dt.hour

In [None]:
df

In [None]:
df.to_csv('data/3_data_with_distance_time_hour.csv', index=False)

# **Finding `State` of the restaurant**

In [None]:
df = pd.read_csv('data/3_data_with_distance_time_hour.csv')

In [None]:
from geopy.geocoders import Nominatim

# initialize Nominatim API
geolocator = Nominatim(user_agent="GetLoc")

In [None]:
# Function to fetch State of restaurant
def get_state(latitude, longitude):
    """Reverse-geocode a coordinate pair and return the state it lies in.

    The pair is sent to the notebook-level ``geolocator`` as the string
    ``"<latitude>, <longitude>"``. The return value is the ``'state'`` entry
    of the result's raw address, or an empty string when the address has no
    such entry. Any error raised by the lookup propagates to the caller.
    """
    query = ", ".join((str(latitude), str(longitude)))
    result = geolocator.reverse(query)
    return result.raw['address'].get('state', '')

In [None]:
# Create the column only if it is not there yet, so that re-running this cell
# resumes with the rows that are still missing instead of geocoding every row
# again. The column holds strings, hence an object column of None rather than
# float NaN (recent pandas versions deprecate writing strings into a float
# column).
if "State" not in df.columns:
    df["State"] = None

saved = 0
failed_rows = []  # rows whose lookup raised; reported after the loop

# NOTE(review): requests are sent back-to-back with no delay; check the
# geocoding service's rate limits before running this on the full dataset.
for i in range(len(df)):
    if pd.isna(df.loc[i, "State"]):
        try:
            df.loc[i, "State"] = get_state(df.loc[i, 'R_Lat'], df.loc[i, 'R_Lon'])
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel still stops the loop.
            failed_rows.append(i)
            # Best-effort fallback: reuse the previous row's state. Row 0 has
            # no previous row and is left missing.
            if i > 0:
                df.loc[i, "State"] = df.loc[i - 1, "State"]

    # Saving after a while to prevent data loss
    if i % 10 == 0:
        df.to_csv('data/4_data_with_distance_time_hour_state.csv', index=False)
        saved = i

    clear_output()
    print(f"Saved till {saved}")
    print(f"{i}/{len(df)}")

# The per-row progress output is cleared on every iteration, so failures are
# summarised once at the end.
print(f"Lookup failed for {len(failed_rows)} row(s): {failed_rows}")

In [None]:
df

In [None]:
df.to_csv('data/4_data_with_distance_time_hour_state.csv', index=False)

# Dropping unnecessary columns & rearranging the columns

In [None]:
drop = ["R_Lon", "R_Lat", "C_Lon", "C_Lat", "order_pickedup_time", "order_delivered_time"]

In [None]:
# Remove the raw coordinate/timestamp columns listed in `drop`.
# errors="ignore" keeps the cell re-runnable: a second run finds the columns
# already gone and is a no-op instead of raising KeyError.
df = df.drop(columns=drop, errors="ignore")

In [None]:
# Put the remaining columns into their final order
df = df.loc[
    :,
    ["order_id", "Distance (km)", "Delivery_Hour", "State", 'Delivery_Time (sec)'],
]

In [None]:
df

In [None]:
df.to_csv('data/Preprocessed_Data.csv', index=False)