In [1]:
# Imports
import pandas as pd
import numpy as np

from IPython.display import clear_output

In [None]:
# Read CSV
df = pd.read_csv('data/0_data.csv')

In [None]:
# For testing purpose only
# df = df[:100]

In [None]:
df

In [None]:
df.info()

In [None]:
# Checking NULL values
df.isnull().sum()

# **Calculating `Distance`**

In [None]:
# Setting Radius of Earth.
# dist() multiplies the haversine central angle by this value, so its unit is
# the unit of the resulting distances; the result is stored in the
# "Distance (km)" column, i.e. this is taken to be kilometres.
radius_earth = 6371


# Function to convert Degree to Radian
def rad(degree):
    """Convert an angle given in degrees to radians.

    `degree` may be a plain number or any object that supports `*` and `/`
    element-wise (dist() passes differences of DataFrame columns).
    The result has the same kind/shape as the input.
    """
    scaled = degree * np.pi
    return scaled / 180

In [None]:
# Function to calculate distance using Haversine Formula
def dist(lat1, lon1, lat2, lon2, radius=None):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point, in degrees.
    lat2, lon2 : latitude / longitude of the second point, in degrees.
        All four may be scalars, NumPy arrays or pandas Series; the
        computation is element-wise.
    radius : sphere radius; the result is expressed in the same unit.
        Defaults to the notebook-level `radius_earth`.

    Returns
    -------
    The distance(s), same shape as the inputs. Missing coordinates (NaN)
    produce NaN.
    """
    if radius is None:
        radius = radius_earth

    d_lat = np.radians(lat2 - lat1)
    d_lon = np.radians(lon2 - lon1)
    a = (
        np.sin(d_lat / 2) ** 2
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(d_lon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c

In [None]:
df["Distance (km)"] = dist(df["R_Lat"], df["R_Lon"], df["C_Lat"], df["C_Lon"])

In [None]:
df.head()

In [None]:
df.to_csv('data/1_data_with_distance.csv', index=False)

# **Calculating `Delivery_Time`**

In [None]:
df = pd.read_csv('data/1_data_with_distance.csv')

In [None]:
# Function to calculate delivery time
def get_time(t1, t2):
    """Return the elapsed time from `t1` to `t2` in seconds.

    Both arguments are parsed with `pd.to_datetime`; the `.dt` accessor is
    used on the difference, so they are expected to be pandas Series
    (e.g. DataFrame columns) of timestamps or timestamp strings.
    The result is a Series of seconds (`t2 - t1`).
    """
    end = pd.to_datetime(t2)
    start = pd.to_datetime(t1)
    elapsed = end - start
    return elapsed.dt.total_seconds()

In [None]:
df["Delivery_Time (sec)"] = get_time(df["order_pickedup_time"], df["order_delivered_time"])

In [None]:
df.head()

In [None]:
df.to_csv('data/2_data_with_distance_time.csv', index=False)

# **Calculating `Delivery_Hour` (hour of day at pickup), e.g. Hour = 2 for all orders picked up between 2:00 AM & 2:59 AM**

In [None]:
df = pd.read_csv('data/2_data_with_distance_time.csv')

In [None]:
df["Delivery_Hour"] = pd.to_datetime(df["order_pickedup_time"]).dt.hour

In [None]:
df

In [None]:
df.to_csv('data/3_data_with_distance_time_hour.csv', index=False)

# **Finding `State` of the restaurant**

In [None]:
df = pd.read_csv('data/3_data_with_distance_time_hour.csv')

In [None]:
# Resume from a previous (partial) geocoding run if its checkpoint file exists;
# otherwise keep working on the frame loaded in the previous cell.
try:
    df = pd.read_csv('data/4_data_with_distance_time_hour_state.csv')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable checkpoint yet -> start from scratch.
    pass

if "State" not in df.columns:
    df["State"] = np.nan

# An all-NaN column is float64; state names are strings, so use object dtype
# to avoid incompatible-dtype assignments later on.
df["State"] = df["State"].astype("object")

# `saved` is the row index the geocoding loop starts from: the last row before
# the first one still missing a State (never negative, so `df.loc[saved]` is
# always valid). Rows that already have a State are skipped by the loop.
_missing = df["State"].isna().to_numpy()
_first_missing = int(_missing.argmax()) if _missing.any() else len(df)
saved = max(_first_missing - 1, 0)

In [None]:
from geopy.geocoders import Nominatim

# initialize Nominatim API
geolocator = Nominatim(user_agent="GetLoc")

In [None]:
# Function to fetch State of restaurant
def get_state(latitude, longitude):
    """Reverse-geocode a coordinate pair and return the state it lies in.

    The coordinates are formatted as the string "<latitude>, <longitude>" and
    passed to the notebook-level `geolocator.reverse`. The 'state' entry of
    the returned address is looked up; '' is returned when the address has
    no such entry.

    Any exception raised by the geocoder propagates to the caller.
    NOTE(review): if `reverse` yields no result (None), the attribute access
    below raises as well - confirm against the geopy documentation.
    """
    query = ", ".join((str(latitude), str(longitude)))
    place = geolocator.reverse(query)
    return place.raw['address'].get('state', '')

In [None]:
# The State column may still be an all-NaN float column; make it object dtype
# so that state names (strings) can be stored in it.
df["State"] = df["State"].astype("object")

for i in range(max(saved, 0), len(df)):
    # Only query rows that have no state yet, so an interrupted run can resume.
    if pd.isna(df.loc[i, "State"]):
        try:
            state = get_state(df.loc[i, 'R_Lat'], df.loc[i, 'R_Lon'])
        except Exception:
            # Lookup failed for this row: fall back to the previous row's state.
            # The first row has no predecessor, so it stays empty.
            # Catching `Exception` (not a bare except) keeps KeyboardInterrupt
            # working, so the long-running loop can still be stopped.
            state = df.loc[i - 1, "State"] if i > 0 else np.nan
        df.loc[i, "State"] = state

    # Saving after a while to prevent data loss
    if i % 100 == 0:
        df.to_csv('data/4_data_with_distance_time_hour_state.csv', index=False)
        saved = i

    clear_output()
    print(f"Saved till {saved}")
    print(f"{i + 1}/{len(df)}")

In [None]:
df

In [None]:
df.to_csv('data/4_data_with_distance_time_hour_state.csv', index=False)

# Rearranging the columns

In [2]:
df = pd.read_csv('data/4_data_with_distance_time_hour_state.csv')

  df = pd.read_csv('data/4_data_with_distance_time_hour_state.csv')


In [3]:
df.columns

Index(['order_id', 'R_Lon', 'R_Lat', 'C_Lon', 'C_Lat', 'order_pickedup_time',
       'order_delivered_time', 'Distance (km)', 'Delivery_Time (sec)',
       'Delivery_Hour', 'State'],
      dtype='object')

In [4]:
# Rearranging the columns: latitude is placed before longitude for both the
# restaurant (R_*) and customer (C_*) coordinates, and 'Delivery_Time (sec)'
# is moved to the last position. No columns are added or dropped.
# NOTE(review): presumably 'Delivery_Time (sec)' goes last because it is the
# target variable for later modelling - confirm.
df = df[["order_id", "R_Lat", "R_Lon", "C_Lat", "C_Lon", "order_pickedup_time", "order_delivered_time", "Distance (km)",
         "Delivery_Hour", "State", 'Delivery_Time (sec)']]

In [5]:
df

Unnamed: 0,order_id,R_Lat,R_Lon,C_Lat,C_Lon,order_pickedup_time,order_delivered_time,Distance (km),Delivery_Hour,State,Delivery_Time (sec)
0,PRD_ZMT_2960011,28.468983,77.063728,28.449929,77.071230,2016-04-23 09:30:20,2016-04-23 09:57:04,2.242045,9,Haryana,1604.0
1,PRD_ZMT_2964555,28.468983,77.063728,28.454522,77.066637,2016-04-23 11:05:32,2016-04-23 11:27:10,1.632941,11,Haryana,1298.0
2,PRD_ZMT_2968945,28.468983,77.063728,28.461570,77.067590,2016-04-23 14:49:07,2016-04-23 15:07:18,0.906626,14,Haryana,1091.0
3,PRD_ZMT_2976663,28.468983,77.063728,28.455738,77.058401,2016-04-23 16:43:14,2016-04-23 17:16:12,1.562127,16,Haryana,1978.0
4,PRD_ZMT_2985948,28.468983,77.063728,28.440068,77.083967,2016-04-24 07:54:14,2016-04-24 08:16:29,3.775236,7,Haryana,1335.0
...,...,...,...,...,...,...,...,...,...,...,...
237368,PRD_ZMT_8457746,28.532886,77.207941,28.503037,77.201892,2016-09-25 16:49:02,2016-09-25 17:18:55,3.371266,16,,1793.0
237369,PRD_ZMT_8458181,22.544237,88.352148,22.560074,88.369037,2016-09-25 17:07:48,2016-09-25 17:22:04,2.471663,17,,856.0
237370,PRD_ZMT_8458897,19.122455,72.916411,19.104956,72.918557,2016-09-25 17:22:39,2016-09-25 17:43:55,1.958820,17,,1276.0
237371,PRD_ZMT_8458955,17.450286,78.379158,17.444102,78.391155,2016-09-25 17:07:15,2016-09-25 17:26:16,1.446523,17,,1141.0


In [6]:
df.to_csv('data/Preprocessed_Data.csv', index=False)