## Step 0: Initiate Libraries




In [1]:
# Import Warnings
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", DeprecationWarning)

# Import Key Libraries
import numpy as np
import pandas as pd
import os

# Import Data Preprocessing Libraries
from dateutil.parser import parse

# ast : Abstract Syntax Trees
from ast import literal_eval

# Import Geospatial Libraries
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from geopy.distance import geodesic
import geopandas as gpd
import folium
from folium import plugins
from folium.plugins import *
import reverse_geocoder as rg 

# Datetime
import datetime
import datetime as dt


# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

## Step 1: Read Data

- Here we will read the raw data file `dirty_data.csv` into our Jupyter notebook.
- The raw Food Delivery data is loaded into a DataFrame called `delivery_data`, and a working copy called `orders_df` is used in the steps that follow.

In [2]:
# Build the path to the raw CSV: two directory levels up from the working
# directory, then down into the 'Melbourne-Delivery' project folder.
# NOTE(review): the cell that saves clean_data.csv uses os.path.join(os.pardir, 'data')
# as its root -- confirm both paths resolve to the same data folder.
file_path = os.path.join(os.pardir, os.pardir, 'Melbourne-Delivery/data/dirty_data.csv')

# Load the raw orders file into a DataFrame
delivery_data = pd.read_csv(file_path)

# Work on a copy so the frame that was read from disk is left untouched
orders_df = delivery_data.copy()

### Step 1.1: Data Discovery (Building Intuition)

- This is a technique we use to get an initial feel for our data tables.
- We read the data using pandas and perform method calls.
- Standardize dataset columns in the correct format.
- Explore Descriptive Statistics on Numerical Columns and more below:

##### `df.info()`

- `DataFrame.info()` is an important and widely used pandas method.
- It prints a concise summary of the DataFrame.
- The summary includes the index type, column names, dtypes, non-null counts, and memory usage, which gives a quick overview of the dataset.
- Use it to check the non-null count and dtype (data type) of every column.
- Validate that each column and its type align with the format of the Business Requirements.

In [3]:
# Print the index range, column dtypes, non-null counts and memory usage of the raw copy
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   order_id                 500 non-null    object 
 1   date                     500 non-null    object 
 2   time                     500 non-null    object 
 3   order_type               500 non-null    object 
 4   branch_code              500 non-null    object 
 5   order_items              500 non-null    object 
 6   order_price              500 non-null    float64
 7   customer_lat             500 non-null    float64
 8   customer_lon             500 non-null    float64
 9   customerHasloyalty?      500 non-null    int64  
 10  distance_to_customer_KM  500 non-null    float64
 11  delivery_fee             500 non-null    float64
dtypes: float64(5), int64(1), object(6)
memory usage: 47.0+ KB


### Step 1.2: Data Preprocessing - Cleaning

- Here we will be cleaning the data by converting the columns to the correct data types.
- We will merge the `date` and `time` columns into one column called `order_date` and convert it to a datetime type.
- We will also rename the `customerHasloyalty?` and `distance_to_customer_KM` columns to `customer_loyalty` and `distance_to_customer_km` respectively.
- We will reverse-geocode the customer coordinates into a `location` column and then drop the `customer_lat` and `customer_lon` columns.
- We will add an additional column called `updated_at`, which will be the date and time the data was updated.

In [4]:
def reverseGeocode(coordinates):
    """Reverse-geocode coordinates by delegating to ``rg.search``.

    Args:
        coordinates: Passed through unchanged to ``rg.search``; the caller in
            this notebook supplies a list of ``(latitude, longitude)`` pairs.

    Returns:
        Whatever ``rg.search`` returns. The caller iterates over it and reads
        the ``'name'``, ``'admin1'`` and ``'admin2'`` keys of each element.
    """
    return rg.search(coordinates)

def cleaning_data_types(orders_df):
    """Return a cleaned, correctly typed copy of the raw orders table.

    The input frame is not modified. The returned frame:

    * keeps only the first run of digits of ``order_id`` (as ``int``),
    * merges ``date`` and ``time`` into a datetime ``order_date`` column placed
      in second position,
    * stores ``order_type`` and upper-cased ``branch_code`` as categories,
    * parses the string form of ``order_items`` into Python objects,
    * rounds ``order_price`` and ``delivery_fee`` to two decimals,
    * adds ``location`` (the ``'name'`` field returned by ``reverseGeocode``
      for each customer coordinate) and drops the raw coordinates,
    * renames ``customerHasloyalty?`` -> ``customer_loyalty`` and
      ``distance_to_customer_KM`` -> ``distance_to_customer_km``,
    * adds ``updated_at`` (current local time truncated to the minute).

    Args:
        orders_df: Raw orders DataFrame with the columns ``order_id``, ``date``,
            ``time``, ``order_type``, ``branch_code``, ``order_items``,
            ``order_price``, ``customer_lat``, ``customer_lon``,
            ``customerHasloyalty?``, ``distance_to_customer_KM`` and
            ``delivery_fee``.

    Returns:
        A new cleaned DataFrame.

    Warns:
        RuntimeWarning: if some ``order_items`` strings cannot be parsed; those
            values are left as raw strings instead of being silently ignored.
    """

    def clean_date(date_str):
        """Normalise one raw date string to ``YYYY-MM-DD``."""
        # dayfirst=True so that DD/MM/YYYY is read correctly.
        # NOTE(review): with dayfirst=True dateutil also reads an ISO-style
        # 'YYYY-MM-DD' string as year-day-month whenever the last field is <= 12
        # -- confirm this matches the formats present in the raw file.
        return parse(date_str.strip(), dayfirst=True).strftime('%Y-%m-%d')

    unparsed_items = []

    def parse_order_items(raw_items):
        """Parse one ``order_items`` cell, keeping it unchanged on failure."""
        if not isinstance(raw_items, str):
            # Already parsed (e.g. the function is re-run on cleaned data).
            return raw_items
        try:
            return literal_eval(raw_items)
        except (ValueError, SyntaxError):
            unparsed_items.append(raw_items)
            return raw_items

    # Work on a copy so the caller's frame stays untouched
    df_clean = orders_df.copy()

    # Raw string for the regex: '\d' in a normal string is an invalid escape.
    # expand=False returns a Series rather than a one-column DataFrame.
    df_clean['order_id'] = (
        df_clean['order_id'].str.extract(r'(\d+)', expand=False).astype(int)
    )

    # Combine the normalised date with the time and place it right after order_id
    order_date = pd.to_datetime(
        df_clean['date'].apply(clean_date) + ' ' + df_clean['time']
    )
    df_clean.insert(1, 'order_date', order_date)

    # Upper-case 'branch_code' first so case-insensitive duplicates collapse
    # into a single category
    df_clean['branch_code'] = df_clean['branch_code'].str.upper()

    df_clean['order_items'] = df_clean['order_items'].apply(parse_order_items)
    if unparsed_items:
        # RuntimeWarning is used because the notebook filters out UserWarning.
        warnings.warn(
            f"{len(unparsed_items)} 'order_items' value(s) could not be parsed "
            "and were left as raw strings",
            RuntimeWarning,
        )

    df_clean = df_clean.astype({
        'order_type': 'category',
        'branch_code': 'category',
        'order_price': float,
        'customer_lat': float,
        'customer_lon': float,
        'customerHasloyalty?': bool,
        'distance_to_customer_KM': float,
        'delivery_fee': float,
    })

    # Monetary columns are kept at two decimal places
    money_columns = ['order_price', 'delivery_fee']
    df_clean[money_columns] = df_clean[money_columns].round(2)

    # Reverse-geocode the (lat, lon) pairs and keep only the place name
    coordinates = list(zip(df_clean['customer_lat'], df_clean['customer_lon']))
    geocoded = reverseGeocode(coordinates) if coordinates else []
    df_clean['location'] = [place['name'] for place in geocoded]

    # Drop the columns that have been merged or geocoded, then apply the
    # final column names
    df_clean = (
        df_clean
        .drop(columns=['date', 'time', 'customer_lat', 'customer_lon'])
        .rename(columns={
            'customerHasloyalty?': 'customer_loyalty',
            'distance_to_customer_KM': 'distance_to_customer_km',
        })
    )

    # Record when the cleaning ran, truncated to the minute
    df_clean['updated_at'] = datetime.datetime.today().replace(second=0, microsecond=0)

    return df_clean

# Clean the raw copy, then print the resulting column dtypes and non-null counts
df_clean = cleaning_data_types(orders_df)
df_clean.info()


Loading formatted geocoded file...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   order_id                 500 non-null    int64         
 1   order_date               500 non-null    datetime64[ns]
 2   order_type               500 non-null    category      
 3   branch_code              500 non-null    category      
 4   order_items              500 non-null    object        
 5   order_price              500 non-null    float64       
 6   customer_loyalty         500 non-null    bool          
 7   distance_to_customer_km  500 non-null    float64       
 8   delivery_fee             500 non-null    float64       
 9   location                 500 non-null    object        
 10  updated_at               500 non-null    datetime64[ns]
dtypes: bool(1), category(2), datetime64[ns](2), float64(3), int64(

In [5]:
# Preview the first five cleaned orders
df_clean.head(5)

Unnamed: 0,order_id,order_date,order_type,branch_code,order_items,order_price,customer_loyalty,distance_to_customer_km,delivery_fee,location,updated_at
0,1406,2018-07-08 15:16:03,Lunch,NS,"[(Fries, 6), (Salad, 4)]",140.8,True,8.335,13.7,Docklands,2023-07-06 02:03:00
1,10125,2018-12-01 08:20:16,Breakfast,NS,"[(Cereal, 8), (Pancake, 6)]",313.5,True,7.536,6.17,Melbourne,2023-07-06 02:03:00
2,4175,2018-06-07 14:05:04,Lunch,NS,"[(Steak, 3), (Salad, 1), (Chicken, 6), (Fries,...",714.0,False,9.86,15.09,Richmond,2023-07-06 02:03:00
3,3691,2018-04-26 11:43:05,Dinner,NS,"[(Pancake, 9), (Eggs, 10), (Cereal, 2)]",480.25,False,8.614,13.68,Southbank,2023-07-06 02:03:00
4,4094,2018-04-10 11:12:40,Breakfast,NS,"[(Eggs, 5), (Coffee, 3), (Pancake, 9), (Cereal...",497.75,False,8.802,13.76,Southbank,2023-07-06 02:03:00


In [6]:
# Output folder: 'data' inside the parent of the working directory
directory = os.path.join(os.pardir, 'data')

# Create the folder if needed; exist_ok=True avoids the separate
# check-then-create step and does nothing when the folder already exists
os.makedirs(directory, exist_ok=True)

# Save the cleaned orders without the DataFrame index
df_clean.to_csv(os.path.join(directory, 'clean_data.csv'), index=False)

### Step 1.3: Feature Engineering

- Here we will be creating new columns from existing columns.

#### Step 1.4: Feature Engineering ( `order_items` )

- Here we will split the `order_items` column so that every ordered item gets its own row, with the item and its quantity in separate columns.
- `cuisine`, which will be the food item that was ordered (e.g. Fries, Salad).
- `quantity_ordered`, which will be the quantity of that item in the order.
- `average_item_price`, which will be a per-item price derived from `order_price` and the quantities ordered, rounded to two decimal places.

#### Step 1.5: Feature Engineering ( `order_date` )

- Here we will be creating new columns based on the existing `order_date` column in the dataset.
- `order_time_of_day`, which would be the part of the day the order was made (Morning, Afternoon, Evening, Night). This column is planned but is not created by the function below.
- `day_of_week`, which will be the abbreviated day of the week the order was made (Mon, Tue, Wed, Thu, Fri, Sat, Sun).
- `order_month`, which will be the abbreviated month of the year the order was made (Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oct, Nov, Dec).
- `order_season`, which will be the season the order was made (Summer, Autumn, Winter, Spring).


In [None]:
def feature_en(df_clean):
    """Expand cleaned orders to one row per ordered item and add features.

    The input frame is not modified. Each element of ``order_items`` is
    expected to be an ``(item, quantity)`` pair. The original row label is
    repeated for every item of an order, so rows that share an index label
    belong to the same order.

    Added columns:

    * ``cuisine`` / ``quantity_ordered`` -- the item and its quantity.
    * ``average_item_price`` -- ``order_price`` divided by the total number of
      units in the whole order, rounded to two decimals. ``order_price`` is
      the price of the entire order, so dividing it by the quantity of a
      single line item would not give a per-item price.
    * ``order_month`` / ``day_of_week`` -- abbreviated month and weekday names.
    * ``order_season`` -- season of the order month. The orders are located
      in Melbourne, so southern-hemisphere seasons are used (Dec-Feb is
      Summer).

    Args:
        df_clean: Cleaned orders DataFrame as produced by
            ``cleaning_data_types``.

    Returns:
        A new DataFrame with one row per ordered item and the columns
        arranged in a fixed order; ``order_items`` is dropped.
    """
    # Southern-hemisphere seasons keyed by calendar month
    season_by_month = {
        12: 'Summer', 1: 'Summer', 2: 'Summer',
        3: 'Autumn', 4: 'Autumn', 5: 'Autumn',
        6: 'Winter', 7: 'Winter', 8: 'Winter',
        9: 'Spring', 10: 'Spring', 11: 'Spring',
    }

    # Create a copy of the DataFrame to avoid modifying the original one
    df = df_clean.copy()

    # Total units per order, computed before exploding so that it is repeated
    # on every line item of the same order
    df['_units_in_order'] = [
        sum(quantity for _, quantity in items) for items in df['order_items']
    ]

    # One row per (item, quantity) pair
    df_exploded = df.explode('order_items')

    # Split each pair into two new columns
    line_items = df_exploded['order_items'].tolist()
    df_exploded['cuisine'] = [item[0] for item in line_items]
    df_exploded['quantity_ordered'] = [item[1] for item in line_items]

    # Order total spread over all units in the order, two decimal places
    df_exploded['average_item_price'] = (
        df_exploded['order_price'] / df_exploded['_units_in_order']
    ).round(2)

    # Abbreviated month (Jan, Feb, ...) and weekday (Mon, Tue, ...) names
    df_exploded['order_month'] = df_exploded['order_date'].dt.strftime('%b')
    df_exploded['day_of_week'] = df_exploded['order_date'].dt.strftime('%a')

    # Season in which the order was made
    df_exploded['order_season'] = df_exploded['order_date'].dt.month.map(season_by_month)

    # Reposition the columns; 'order_items' and the helper column are left out
    df_exploded = df_exploded[['order_id', 'order_date' , 'order_price', 'quantity_ordered' , 'average_item_price', 'order_month', 'day_of_week', 'order_season' , 'order_type', 'branch_code' , 'delivery_fee', 'location', 'cuisine' , 'customer_loyalty', 'distance_to_customer_km','updated_at']]

    return df_exploded

# Build the line-item level frame (one row per ordered item) and preview its first rows
df_exploded = feature_en(df_clean)
df_exploded.head()

In [None]:
# Print the column dtypes and non-null counts of the line-item level frame
df_exploded.info()

### Step 2: Descriptive Statistics 

- Descriptive statistics include those that summarize the central tendency, dispersion and shape of a dataset's distribution, excluding NaN values. 

In [None]:
# Descriptive Statistics
# Summary statistics for every column; include='all' also covers the non-numeric columns
df_exploded.describe(include='all')

In [None]:
# Headline figures based on the information from the describe() method.
#
# df_exploded holds one row per ordered item, so an order with many items
# appears many times. Order-level figures (average price, most popular order
# type, branch, day, ...) are therefore computed on one row per order;
# otherwise orders with more items would be over-weighted. explode() repeats
# the original row label for every item of an order, so keeping the first
# occurrence of each label recovers exactly one row per order.
orders_level = df_exploded[~df_exploded.index.duplicated(keep='first')]

# Orders Date Range
print('Orders Date Range ---->: ', df_exploded['order_date'].min(), 'to', df_exploded['order_date'].max())

# Order Price Range
print('Order Price Range ---->: ', df_exploded['order_price'].min(), 'to', df_exploded['order_price'].max())

# Average Order Price in 2 decimal places (one value per order)
print('Average Order Price ---->: ', round(orders_level['order_price'].mean(), 2))

# Popular Order Type
print('Popular Order Type ---->: ', orders_level['order_type'].mode()[0])

# Popular Cuisine (item level: every ordered item counts once)
print('Popular Cuisine ---->: ', df_exploded['cuisine'].mode()[0])

# Popular Branch
print('Popular Branch ---->: ', orders_level['branch_code'].mode()[0])

# Popular Day of the Week
print('Popular Day of the Week ---->: ', orders_level['day_of_week'].mode()[0])

# Popular Season
print('Popular Season ---->: ', orders_level['order_season'].mode()[0])

# Popular Month
print('Popular Month ---->: ', orders_level['order_month'].mode()[0])

# Popular Hour
print('Popular Hour ---->: ', orders_level['order_date'].dt.hour.mode()[0])

# Delivery Fee Range
print('Delivery Fee Range ---->: ', df_exploded['delivery_fee'].min(), 'to', df_exploded['delivery_fee'].max())

# Average Distance to Customer in 2 decimal places (one value per order)
print('Average Disance to Customer ---->: ', round(orders_level['distance_to_customer_km'].mean(), 2))

# Popular Location
print('Popular Location ---->: ', orders_level['location'].mode()[0])

# Types of cuisine
print('Types of cuisine ---->: ', df_exploded['cuisine'].unique())

###