In [28]:
import numpy as np
import pandas as pd

In [30]:
# Render floats in DataFrame/Series output with thousands separators and 4 decimal places.
pd.set_option('display.float_format', '{:,.4f}'.format)

In [11]:
import numpy as np


def _psi(expected: np.ndarray, actual: np.ndarray, bucket_type: str = "bins", n_bins: int = 10) -> float:
    """Calculate PSI metric for two arrays.
    
    Parameters
    ----------
        expected : list-like
            Array of expected values
        actual : list-like
            Array of actual values
        bucket_type : str
            Binning strategy. Accepts two options: 'bins' and 'quantiles'. Defaults to 'bins'.
            'bins': input arrays are splitted into bins with equal
                and fixed steps based on 'expected' array
            'quantiles': input arrays are binned according to 'expected' array
                with given number of n_bins
        n_bins : int
            Number of buckets for binning. Defaults to 10.

    Returns
    -------
        A single float number
    """
    breakpoints = np.arange(0, n_bins + 1) / (n_bins) * 100
    if bucket_type == "bins":
        breakpoints = np.histogram(expected, n_bins)[1]
    elif bucket_type == "quantiles":
        breakpoints = np.percentile(expected, breakpoints)

    # Calculate frequencies
    expected_percents = np.histogram(expected, breakpoints)[0] / len(expected)
    actual_percents = np.histogram(actual, breakpoints)[0] / len(actual)
    # Clip freaquencies to avoid zero division
    expected_percents = np.clip(expected_percents, a_min=0.0001, a_max=None)
    actual_percents = np.clip(actual_percents, a_min=0.0001, a_max=None)
    # Calculate PSI
    psi_value = (expected_percents - actual_percents) * np.log(expected_percents / actual_percents)
    psi_value = sum(psi_value)

    return psi_value


def calculate_psi(
        expected: np.ndarray, actual: np.ndarray, bucket_type: str = "bins", n_bins: int = 10, axis: int = 0
) -> np.ndarray:
    """Apply PSI calculation to two 1-d arrays or, variable by variable, to two 2-d arrays.

    Parameters
    ----------
    expected : list-like
        Array of expected values
    actual : list-like
        Array of actual values
    bucket_type : str
        Binning strategy. Accepts two options: 'bins' and 'quantiles'. Defaults to 'bins'.
            'bins' - input arrays are split into bins with equal
                and fixed steps based on the 'expected' array
            'quantiles' - input arrays are binned according to the 'expected' array
                with given number of n_bins
    n_bins : int
        Number of buckets for binning. Defaults to 10.
    axis : int
        Only used for 2-d input; selects how variables are laid out. Defaults to 0.
            0 - each column is a variable (PSI is computed column by column)
            1 - each row is a variable (PSI is computed row by row)

    Returns
    -------
        np.ndarray
            0-d array holding one PSI value for 1-d input; 1-d array with one
            PSI value per variable for 2-d input.

    Raises
    ------
        ValueError
            If 'expected' is not 1-d or 2-d, or (2-d input only) 'axis' is not
            0 or 1 or 'actual' does not hold the same number of variables.
    """
    # Plain ndarrays so that DataFrame / list input supports the slicing below.
    expected = np.asarray(expected)
    actual = np.asarray(actual)

    if expected.ndim == 1:
        return np.array(_psi(expected, actual, bucket_type, n_bins))
    if expected.ndim != 2:
        raise ValueError(f"expected must be 1-d or 2-d, got {expected.ndim} dimensions")
    if axis not in (0, 1):
        raise ValueError(f"axis must be 0 or 1, got {axis!r}")
    if actual.ndim != 2:
        raise ValueError(f"actual must be 2-d when expected is 2-d, got {actual.ndim} dimensions")

    # Put the variables on the first axis so both layouts share one loop.
    if axis == 0:
        expected, actual = expected.T, actual.T
    if len(expected) != len(actual):
        raise ValueError(
            f"expected has {len(expected)} variables but actual has {len(actual)}"
        )

    # One PSI value per variable; the result is only returned after every variable was processed.
    return np.array(
        [_psi(exp_var, act_var, bucket_type, n_bins) for exp_var, act_var in zip(expected, actual)],
        dtype=float,
    )

In [15]:
# Load the order data from an Excel workbook. The path is relative, so the file
# must be present in the notebook's working directory.
df = pd.read_excel('Enhanced_pizza_sell_data_2024-25.xlsx')

In [32]:
df.describe()

Unnamed: 0,Order Time,Delivery Time,Delivery Duration (min),Toppings Count,Distance (km),Delivery Efficiency (min/km),Topping Density,Estimated Duration (min),Delay (min),Pizza Complexity,Traffic Impact,Order Hour,Restaurant Avg Time
count,1004,1004,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0,1004.0
mean,2025-03-27 00:33:24.980079872,2025-03-27 01:02:54.501992192,29.492,3.3625,4.9456,6.397,0.7147,11.8695,17.6225,9.4681,2.0498,18.6912,29.492
min,2024-01-05 18:30:00,2024-01-05 18:45:00,15.0,1.0,2.0,4.1667,0.2667,4.8,9.0,1.0,1.0,12.0,26.6667
25%,2024-08-31 13:33:45,2024-08-31 14:05:00,25.0,3.0,3.5,5.0,0.6,8.4,15.2,6.0,1.0,18.0,28.8442
50%,2025-03-01 07:30:00,2025-03-01 08:12:30,30.0,3.0,4.5,6.0,0.6667,10.8,17.8,6.0,2.0,19.0,29.9485
75%,2025-11-07 00:48:45,2025-11-07 01:18:45,30.0,4.0,6.0,7.1429,0.8333,14.4,20.4,12.0,3.0,20.0,30.2594
max,2026-07-07 20:00:00,2026-07-07 20:30:00,50.0,5.0,10.0,12.5,1.5,24.0,30.08,20.0,3.0,21.0,30.2865
std,,,7.7531,1.1359,1.9515,1.5626,0.203,4.6835,3.9643,6.2337,0.7757,1.5295,0.8599


In [36]:
df.head()

Unnamed: 0,Order ID,Restaurant Name,Location,Order Time,Delivery Time,Delivery Duration (min),Pizza Size,Pizza Type,Toppings Count,Distance (km),...,Topping Density,Order Month,Payment Category,Estimated Duration (min),Delay (min),Is Delayed,Pizza Complexity,Traffic Impact,Order Hour,Restaurant Avg Time
0,ORD001,Domino's,"New York, NY",2024-01-05 18:30:00,2024-01-05 18:45:00,15,Medium,Veg,3,2.5,...,1.2,January,Online,6.0,9.0,False,6,2,18,30.2594
1,ORD002,Papa John's,"Los Angeles, CA",2024-02-14 20:00:00,2024-02-14 20:25:00,25,Large,Non-Veg,4,5.0,...,0.8,February,Online,12.0,13.0,False,12,3,20,28.1863
2,ORD003,Little Caesars,"Chicago, IL",2024-03-21 12:15:00,2024-03-21 12:35:00,20,Small,Vegan,2,3.0,...,0.6667,March,Online,7.2,12.8,False,2,1,12,28.8442
3,ORD004,Pizza Hut,"Miami, FL",2024-04-10 19:45:00,2024-04-10 20:10:00,25,XL,Cheese Burst,5,4.5,...,1.1111,April,Offline,10.8,14.2,False,20,2,19,29.9485
4,ORD005,Marco's Pizza,"Dallas, TX",2024-05-05 13:00:00,2024-05-05 13:20:00,20,Medium,Non-Veg,3,2.0,...,1.5,May,Online,4.8,15.2,False,6,3,13,30.2865


In [38]:
# Calendar year of each order, taken from the datetime 'Order Time' column;
# used further down to split the data into yearly samples.
df['year'] = df['Order Time'].dt.year

In [40]:
df.head()

Unnamed: 0,Order ID,Restaurant Name,Location,Order Time,Delivery Time,Delivery Duration (min),Pizza Size,Pizza Type,Toppings Count,Distance (km),...,Order Month,Payment Category,Estimated Duration (min),Delay (min),Is Delayed,Pizza Complexity,Traffic Impact,Order Hour,Restaurant Avg Time,year
0,ORD001,Domino's,"New York, NY",2024-01-05 18:30:00,2024-01-05 18:45:00,15,Medium,Veg,3,2.5,...,January,Online,6.0,9.0,False,6,2,18,30.2594,2024
1,ORD002,Papa John's,"Los Angeles, CA",2024-02-14 20:00:00,2024-02-14 20:25:00,25,Large,Non-Veg,4,5.0,...,February,Online,12.0,13.0,False,12,3,20,28.1863,2024
2,ORD003,Little Caesars,"Chicago, IL",2024-03-21 12:15:00,2024-03-21 12:35:00,20,Small,Vegan,2,3.0,...,March,Online,7.2,12.8,False,2,1,12,28.8442,2024
3,ORD004,Pizza Hut,"Miami, FL",2024-04-10 19:45:00,2024-04-10 20:10:00,25,XL,Cheese Burst,5,4.5,...,April,Offline,10.8,14.2,False,20,2,19,29.9485,2024
4,ORD005,Marco's Pizza,"Dallas, TX",2024-05-05 13:00:00,2024-05-05 13:20:00,20,Medium,Non-Veg,3,2.0,...,May,Online,4.8,15.2,False,6,3,13,30.2865,2024


In [71]:
# Baseline ("expected") sample: delivery durations of the orders placed in 2024.
df_exp = df[df['year']==2024]['Delivery Duration (min)']


In [73]:
# Positional slice: skips the first 70 rows of the 2024 sample, which leaves 373 rows
# (the same count that describe() reports for the 2025 sample).
# NOTE(review): PSI compares per-bin proportions, so equal sample sizes are not required,
# and dropping the earliest rows changes the baseline — confirm this trim is intended.
df_exp2 = df_exp.iloc[70:]

In [75]:
df_exp2.describe()

count   373.0000
mean     27.8686
std       8.9398
min      20.0000
25%      20.0000
50%      25.0000
75%      30.0000
max      50.0000
Name: Delivery Duration (min), dtype: float64

In [77]:
# Comparison ("actual") sample: delivery durations of the orders placed in 2025.
df_actual = df[df['year']==2025]['Delivery Duration (min)']

In [79]:
df_actual.describe()

count   373.0000
mean     32.1984
std       7.8198
min      20.0000
25%      30.0000
50%      30.0000
75%      40.0000
max      50.0000
Name: Delivery Duration (min), dtype: float64

In [83]:
# PSI of the 2025 durations against the trimmed 2024 baseline, using 10 equal-width
# bins derived from the baseline.
# NOTE(review): the recorded result (~1.33) is far above the commonly quoted 0.25
# "significant shift" rule of thumb — confirm which threshold this analysis applies.
calculate_psi(df_exp2, df_actual, bucket_type='bins', n_bins = 10, axis=0)

array(1.33136691)

In [85]:
# Sanity check: comparing the full 2024 sample with itself should give a PSI of 0.
calculate_psi(df_exp, df_exp, bucket_type='bins', n_bins = 10, axis=0)

array(0.)