
#  Optimizing Delivery Partner Allocation in Last Mile Logistics
## Data Preprocessing 



In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('merged_delivery_data.csv')
df.head(10)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds
0,2031782,10,Chongqing,73,108.71571,30.90228,50,14,10-22 10:26:00,10-22 10:26:00,108.71826,30.95587,10-22 17:04:00,10-22 17:04:00,108.66361,30.96702,1022
1,4285071,10,Chongqing,3605,108.71639,30.90269,50,14,09-07 10:13:00,09-07 10:13:00,108.71791,30.95635,09-09 15:44:00,09-09 15:44:00,108.71644,30.90266,907
2,4056800,10,Chongqing,3605,108.71645,30.90259,50,14,06-26 09:49:00,06-26 09:49:00,108.71798,30.95635,06-27 16:03:00,06-27 16:03:00,108.71647,30.90251,626
3,3589481,10,Chongqing,3605,108.7165,30.90347,50,14,09-11 11:01:00,09-11 11:01:00,108.71823,30.95596,09-13 17:14:00,09-13 17:14:00,108.7165,30.90341,911
4,2752329,10,Chongqing,3605,108.71608,30.90409,50,14,10-01 09:52:00,10-01 09:52:00,108.7182,30.95598,10-01 18:30:00,10-01 18:30:00,108.71413,30.90397,1001
5,659996,10,Chongqing,3605,108.71644,30.9047,50,14,08-08 19:01:00,08-08 19:01:00,108.71796,30.9563,08-11 10:50:00,08-11 10:50:00,108.71632,30.90479,808
6,4481765,10,Chongqing,3605,108.71605,30.9041,50,14,09-30 10:00:00,09-30 10:00:00,108.71824,30.95583,09-30 16:38:00,09-30 16:38:00,108.71429,30.90416,930
7,2365752,10,Chongqing,3605,108.71633,30.90266,50,14,09-30 10:00:00,09-30 10:00:00,108.71826,30.95585,09-30 18:38:00,09-30 18:38:00,108.71425,30.90416,930
8,20671,10,Chongqing,3605,108.71643,30.90253,50,14,05-20 10:06:00,05-20 10:06:00,108.71795,30.95621,05-21 15:30:00,05-21 15:30:00,108.71643,30.9025,520
9,965648,10,Chongqing,3605,108.71554,30.90256,50,14,08-10 10:52:00,08-10 10:52:00,108.71797,30.9563,08-12 15:50:00,08-12 15:50:00,108.71542,30.90243,810


#### Understanding the dataset - Statistical overview

In [3]:
df.columns

Index(['order_id', 'region_id', 'city', 'courier_id', 'lng', 'lat', 'aoi_id',
       'aoi_type', 'accept_time', 'accept_gps_time', 'accept_gps_lng',
       'accept_gps_lat', 'delivery_time', 'delivery_gps_time',
       'delivery_gps_lng', 'delivery_gps_lat', 'ds'],
      dtype='object')

In [4]:
df.shape

(4514661, 17)

In [5]:
df['city'].unique()

array(['Chongqing', 'Jilin', 'Hangzhou', 'Shanghai', 'Yantai'],
      dtype=object)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4514661 entries, 0 to 4514660
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   order_id           int64  
 1   region_id          int64  
 2   city               object 
 3   courier_id         int64  
 4   lng                float64
 5   lat                float64
 6   aoi_id             int64  
 7   aoi_type           int64  
 8   accept_time        object 
 9   accept_gps_time    object 
 10  accept_gps_lng     float64
 11  accept_gps_lat     float64
 12  delivery_time      object 
 13  delivery_gps_time  object 
 14  delivery_gps_lng   float64
 15  delivery_gps_lat   float64
 16  ds                 int64  
dtypes: float64(6), int64(6), object(5)
memory usage: 585.6+ MB


In [7]:
df[['accept_time', 'delivery_time']].sample(5)

Unnamed: 0,accept_time,delivery_time
638666,09-13 14:11:00,09-13 21:28:00
699871,09-14 10:08:00,09-14 20:24:00
2037113,05-27 08:50:00,05-27 09:47:00
2764948,06-11 08:00:00,06-11 12:55:00
4022036,06-12 17:10:00,06-12 17:44:00


In [8]:
df.describe()

Unnamed: 0,order_id,region_id,courier_id,lng,lat,aoi_id,aoi_type,accept_gps_lng,accept_gps_lat,delivery_gps_lng,delivery_gps_lat,ds
count,4514661.0,4514661.0,4514661.0,4514661.0,4514661.0,4514661.0,4514661.0,4511284.0,4511284.0,4514661.0,4514661.0,4514661.0
mean,2257330.0,73.81683,2385.268,117.8683,30.84852,29980.49,4.355784,117.8629,30.83946,117.8634,30.84735,811.2479
std,1303271.0,45.38614,1390.526,5.822634,1.928226,17290.42,5.527705,5.853219,1.916018,5.867664,1.937424,165.3643
min,0.0,0.0,0.0,102.0849,23.11348,0.0,0.0,-9e-05,-9e-05,-9e-05,-9e-05,501.0
25%,1128665.0,36.0,1206.0,119.9666,30.12185,14874.0,1.0,119.9621,30.10518,119.9666,30.1217,703.0
50%,2257330.0,71.0,2345.0,120.2321,30.30657,30055.0,1.0,120.2476,30.31297,120.2317,30.30687,820.0
75%,3385995.0,111.0,3588.0,121.4099,31.18919,44464.0,8.0,121.4002,31.18778,121.4098,31.18906,929.0
max,4514660.0,167.0,4876.0,126.8191,44.22335,60149.0,15.0,126.6288,43.94521,139.7561,45.76194,1031.0


In [9]:
#checking counts
df.count().to_frame(name='Count_Rows')

Unnamed: 0,Count_Rows
order_id,4514661
region_id,4514661
city,4514661
courier_id,4514661
lng,4514661
lat,4514661
aoi_id,4514661
aoi_type,4514661
accept_time,4514661
accept_gps_time,4514661


---

## Data Cleaning
##### Handling Missing Values
##### Checkig Duplicates
##### Verifing Data Types

In [10]:
#checking missing values
df.isnull().sum()

order_id                0
region_id               0
city                    0
courier_id              0
lng                     0
lat                     0
aoi_id                  0
aoi_type                0
accept_time             0
accept_gps_time         0
accept_gps_lng       3377
accept_gps_lat       3377
delivery_time           0
delivery_gps_time       0
delivery_gps_lng        0
delivery_gps_lat        0
ds                      0
dtype: int64

In [11]:
# Checking for Missing Values in Essential Columns from gitbook
df[['accept_time', 'delivery_time', 'lng', 'lat', 'courier_id']].isnull().sum().to_frame(name='Essential Missing Values')

Unnamed: 0,Essential Missing Values
accept_time,0
delivery_time,0
lng,0
lat,0
courier_id,0


In [12]:
# Impute missing 'accept_gps_lng' and 'accept_gps_lat' values with the mean of their corresponding city groups
df['accept_gps_lng'] = df.groupby('city')['accept_gps_lng'].transform(lambda x: x.fillna(x.mean()))
df['accept_gps_lat'] = df.groupby('city')['accept_gps_lat'].transform(lambda x: x.fillna(x.mean()))

In [13]:
#verifying missing
df.isnull().sum()


order_id             0
region_id            0
city                 0
courier_id           0
lng                  0
lat                  0
aoi_id               0
aoi_type             0
accept_time          0
accept_gps_time      0
accept_gps_lng       0
accept_gps_lat       0
delivery_time        0
delivery_gps_time    0
delivery_gps_lng     0
delivery_gps_lat     0
ds                   0
dtype: int64

In [14]:
#checking dup
df[df.duplicated()]

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds


In [15]:
df.dtypes

order_id               int64
region_id              int64
city                  object
courier_id             int64
lng                  float64
lat                  float64
aoi_id                 int64
aoi_type               int64
accept_time           object
accept_gps_time       object
accept_gps_lng       float64
accept_gps_lat       float64
delivery_time         object
delivery_gps_time     object
delivery_gps_lng     float64
delivery_gps_lat     float64
ds                     int64
dtype: object

In [16]:
# converting dtypes
df['accept_time'] = pd.to_datetime('2024-' + df['accept_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df['accept_gps_time'] = pd.to_datetime('2024-' + df['accept_gps_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df['delivery_time'] = pd.to_datetime('2024-' + df['delivery_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
df['delivery_gps_time'] = pd.to_datetime('2024-' + df['delivery_gps_time'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

df['lat'] = pd.to_numeric(df['lat'], errors='coerce')
df['lng'] = pd.to_numeric(df['lng'], errors='coerce')

df['aoi_type'] = df['aoi_type'].astype('category')

In [17]:
df.dtypes

order_id                      int64
region_id                     int64
city                         object
courier_id                    int64
lng                         float64
lat                         float64
aoi_id                        int64
aoi_type                   category
accept_time          datetime64[ns]
accept_gps_time      datetime64[ns]
accept_gps_lng              float64
accept_gps_lat              float64
delivery_time        datetime64[ns]
delivery_gps_time    datetime64[ns]
delivery_gps_lng            float64
delivery_gps_lat            float64
ds                            int64
dtype: object

In [18]:
df.sample(5)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds
2169276,4227068,91,Hangzhou,386,120.43724,30.14306,2363,14,2024-09-14 09:10:00,2024-09-14 09:10:00,120.416,30.19816,2024-09-14 17:40:00,2024-09-14 17:40:00,120.43942,30.17502,914
1026309,4325538,3,Hangzhou,3384,120.10678,30.26886,6932,1,2024-07-18 08:06:00,2024-07-18 08:06:00,120.0971,30.27131,2024-07-18 08:34:00,2024-07-18 08:34:00,120.10727,30.26827,718
4300975,377728,92,Shanghai,2475,121.29031,31.02999,58956,1,2024-08-16 19:16:00,2024-08-16 19:16:00,121.27445,31.02228,2024-08-16 19:31:00,2024-08-16 19:31:00,121.29029,31.02991,816
3051695,227218,9,Shanghai,2551,121.44735,30.90267,35201,1,2024-09-29 09:09:00,2024-09-29 09:09:00,121.47519,30.8902,2024-09-29 10:20:00,2024-09-29 10:20:00,121.4474,30.90262,929
715170,1561824,153,Chongqing,660,106.52992,29.65227,13817,1,2024-08-06 14:10:00,2024-08-06 14:10:00,106.56431,29.6727,2024-08-06 17:30:00,2024-08-06 17:30:00,106.53045,29.65389,806


---

### Adding & Validating NEW columns: `delivery_duration` And `distance` columns

In [19]:
#adding duration column
df['delivery_duration'] = df['delivery_time'] - df['accept_time']

In [20]:
df['delivery_duration'].describe()

count                      4514661
mean     0 days 03:05:49.485815214
std      0 days 13:00:37.288630943
min            -294 days +08:23:00
25%                0 days 00:56:00
50%                0 days 01:45:00
75%                0 days 03:13:00
max              115 days 14:59:00
Name: delivery_duration, dtype: object

<span style="color:red"> ***The minimum `delivery_duration` time shows a negiative number....***</span>


In [21]:
#identifing rows with negative durations
negative_durations = df[df['delivery_duration'] < pd.Timedelta(0)]
negative_durations

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration
440298,3739029,131,Chongqing,2954,106.19835,29.49296,28470,14,2024-10-31 10:57:00,2024-10-31 10:57:00,106.21016,29.55476,2024-01-11 19:20:00,2024-01-11 19:20:00,106.19901,29.49442,1031,-294 days +08:23:00
518410,3212380,135,Chongqing,486,107.80753,30.66992,55255,14,2024-10-31 12:14:00,2024-10-31 12:14:00,107.7491,30.66363,2024-01-13 14:16:00,2024-01-13 14:16:00,107.75239,30.67711,1031,-292 days +02:02:00
522559,3956975,136,Chongqing,3893,106.31756,29.52306,57605,14,2024-10-22 17:54:00,2024-10-22 17:54:00,106.3663,29.48668,2024-01-19 12:56:00,2024-01-19 12:56:00,106.31752,29.52306,1022,-278 days +19:02:00


In [22]:
#removing rows with negative durations
df = df[df['delivery_duration'] >= pd.Timedelta(0)]

# Verifing again
df['delivery_duration'].describe()

count                      4514658
mean     0 days 03:06:06.004680753
std      0 days 11:43:43.454804323
min                0 days 00:00:00
25%                0 days 00:56:00
50%                0 days 01:45:00
75%                0 days 03:13:00
max              115 days 14:59:00
Name: delivery_duration, dtype: object

In [23]:
#doing the same thing with \\ delivery_gps_time and accept_gps_time
#adding duration_GPS_time column
df['duration_gps_time'] = df['delivery_gps_time'] - df['accept_gps_time']

negative_durations = df[df['duration_gps_time'] < pd.Timedelta(0)]
negative_durations

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration,duration_gps_time


In [24]:
df = df.drop('duration_gps_time', axis=1)
df.sample(5)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration
2613022,2666255,117,Hangzhou,199,120.08184,30.152,60149,1,2024-10-08 08:55:00,2024-10-08 08:55:00,120.06878,30.17441,2024-10-08 10:50:00,2024-10-08 10:50:00,120.08222,30.15215,1008,0 days 01:55:00
3727683,3146610,54,Shanghai,2738,121.19904,31.11185,56393,1,2024-09-05 08:48:00,2024-09-05 08:48:00,121.19729,31.12637,2024-09-05 11:50:00,2024-09-05 11:50:00,121.19644,31.11275,905,0 days 03:02:00
2984923,857661,8,Shanghai,3195,121.52113,31.1399,4924,1,2024-10-29 16:53:00,2024-10-29 16:53:00,121.49363,31.13878,2024-10-29 19:08:00,2024-10-29 19:08:00,121.51381,31.13807,1029,0 days 02:15:00
1013639,989677,3,Hangzhou,742,120.09972,30.26935,1005,0,2024-07-29 15:57:00,2024-07-29 15:57:00,120.0971,30.27148,2024-07-29 16:22:00,2024-07-29 16:22:00,120.10005,30.26932,729,0 days 00:25:00
3291575,2069930,29,Shanghai,3123,121.41281,31.2375,41059,1,2024-09-02 10:31:00,2024-09-02 10:31:00,121.40009,31.25102,2024-09-02 11:07:00,2024-09-02 11:07:00,121.4129,31.23757,902,0 days 00:36:00


In [25]:
#adding distance column between acceptance and pickup
df['distance'] = np.sqrt((df['delivery_gps_lng'] - df['accept_gps_lng'])**2 +
                         (df['delivery_gps_lat'] - df['accept_gps_lat'])**2)

# Display the DataFrame to verify the new 'distance' column
df.sample(5)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration,distance
4036385,1106879,77,Shanghai,3993,121.73874,31.21869,7669,14,2024-09-20 11:32:00,2024-09-20 11:32:00,121.69347,31.21175,2024-09-20 12:57:00,2024-09-20 12:57:00,121.74208,31.22236,920,0 days 01:25:00,0.049754
2617389,1451544,119,Hangzhou,1164,119.77554,29.85885,9783,14,2024-09-06 09:14:00,2024-09-06 09:14:00,119.72481,29.82101,2024-09-06 10:33:00,2024-09-06 10:33:00,119.77507,29.85869,906,0 days 01:19:00,0.062816
446314,4209854,132,Chongqing,4142,106.5466,29.45209,877,1,2024-05-24 13:52:00,2024-05-24 13:52:00,106.5361,29.4584,2024-05-24 14:13:00,2024-05-24 14:13:00,106.54737,29.45158,524,0 days 00:21:00,0.013173
1814874,833643,60,Hangzhou,2347,120.32949,30.19674,19958,1,2024-09-12 09:33:00,2024-09-12 09:33:00,120.29608,30.21232,2024-09-12 11:04:00,2024-09-12 11:04:00,120.32952,30.1969,912,0 days 01:31:00,0.036824
1857014,1634522,62,Hangzhou,448,120.09302,30.44469,4381,1,2024-05-26 07:45:00,2024-05-26 07:45:00,120.09253,30.39206,2024-05-26 10:28:00,2024-05-26 10:28:00,120.09296,30.44471,526,0 days 02:43:00,0.052652


In [26]:
df['distance']

0          0.055776
1          0.053710
2          0.053861
3          0.052578
4          0.052169
             ...   
4514656    0.019673
4514657    0.019668
4514658    0.022433
4514659    0.019559
4514660    0.009510
Name: distance, Length: 4514658, dtype: float64

In [27]:
df['distance'].describe()

count    4.514658e+06
mean     3.584109e-02
std      9.933152e-01
min      0.000000e+00
25%      1.107610e-02
50%      1.878826e-02
75%      3.136084e-02
max      1.339792e+02
Name: distance, dtype: float64

---

## Outlier Detection

- #### Delivery Duration : Identifing abnormally long or short delivery durations.

In [28]:
# Calculating Q1 and Q3
Q1 = df['delivery_duration'].quantile(0.25)
Q3 = df['delivery_duration'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR


lower_bound, upper_bound

(Timedelta('-1 days +21:30:30'), Timedelta('0 days 06:38:30'))

In [29]:
# Filtering out outliers
# df = df[(df['task_duration'] >= lower_bound) & (df['task_duration'] <= upper_bound)]
df = df[df['delivery_duration'].between(lower_bound, upper_bound)]
df.sample(5)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration,distance
4260809,2398211,87,Shanghai,2786,121.46085,31.22654,56181,0,2024-08-16 11:47:00,2024-08-16 11:47:00,121.43567,31.24048,2024-08-16 14:03:00,2024-08-16 14:03:00,121.46094,31.22643,816,0 days 02:16:00,0.028913
1170113,2869550,19,Hangzhou,3175,120.14063,30.31667,23902,1,2024-07-09 15:29:00,2024-07-09 15:29:00,120.13931,30.32898,2024-07-09 15:54:00,2024-07-09 15:54:00,120.14074,30.31707,709,0 days 00:25:00,0.011996
3068755,571697,9,Shanghai,1781,121.56903,30.86322,59133,1,2024-06-25 10:44:00,2024-06-25 10:44:00,121.47383,30.88941,2024-06-25 15:55:00,2024-06-25 15:55:00,121.5677,30.86285,625,0 days 05:11:00,0.097555
4279045,1626900,90,Shanghai,4470,121.43835,31.16943,31200,1,2024-09-13 08:26:00,2024-09-13 08:26:00,121.45462,31.16742,2024-09-13 14:03:00,2024-09-13 14:03:00,121.43873,31.16974,913,0 days 05:37:00,0.016058
3398240,1203771,38,Shanghai,918,121.87343,30.97733,56035,1,2024-07-07 08:56:00,2024-07-07 08:56:00,121.86455,30.93058,2024-07-07 14:05:00,2024-07-07 14:05:00,121.87443,30.97783,707,0 days 05:09:00,0.048272


- #### Geospatial Anomalies: Validating `lng/lat` values for any out-of-bound entries:
     > Latitude range: -90 to 90.
     
     > Longitude range: -180 to 180


In [30]:
# Validating lat and lng values
df = df[(df['lat'] >= -90) & (df['lat'] <= 90) & (df['lng'] >= -180) & (df['lng'] <= 180)]

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4193458 entries, 0 to 4514660
Data columns (total 19 columns):
 #   Column             Dtype          
---  ------             -----          
 0   order_id           int64          
 1   region_id          int64          
 2   city               object         
 3   courier_id         int64          
 4   lng                float64        
 5   lat                float64        
 6   aoi_id             int64          
 7   aoi_type           category       
 8   accept_time        datetime64[ns] 
 9   accept_gps_time    datetime64[ns] 
 10  accept_gps_lng     float64        
 11  accept_gps_lat     float64        
 12  delivery_time      datetime64[ns] 
 13  delivery_gps_time  datetime64[ns] 
 14  delivery_gps_lng   float64        
 15  delivery_gps_lat   float64        
 16  ds                 int64          
 17  delivery_duration  timedelta64[ns]
 18  distance           float64        
dtypes: category(1), datetime64[ns](4), float64(7), 

In [32]:
df.sample(5)

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,delivery_duration,distance
2389270,34743,99,Hangzhou,1759,120.17868,30.32525,59193,0,2024-08-17 14:13:00,2024-08-17 14:13:00,120.19593,30.32889,2024-08-17 15:37:00,2024-08-17 15:37:00,120.17872,30.32537,817,0 days 01:24:00,0.017566
551503,4287287,139,Chongqing,2505,106.68017,29.50825,33630,1,2024-08-29 16:17:00,2024-08-29 16:17:00,106.6437,29.47375,2024-08-29 20:01:00,2024-08-29 20:01:00,106.68011,29.50936,829,0 days 03:44:00,0.050929
793040,617487,155,Chongqing,72,106.56991,29.60627,55363,1,2024-08-06 15:16:00,2024-08-06 15:16:00,106.55273,29.57858,2024-08-06 16:11:00,2024-08-06 16:11:00,106.56971,29.60683,806,0 days 00:55:00,0.03296
2049288,2781865,82,Hangzhou,4376,120.13501,30.25416,13100,8,2024-08-04 15:16:00,2024-08-04 15:16:00,120.12383,30.27043,2024-08-04 15:51:00,2024-08-04 15:51:00,120.13515,30.25415,804,0 days 00:35:00,0.019829
1846014,4148150,60,Hangzhou,2546,120.28896,30.19464,52922,14,2024-05-07 08:33:00,2024-05-07 08:33:00,120.29611,30.21235,2024-05-07 11:05:00,2024-05-07 11:05:00,120.28783,30.19502,507,0 days 02:32:00,0.019206


---

## Data Transformation : 
> Normalize numerical  features like `distance` and `delivery_duration` to ensure all variables are on a similar scale.

> Encoding Categorical Variables  ->  `city` and `aoi_type` to numeric using one-hot or label encoding.

In [33]:
df['delivery_duration'] = (df['delivery_time'] - df['accept_time']).dt.total_seconds()

In [34]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

numerical_columns = ['distance', 'delivery_duration']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [35]:
df['delivery_duration']

0          1.000000
6          1.000000
20         0.731156
21         0.263819
22         0.256281
             ...   
4514655    0.585427
4514656    0.349246
4514657    0.768844
4514658    0.133166
4514660    0.766332
Name: delivery_duration, Length: 4193458, dtype: float64

In [36]:
df['distance']

0          0.000416
6          0.000387
20         0.000103
21         0.000105
22         0.000110
             ...   
4514655    0.000250
4514656    0.000147
4514657    0.000147
4514658    0.000167
4514660    0.000071
Name: distance, Length: 4193458, dtype: float64

In [35]:
#encoding 'city' and 'aoi_type' columns
df = pd.get_dummies(df, columns=['city', 'aoi_type'], drop_first=True)

In [None]:
df.dtypes

---

## Overview & Saving Cleaned Data

In [None]:
# Histogram of delivery durations
plt.figure(figsize=(10, 6))
plt.hist(df['delivery_duration'], bins=80, edgecolor='black')
plt.title('Distribution of Delivery Durations')
plt.xlabel('Normalized Delivery Duration')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

plt.xlim(left=0)


plt.show()

In [37]:
df.columns

Index(['order_id', 'region_id', 'city', 'courier_id', 'lng', 'lat', 'aoi_id',
       'aoi_type', 'accept_time', 'accept_gps_time', 'accept_gps_lng',
       'accept_gps_lat', 'delivery_time', 'delivery_gps_time',
       'delivery_gps_lng', 'delivery_gps_lat', 'ds', 'delivery_duration',
       'distance'],
      dtype='object')

In [38]:
df.shape

(4193458, 19)

In [39]:
df.iloc[:5, :17]

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds
0,2031782,10,Chongqing,73,108.71571,30.90228,50,14,2024-10-22 10:26:00,2024-10-22 10:26:00,108.71826,30.95587,2024-10-22 17:04:00,2024-10-22 17:04:00,108.66361,30.96702,1022
6,4481765,10,Chongqing,3605,108.71605,30.9041,50,14,2024-09-30 10:00:00,2024-09-30 10:00:00,108.71824,30.95583,2024-09-30 16:38:00,2024-09-30 16:38:00,108.71429,30.90416,930
20,3098203,10,Chongqing,1635,108.71797,30.94364,296,14,2024-07-10 08:33:00,2024-07-10 08:33:00,108.71801,30.95637,2024-07-10 13:24:00,2024-07-10 13:24:00,108.71809,30.9426,710
21,356619,10,Chongqing,1635,108.71979,30.9413,296,14,2024-09-09 09:04:00,2024-09-09 09:04:00,108.71803,30.95629,2024-09-09 10:49:00,2024-09-09 10:49:00,108.7197,30.94235,909
22,1484207,10,Chongqing,1635,108.72106,30.94164,296,14,2024-10-19 08:29:00,2024-10-19 08:29:00,108.7182,30.95598,2024-10-19 10:11:00,2024-10-19 10:11:00,108.72307,30.94201,1019


In [40]:
df.iloc[:5,17:]

Unnamed: 0,delivery_duration,distance
0,1.0,0.000416
6,1.0,0.000387
20,0.731156,0.000103
21,0.263819,0.000105
22,0.256281,0.00011


In [41]:
df.to_csv('cleaned_delivery_data_v2.csv', index=False)