In [4]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [5]:
data_dir = os.path.join(os.getcwd(), '../data')
raw_data_dir = os.path.join(data_dir, 'raw')
Food_Delivery_Data = os.path.join(raw_data_dir, 'Food_Delivery_Data.xlsx')

preprocessed_out_dir = os.path.join(data_dir, 'Preprocessed')
os.makedirs(preprocessed_out_dir, exist_ok=True)
preprocessed_saved_filepath = os.path.join(preprocessed_out_dir, 'preprocessed.pkl')


feature_engineered_out_dir = os.path.join(data_dir, 'FeatureEngineered')
os.makedirs(feature_engineered_out_dir, exist_ok=True)
feature_engineered_saved_filepath = os.path.join(feature_engineered_out_dir, 'feature_engineered.pkl')

In [6]:
df = pd.read_pickle(preprocessed_saved_filepath)

## 4.Advanced Feature Creation

### Outlier Detection & Distance Optimization
Identifying and handling distance outliers that may negatively impact model performance.

In [7]:
# Outlier Detection and Distance Clipping
print("Analyzing distance distribution for outliers...")

# Analyze distance distribution
print(f"Distance Statistics:")
print(f"   • Mean: {df['distance_geodesic_km'].mean():.2f} km")
print(f"   • Median: {df['distance_geodesic_km'].median():.2f} km")
print(f"   • 95th percentile: {df['distance_geodesic_km'].quantile(0.95):.2f} km")
print(f"   • 99th percentile: {df['distance_geodesic_km'].quantile(0.99):.2f} km")
print(f"   • Maximum: {df['distance_geodesic_km'].max():.2f} km")

# Clip extreme distances at 99th percentile
p99 = df['distance_geodesic_km'].quantile(0.99)
extreme_distances = (df['distance_geodesic_km'] > p99).sum()

print(f"\nOutlier Analysis:")
print(f"   • Records with distance > 99th percentile: {extreme_distances}")
print(f"   • Clipping threshold (P99): {p99:.2f} km")

# Create clipped distance feature
df['distance_clipped'] = df['distance_geodesic_km'].clip(upper=p99)

print(f"\nCreated 'distance_clipped' feature")
print(f"   • Original max distance: {df['distance_geodesic_km'].max():.2f} km") 
print(f"   • Clipped max distance: {df['distance_clipped'].max():.2f} km")

Analyzing distance distribution for outliers...
Distance Statistics:
   • Mean: 99.20 km
   • Median: 9.25 km
   • 95th percentile: 20.14 km
   • 99th percentile: 20.94 km
   • Maximum: 19709.58 km

Outlier Analysis:
   • Records with distance > 99th percentile: 453
   • Clipping threshold (P99): 20.94 km

Created 'distance_clipped' feature
   • Original max distance: 19709.58 km
   • Clipped max distance: 20.94 km


### Geographic Feature Engineering: City-Based Analytics
Extracting valuable location-based features from delivery person IDs to capture regional delivery patterns.

In [8]:
# Geographic Feature Engineering: City Code Extraction
print("Extracting city information from Delivery Person IDs...")

# Extract city codes from delivery person IDs
df['City_Code'] = df['Delivery_person_ID'].str.split('RES', expand=True)[0]

print("City code extraction completed!")
print(f"City Analysis:")
print(f"   • Unique cities identified: {df['City_Code'].nunique()}")
print(f"   • Sample city codes: {df['City_Code'].unique()[:10].tolist()}")

display(df['City_Code'].head())

Extracting city information from Delivery Person IDs...
City code extraction completed!
City Analysis:
   • Unique cities identified: 22
   • Sample city codes: ['INDO', 'BANG', 'COIMB', 'CHEN', 'HYD', 'RANCHI', 'MYS', 'DEH', 'KOC', 'PUNE']


0     INDO
1     BANG
2     BANG
3    COIMB
4     CHEN
Name: City_Code, dtype: object

In [9]:
# Display all unique city codes for analysis
unique_cityCode = df['City_Code'].unique()
print(f"Complete City Code List ({len(unique_cityCode)} cities):")
print(unique_cityCode)

# Analyze city distribution
city_counts = df['City_Code'].value_counts()
print(f"\nTop 10 Cities by Delivery Volume:")
display(city_counts.head(10).to_frame('Delivery Count'))

Complete City Code List (22 cities):
['INDO' 'BANG' 'COIMB' 'CHEN' 'HYD' 'RANCHI' 'MYS' 'DEH' 'KOC' 'PUNE'
 'LUDH' 'KNP' 'MUM' 'KOL' 'JAP' 'SUR' 'GOA' 'AURG' 'AGR' 'VAD' 'ALH' 'BHP']

Top 10 Cities by Delivery Volume:


Unnamed: 0_level_0,Delivery Count
City_Code,Unnamed: 1_level_1
JAP,3443
RANCHI,3229
BANG,3195
SUR,3187
HYD,3181
MUM,3173
MYS,3171
COIMB,3170
VAD,3166
INDO,3159


#### City Tier Classification Strategy
Implementing a strategic city classification system based on urban development, traffic patterns, and delivery complexity:

In [10]:
# City Tier Mapping: Strategic Classification
print("Implementing city tier classification system...")

# Tier 1: Major metropolitan areas (high traffic, complex logistics)
tier_1_cities = ['BANG', 'CHEN', 'DEH', 'KOLK', 'MUMB', 'PUNE', 'HYD']

# Tier 2: Other major cities (moderate complexity)
tier_2_cities = ['AGR', 'ALH', 'AURG', 'COIMB', 'JAP', 'KOC', 'SUR', 
                 'INDO', 'VAD', 'BHP', 'KNP', 'RANCHI', 'MYS']

# Tier 3: Smaller cities/tourist areas (simpler logistics)
tier_3_cities = ['GOA']
 
def map_tier(city_code):
    """
    Map city codes to tier levels based on urban complexity and delivery challenges.
    
    Returns:
        int: 3 (Tier 1 - highest complexity), 2 (Tier 2), 1 (Tier 3 - lowest complexity)
    """
    if city_code in tier_1_cities:
        return 3  # Tier 1 (highest delivery complexity)
    elif city_code in tier_2_cities:
        return 2  # Tier 2 (moderate complexity)
    elif city_code in tier_3_cities:
        return 1  # Tier 3 (lowest complexity)
    else:
        return 2  # Default to Tier 2 if unknown city
 
# Apply tier mapping
df['city_tier_encoded'] = df['City_Code'].apply(map_tier)

print("City tier classification completed!")
print(f"\nCity Tier Distribution:")
tier_distribution = df['city_tier_encoded'].value_counts().sort_index()
for tier, count in tier_distribution.items():
    tier_name = {3: "Tier 1 (Metro)", 2: "Tier 2 (Major)", 1: "Tier 3 (Small)"}[tier]
    percentage = (count / len(df)) * 100
    print(f"   • {tier_name}: {count:,} deliveries ({percentage:.1f}%)")

print(f"\nTier Classification Logic:")
print(f"   • Tier 1 (3): Major metros with high traffic complexity")
print(f"   • Tier 2 (2): Other major cities with moderate complexity") 
print(f"   • Tier 3 (1): Smaller cities with simpler logistics")

Implementing city tier classification system...
City tier classification completed!

City Tier Distribution:
   • Tier 3 (Small): 709 deliveries (1.6%)
   • Tier 2 (Major): 31,494 deliveries (69.1%)
   • Tier 1 (Metro): 13,390 deliveries (29.4%)

Tier Classification Logic:
   • Tier 1 (3): Major metros with high traffic complexity
   • Tier 2 (2): Other major cities with moderate complexity
   • Tier 3 (1): Smaller cities with simpler logistics


### Advanced Feature Engineering: Derived Variables
Creating sophisticated features that capture complex relationships and domain-specific insights for improved model performance.

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       45593 non-null  object 
 1   Delivery_person_ID       45593 non-null  object 
 2   Delivery_person_Age      45593 non-null  int64  
 3   Delivery_person_Ratings  45593 non-null  float64
 4   Type_of_vehicle          45593 non-null  object 
 5   Time_taken(min)          45593 non-null  int64  
 6   distance_geodesic_km     45593 non-null  float64
 7   Type_of_vehicle_encoded  45593 non-null  int64  
 8   Type_of_order_Drinks     45593 non-null  bool   
 9   Type_of_order_Meal       45593 non-null  bool   
 10  Type_of_order_Snack      45593 non-null  bool   
 11  distance_clipped         45593 non-null  float64
 12  City_Code                45593 non-null  object 
 13  city_tier_encoded        45593 non-null  int64  
dtypes: bool(3), float64(3)

In [12]:
# Advanced Feature Engineering: Statistical and Interaction Features
print("Creating advanced derived features...")

# 1. Standardized Rating Score (Z-score normalization)
rating_mean = df['Delivery_person_Ratings'].mean()
rating_std = df['Delivery_person_Ratings'].std()
df['rating_zscore'] = (df['Delivery_person_Ratings'] - rating_mean) / rating_std

# 2. Vehicle-Distance Interaction (efficiency factor)
df['vehicle_distance_interaction'] = df['distance_geodesic_km'] / (df['Type_of_vehicle_encoded'] + 1)

# 3. Non-linear Distance Features
df['distance_squared'] = df['distance_geodesic_km'] ** 2        # Quadratic relationship
df['log_distance'] = np.log1p(df['distance_geodesic_km'])      # Logarithmic transformation

print("Advanced features created:")
print(f"   • rating_zscore: Standardized delivery person ratings")
print(f"   • vehicle_distance_interaction: Distance efficiency by vehicle type")
print(f"   • distance_squared: Quadratic distance relationship")
print(f"   • log_distance: Log-transformed distance for skewed relationships")

# Display feature statistics
new_features = ['rating_zscore', 'vehicle_distance_interaction', 'distance_squared', 'log_distance']
print(f"\nNew Feature Statistics:")
display(df[new_features].describe())

Creating advanced derived features...
Advanced features created:
   • rating_zscore: Standardized delivery person ratings
   • vehicle_distance_interaction: Distance efficiency by vehicle type
   • distance_squared: Quadratic distance relationship
   • log_distance: Log-transformed distance for skewed relationships

New Feature Statistics:


Unnamed: 0,rating_zscore,vehicle_distance_interaction,distance_squared,log_distance
count,45593.0,45593.0,45593.0,45593.0
mean,-1.10977e-15,36.609716,1219649.0,2.267995
std,1.0,492.712279,18264180.0,0.899266
min,-11.08417,0.365959,2.142819,0.90172
25%,-0.09876737,1.512492,21.66144,1.732396
50%,0.2063828,2.656231,85.50746,2.326987
75%,0.5115329,4.152889,188.8132,2.690628
max,4.173335,19085.969903,388467400.0,9.888911


#### Traffic Congestion Modeling
Incorporating city-specific traffic patterns and congestion factors that significantly impact delivery times in urban environments.

In [13]:
# Traffic Congestion Factor Implementation
print("Implementing traffic congestion modeling...")

# Define congestion factors based on city tier analysis
congestion_factors = {
    3: 1.5,  # Tier 1 cities: High congestion (50% time penalty)
    2: 1.2,  # Tier 2 cities: Moderate congestion (20% time penalty)  
    1: 1.0   # Tier 3 cities: Low congestion (no penalty)
}

# Apply congestion factors
df['traffic_congestion_factor'] = df['city_tier_encoded'].map(congestion_factors)

# Create traffic-adjusted interaction feature
df['traffic_adjusted_interaction'] = (df['vehicle_distance_interaction'] * 
                                    df['traffic_congestion_factor'])

print("Traffic congestion modeling completed!")
print(f"\nCongestion Factor Distribution:")
congestion_dist = df['traffic_congestion_factor'].value_counts().sort_index()
for factor, count in congestion_dist.items():
    tier_desc = {1.5: "High (Tier 1)", 1.2: "Moderate (Tier 2)", 1.0: "Low (Tier 3)"}[factor]
    percentage = (count / len(df)) * 100
    print(f"   • {factor}x factor ({tier_desc}): {count:,} deliveries ({percentage:.1f}%)")

print(f"\nTraffic-Adjusted Interaction Feature:")
print(f"   • Mean: {df['traffic_adjusted_interaction'].mean():.3f}")
print(f"   • Range: {df['traffic_adjusted_interaction'].min():.3f} - {df['traffic_adjusted_interaction'].max():.3f}")

Implementing traffic congestion modeling...
Traffic congestion modeling completed!

Congestion Factor Distribution:
   • 1.0x factor (Low (Tier 3)): 709 deliveries (1.6%)
   • 1.2x factor (Moderate (Tier 2)): 31,494 deliveries (69.1%)
   • 1.5x factor (High (Tier 1)): 13,390 deliveries (29.4%)

Traffic-Adjusted Interaction Feature:
   • Mean: 47.367
   • Range: 0.385 - 26901.236


## Save the file locally

In [14]:
df.to_pickle(feature_engineered_saved_filepath)