# Driver Performance Analytics & Optimization

This notebook analyzes driver performance metrics, identifies top performers, and provides insights for driver retention and optimization.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
# Suppress every warning category for the remainder of the session
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette
# (order matters - both touch the colour cycle, and the palette must win)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')

# DataFrame rendering: never truncate columns, show floats with two decimals
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.2f}'.format

In [None]:
# Read the three source tables (paths are relative to the notebook's directory)
drivers = pd.read_csv('../data/drivers.csv')
trips = pd.read_csv('../data/trips.csv')
vehicles = pd.read_csv('../data/vehicles.csv')

# Parse date/time columns so later cells can use datetime arithmetic and the .dt accessor
for frame, datetime_columns in (
    (drivers, ['hire_date', 'birth_date']),
    (trips, ['pickup_datetime', 'dropoff_datetime']),
):
    for column in datetime_columns:
        frame[column] = pd.to_datetime(frame[column])

# Row counts as a quick sanity check on the load
print(f"Loaded {len(drivers):,} drivers")
print(f"Loaded {len(trips):,} trips")
print(f"Loaded {len(vehicles):,} vehicles")

# Keep only trips whose status is exactly 'Completed'; .copy() detaches the
# result from `trips` so it can be modified independently
is_completed = trips['trip_status'].eq('Completed')
completed_trips = trips.loc[is_completed].copy()

## 1. Driver Performance Metrics

In [None]:
# Build one row of performance metrics per driver from completed trips.
#
# Named aggregation binds each output column to its source column and function,
# so reordering or extending the spec cannot silently mislabel a metric (the
# previous positional rename of 11 columns could). reset_index() turns
# driver_id back into a regular column, which the merge below and later cells
# (e.g. driver_performance['driver_id']) rely on.
driver_performance = (
    completed_trips
    .groupby('driver_id')
    .agg(
        total_trips=('trip_id', 'count'),
        total_revenue=('total_fare', 'sum'),
        avg_fare=('total_fare', 'mean'),
        total_distance=('trip_distance', 'sum'),
        avg_distance=('trip_distance', 'mean'),
        avg_duration=('trip_duration', 'mean'),
        avg_customer_rating=('customer_rating', 'mean'),
        total_tips=('tips', 'sum'),
        avg_tips=('tips', 'mean'),
        avg_route_efficiency=('route_efficiency', 'mean'),
        avg_wait_time=('wait_time', 'mean'),
    )
    .reset_index()
)

# Attach driver attributes. how='left' keeps every driver that has completed
# trips even if it is missing from `drivers` (its attributes are then NaN).
# validate='many_to_one' raises if `drivers` contains duplicate driver_id rows,
# which would otherwise duplicate metric rows and inflate every downstream count.
driver_attributes = drivers[['driver_id', 'first_name', 'last_name', 'rating', 'status', 'hire_date']]
driver_performance = driver_performance.merge(
    driver_attributes, on='driver_id', how='left', validate='many_to_one'
)

# Derived metrics
# Total distance is floored at 1 so very short totals cannot blow up the ratio.
driver_performance['revenue_per_mile'] = (
    driver_performance['total_revenue'] / driver_performance['total_distance'].clip(lower=1)
)
# NOTE(review): 30 is an assumed observation window, not derived from the data -
# confirm it matches the date span covered by trips.csv.
driver_performance['trips_per_day'] = driver_performance['total_trips'] / 30
driver_performance['driver_name'] = driver_performance['first_name'] + ' ' + driver_performance['last_name']

# Top performers: the ten drivers with the highest total revenue
top_performers = driver_performance.nlargest(10, 'total_revenue')[
    ['driver_name', 'total_trips', 'total_revenue', 'avg_fare', 'avg_customer_rating', 'total_tips']
]
print("Top 10 Drivers by Revenue:")
print(top_performers)

## 2. Driver Performance Distribution

In [None]:
# Histogram of each per-driver metric on a 2x3 grid, with the median marked.
#
# One spec per panel, in row-major grid order:
# (column, x-axis label, title, bar colour, number of bins)
distribution_panels = [
    ('total_trips', 'Total Trips', 'Distribution of Total Trips per Driver', 'steelblue', 30),
    ('total_revenue', 'Total Revenue ($)', 'Distribution of Total Revenue per Driver', 'green', 30),
    ('avg_customer_rating', 'Average Customer Rating', 'Distribution of Customer Ratings', 'gold', 20),
    ('avg_tips', 'Average Tips ($)', 'Distribution of Average Tips', 'purple', 30),
    ('avg_route_efficiency', 'Average Route Efficiency', 'Distribution of Route Efficiency', 'orange', 30),
    ('revenue_per_mile', 'Revenue per Mile ($)', 'Distribution of Revenue per Mile', 'coral', 30),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for ax, (column, xlabel, title, color, bins) in zip(axes.flat, distribution_panels):
    # Drop missing values for every metric: per-driver means are NaN when a
    # driver has no recorded values, and hist() cannot bin NaN. Previously
    # only some of the panels did this.
    values = driver_performance[column].dropna()
    ax.hist(values, bins=bins, color=color, edgecolor='black')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of Drivers')
    ax.set_title(title)
    ax.axvline(values.median(), color='red', linestyle='--', label='Median')
    ax.legend()

plt.tight_layout()
plt.show()

## 3. Driver Segmentation using K-Means Clustering

In [None]:
# Segment drivers with K-Means on standardized performance features, then name
# each cluster from its average profile.

# Features used for clustering; drivers missing any of them are left unclustered
clustering_features = ['total_trips', 'avg_fare', 'avg_customer_rating', 'avg_tips', 'avg_route_efficiency']
X = driver_performance[clustering_features].dropna()

# Standardize so that no feature dominates the distance metric through its scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it implicit makes the segmentation version-dependent.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
cluster_ids = pd.Series(kmeans.fit_predict(X_scaled), index=X.index)
# Index-aligned assignment: drivers dropped above get NaN, and re-running the
# cell fully replaces any previous assignment.
driver_performance['cluster'] = cluster_ids

# Per-cluster feature means plus cluster size
cluster_summary = driver_performance.groupby('cluster')[clustering_features + ['total_revenue']].mean()
cluster_summary['driver_count'] = driver_performance.groupby('cluster').size()

# K-Means cluster ids are arbitrary, so labels are derived from each cluster's
# profile relative to the average across clusters instead of being hard-coded
# per id. Rules are checked in priority order; several clusters may receive
# the same label.
benchmarks = cluster_summary[['avg_customer_rating', 'avg_tips', 'total_trips', 'total_revenue']].mean()
cluster_labels = {}
for cluster_id, profile in cluster_summary.iterrows():
    if (profile['avg_customer_rating'] > benchmarks['avg_customer_rating']
            and profile['avg_tips'] > benchmarks['avg_tips']):
        label = 'Premium Service'
    elif profile['total_trips'] > benchmarks['total_trips']:
        label = 'High Volume Drivers'
    elif profile['total_revenue'] < benchmarks['total_revenue']:
        label = 'Underperformers'
    else:
        label = 'Standard Performers'
    # The cluster column is float when some drivers are unclustered (NaN);
    # int keys keep the printed ids clean and still match on lookup.
    cluster_labels[int(cluster_id)] = label

driver_performance['segment'] = driver_performance['cluster'].map(cluster_labels)

print("Driver Segmentation Summary:")
print("="*60)
# Iterate over the clusters that actually exist in the summary
for cluster_id, cluster_data in cluster_summary.iterrows():
    label = cluster_labels[int(cluster_id)]
    print(f"\n{label} (Cluster {int(cluster_id)}):")
    print(f"  - Number of drivers: {cluster_data['driver_count']:.0f}")
    print(f"  - Avg trips: {cluster_data['total_trips']:.0f}")
    print(f"  - Avg fare: ${cluster_data['avg_fare']:.2f}")
    print(f"  - Avg customer rating: {cluster_data['avg_customer_rating']:.2f}")
    print(f"  - Avg tips: ${cluster_data['avg_tips']:.2f}")

## 4. Driver Utilization Analysis

In [None]:
# Per-driver daily utilization: share of the working day spent on trips.

# Kept so `trips` still carries a 'date' column after this cell, as before.
trips['date'] = trips['pickup_datetime'].dt.date

# Derive the grouping key from completed_trips itself. Grouping completed_trips
# by a Series taken from `trips` only worked through implicit index alignment
# between the two frames.
trip_date = completed_trips['pickup_datetime'].dt.date.rename('date')
daily_driver_stats = (
    completed_trips
    .groupby(['driver_id', trip_date])
    .agg({
        'trip_id': 'count',
        'total_fare': 'sum',
        'trip_duration': 'sum',
        'pickup_datetime': 'min',
        'dropoff_datetime': 'max'
    })
    .rename(columns={'trip_id': 'daily_trips', 'total_fare': 'daily_revenue',
                     'trip_duration': 'total_driving_seconds'})
)

# Working hours = span from the day's first pickup to its last dropoff
daily_driver_stats['working_hours'] = (
    daily_driver_stats['dropoff_datetime'] - daily_driver_stats['pickup_datetime']
).dt.total_seconds() / 3600
# NOTE(review): assumes trip_duration is recorded in seconds - confirm against the data dictionary
daily_driver_stats['driving_hours'] = daily_driver_stats['total_driving_seconds'] / 3600
# Working hours are floored at 0.1 to avoid dividing by (near) zero on very short days
daily_driver_stats['utilization_rate'] = (
    daily_driver_stats['driving_hours'] / daily_driver_stats['working_hours'].clip(lower=0.1)
) * 100

# Average each driver's daily figures
driver_utilization = daily_driver_stats.groupby('driver_id').agg({
    'daily_trips': 'mean',
    'daily_revenue': 'mean',
    'working_hours': 'mean',
    'driving_hours': 'mean',
    'utilization_rate': 'mean'
})

# Visualize utilization
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Working hours vs revenue, on at most 200 drivers to keep the scatter readable.
# random_state is fixed so the figure is identical on every run.
sample_drivers = driver_utilization.sample(min(200, len(driver_utilization)), random_state=42)
axes[0].scatter(sample_drivers['working_hours'], sample_drivers['daily_revenue'], alpha=0.6, color='steelblue')
axes[0].set_xlabel('Average Working Hours per Day')
axes[0].set_ylabel('Average Daily Revenue ($)')
axes[0].set_title('Working Hours vs Daily Revenue')
axes[0].grid(True, alpha=0.3)

# Utilization rate distribution (values above 100% are capped for display only)
axes[1].hist(driver_utilization['utilization_rate'].clip(upper=100), bins=30, color='green', edgecolor='black')
axes[1].set_xlabel('Utilization Rate (%)')
axes[1].set_ylabel('Number of Drivers')
axes[1].set_title('Driver Utilization Rate Distribution')
axes[1].axvline(driver_utilization['utilization_rate'].median(), color='red', linestyle='--', label='Median')
axes[1].legend()

# Trips per hour worked; working hours are floored at 1 so very short average
# days do not produce inflated rates
driver_utilization['trips_per_hour'] = (
    driver_utilization['daily_trips'] / driver_utilization['working_hours'].clip(lower=1)
)
axes[2].hist(driver_utilization['trips_per_hour'], bins=30, color='purple', edgecolor='black')
axes[2].set_xlabel('Trips per Hour Worked')
axes[2].set_ylabel('Number of Drivers')
axes[2].set_title('Driver Efficiency (Trips/Hour)')
axes[2].axvline(driver_utilization['trips_per_hour'].median(), color='red', linestyle='--', label='Median')
axes[2].legend()

plt.tight_layout()
plt.show()

print("\nDriver Utilization Summary:")
print(f"Average working hours per day: {driver_utilization['working_hours'].mean():.1f}")
print(f"Average driving hours per day: {driver_utilization['driving_hours'].mean():.1f}")
print(f"Average utilization rate: {driver_utilization['utilization_rate'].mean():.1f}%")
print(f"Average trips per hour: {driver_utilization['trips_per_hour'].mean():.2f}")

## 5. Driver Churn Risk Analysis

In [None]:
# Classify drivers by how long ago they completed their last trip.

# Most recent completed pickup per driver
last_trip_date = completed_trips.groupby('driver_id')['pickup_datetime'].max()
driver_performance['last_trip_date'] = driver_performance['driver_id'].map(last_trip_date)

# Inactivity is measured against the latest pickup in the data, not the wall
# clock: with a static extract, pd.Timestamp.now() makes every result depend on
# the day the notebook is run and eventually pushes all drivers into 'High Risk'.
# For a live feed, replace this with pd.Timestamp.now().
reference_date = completed_trips['pickup_datetime'].max()
driver_performance['days_since_last_trip'] = (
    reference_date - driver_performance['last_trip_date']
).dt.days


def assign_churn_risk(days):
    """Map days since a driver's last trip to a churn-risk level.

    Args:
        days: Whole days since the last completed trip, or NaN/None when the
            driver has no recorded trip.

    Returns:
        'No Trips' for missing input, 'Active' for <= 7 days, 'Low Risk' for
        <= 14, 'Medium Risk' for <= 30, otherwise 'High Risk'.
    """
    if pd.isna(days):
        return 'No Trips'
    elif days <= 7:
        return 'Active'
    elif days <= 14:
        return 'Low Risk'
    elif days <= 30:
        return 'Medium Risk'
    else:
        return 'High Risk'


driver_performance['churn_risk'] = driver_performance['days_since_last_trip'].apply(assign_churn_risk)

# Colour is bound to the risk level by name. value_counts() orders by frequency,
# so a positional colour list could paint 'High Risk' green. The dict order also
# defines the display order (least to most severe, then 'No Trips').
risk_colors = {
    'Active': '#2ecc71',
    'Low Risk': '#f39c12',
    'Medium Risk': '#e67e22',
    'High Risk': '#e74c3c',
    'No Trips': '#95a5a6',
}

# Churn risk distribution, restricted to the levels that occur in the data
risk_counts = driver_performance['churn_risk'].value_counts()
present_levels = [level for level in risk_colors if level in risk_counts.index]
churn_distribution = risk_counts[present_levels]

# Visualize churn risk
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Churn risk pie chart
axes[0].pie(churn_distribution.values, labels=churn_distribution.index, autopct='%1.1f%%',
            colors=[risk_colors[level] for level in churn_distribution.index], startangle=90)
axes[0].set_title('Driver Churn Risk Distribution')

# Days since last trip, for drivers whose status is 'Active', with the risk
# thresholds overlaid
active_drivers = driver_performance[driver_performance['status'] == 'Active']
axes[1].hist(active_drivers['days_since_last_trip'].dropna(), bins=30, color='coral', edgecolor='black')
axes[1].set_xlabel('Days Since Last Trip')
axes[1].set_ylabel('Number of Drivers')
axes[1].set_title('Days Since Last Trip (Active Drivers)')
axes[1].axvline(7, color='green', linestyle='--', label='Active (7 days)')
axes[1].axvline(14, color='yellow', linestyle='--', label='Low Risk (14 days)')
axes[1].axvline(30, color='red', linestyle='--', label='High Risk (30 days)')
axes[1].legend()

# Average performance per risk level, shown in severity order rather than the
# alphabetical order groupby produces
churn_performance = driver_performance.groupby('churn_risk').agg({
    'avg_customer_rating': 'mean',
    'avg_fare': 'mean',
    'total_trips': 'mean'
}).reindex(present_levels)

x = range(len(churn_performance))
axes[2].bar(x, churn_performance['avg_customer_rating'], color='steelblue')
axes[2].set_xticks(x)
axes[2].set_xticklabels(churn_performance.index, rotation=45)
axes[2].set_ylabel('Average Customer Rating')
axes[2].set_title('Customer Rating by Churn Risk Level')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# High performers at risk: rated above 4.0 and with above-median revenue,
# currently in the Medium or High risk band
at_risk_performers = driver_performance[
    (driver_performance['churn_risk'].isin(['Medium Risk', 'High Risk'])) &
    (driver_performance['avg_customer_rating'] > 4.0) &
    (driver_performance['total_revenue'] > driver_performance['total_revenue'].median())
][['driver_name', 'total_trips', 'total_revenue', 'avg_customer_rating', 'days_since_last_trip', 'churn_risk']]

print("\nHigh-Performing Drivers at Risk of Churn:")
print(at_risk_performers.head(10))

## 6. Driver Performance Trends

In [None]:
# Fleet-wide weekly trends in trip volume, revenue and customer rating.

# Kept so `trips` still carries a 'week' column after this cell, as before.
trips['week'] = trips['pickup_datetime'].dt.to_period('W')

# Derive the grouping key from completed_trips itself. Grouping completed_trips
# by a Series taken from `trips` only worked through implicit index alignment
# between the two frames.
trip_week = completed_trips['pickup_datetime'].dt.to_period('W').rename('week')
weekly_performance = (
    completed_trips
    .groupby(['driver_id', trip_week])
    .agg({
        'trip_id': 'count',
        'total_fare': 'sum',
        'customer_rating': 'mean'
    })
    .rename(columns={'trip_id': 'weekly_trips', 'total_fare': 'weekly_revenue'})
)

# Roll driver-weeks up to fleet-weeks. Note that customer_rating here is the
# mean of per-driver weekly means, i.e. each driver counts equally regardless
# of how many trips they completed that week.
overall_weekly = weekly_performance.groupby(level='week').agg({
    'weekly_trips': 'sum',
    'weekly_revenue': 'sum',
    'customer_rating': 'mean'
})

# Convert period to timestamp for plotting
overall_weekly.index = overall_weekly.index.to_timestamp()

# One spec per panel: (column, marker, line colour, y-axis label, title)
trend_panels = [
    ('weekly_trips', 'o', 'steelblue', 'Total Trips', 'Weekly Trip Volume Trend'),
    ('weekly_revenue', 's', 'green', 'Total Revenue ($)', 'Weekly Revenue Trend'),
    ('customer_rating', '^', 'gold', 'Average Customer Rating', 'Weekly Customer Satisfaction Trend'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (column, marker, color, ylabel, title) in zip(axes, trend_panels):
    ax.plot(overall_weekly.index, overall_weekly[column], marker=marker, color=color, linewidth=2)
    ax.set_xlabel('Week')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

# Fixed y-range on the rating panel; weekly averages outside 3.5-5.0 would be clipped from view
axes[2].set_ylim([3.5, 5.0])

plt.tight_layout()
plt.show()

## 7. Driver Recommendations & Insights

In [None]:
# Text summary of the findings above plus derived recommendations.

# Thresholds used in this report, gathered in one place so they can be tuned together
LOW_RATING_THRESHOLD = 4.0        # drivers rated below this are flagged for training
TARGET_UTILIZATION_PCT = 50       # utilization level the fleet average is compared against
UNDERUTILIZED_PCT = 40            # drivers below this are candidates for reassignment
# NOTE(review): 0.7 is applied to at-risk revenue as the retention program's
# value; presumably an expected retention rate - confirm with the business owner.
RETENTION_VALUE_FACTOR = 0.7

# Shared inputs, computed once instead of being re-filtered for every line of output
at_risk_mask = driver_performance['churn_risk'].isin(['High Risk', 'Medium Risk'])
at_risk_revenue = driver_performance.loc[at_risk_mask, 'total_revenue'].sum()
mean_utilization = driver_utilization['utilization_rate'].mean()

print("=" * 70)
print("DRIVER PERFORMANCE ANALYTICS SUMMARY")
print("=" * 70)

# Overall statistics
active_driver_count = int((driver_performance['status'] == 'Active').sum())
print("\n📊 OVERALL STATISTICS:")
print(f"   Total active drivers: {active_driver_count}")
print(f"   Average trips per driver: {driver_performance['total_trips'].mean():.0f}")
print(f"   Average revenue per driver: ${driver_performance['total_revenue'].mean():.2f}")
print(f"   Average customer rating: {driver_performance['avg_customer_rating'].mean():.2f}")

# Top performers
print("\n🏆 TOP PERFORMERS:")
top_3 = driver_performance.nlargest(3, 'total_revenue')[['driver_name', 'total_revenue']]
for _, row in top_3.iterrows():
    print(f"   {row['driver_name']}: ${row['total_revenue']:.2f}")

# Segmentation insights (value_counts already excludes missing segments)
print("\n👥 DRIVER SEGMENTATION:")
segment_counts = driver_performance['segment'].value_counts()
for segment, count in segment_counts.items():
    print(f"   {segment}: {count} drivers")

# Churn risk
print("\n⚠️ CHURN RISK ANALYSIS:")
high_risk_count = int((driver_performance['churn_risk'] == 'High Risk').sum())
medium_risk_count = int((driver_performance['churn_risk'] == 'Medium Risk').sum())
print(f"   High risk drivers: {high_risk_count}")
print(f"   Medium risk drivers: {medium_risk_count}")
print(f"   Total at-risk revenue: ${at_risk_revenue:.2f}")

# Utilization insights
low_utilization_count = int((driver_utilization['utilization_rate'] < TARGET_UTILIZATION_PCT).sum())
print("\n⏰ UTILIZATION METRICS:")
print(f"   Average utilization rate: {mean_utilization:.1f}%")
print(f"   Drivers with <50% utilization: {low_utilization_count}")
print(f"   Average working hours/day: {driver_utilization['working_hours'].mean():.1f}")

print("\n" + "=" * 70)
print("RECOMMENDATIONS")
print("=" * 70)

print("\n1. RETENTION STRATEGIES:")
print(f"   • Implement retention bonus for {high_risk_count} high-risk drivers")
print("   • Create re-engagement campaign for drivers inactive >14 days")
print("   • Offer incentives for low utilization drivers (<50%)")

print("\n2. PERFORMANCE OPTIMIZATION:")
low_rating_drivers = int((driver_performance['avg_customer_rating'] < LOW_RATING_THRESHOLD).sum())
print(f"   • Provide training for {low_rating_drivers} drivers with rating <4.0")
print("   • Share best practices from top 10% performers")
print("   • Implement route optimization tools for efficiency improvement")

print("\n3. CAPACITY MANAGEMENT:")
underutilized = int((driver_utilization['utilization_rate'] < UNDERUTILIZED_PCT).sum())
print(f"   • Reassign {underutilized} underutilized drivers to high-demand zones")
print("   • Implement dynamic scheduling based on demand patterns")
print("   • Consider part-time options for low-activity drivers")

print("\n4. REWARD PROGRAMS:")
high_performers = int((driver_performance['segment'] == 'Premium Service').sum())
print(f"   • Create premium tier rewards for {high_performers} top drivers")
print("   • Implement tips-based bonuses for high customer ratings")
print("   • Offer vehicle upgrade programs for consistent performers")

# Calculate potential impact
potential_retention_value = at_risk_revenue * RETENTION_VALUE_FACTOR
# Clamp at zero: when the fleet already averages above the target there is no
# gap to close, and a negative "increase" would be misleading.
utilization_gap = max(0.0, TARGET_UTILIZATION_PCT - mean_utilization)
print("\n💰 POTENTIAL IMPACT:")
print(f"   Retention program value: ${potential_retention_value:.2f}")
print(f"   Utilization improvement opportunity: {utilization_gap:.1f}% increase")