# Uber Driver Data Analysis

This notebook analyzes driver performance, earnings, and behavior patterns from the Uber hackathon dataset.

## 1. Import Libraries and Load Data

In [None]:
!pip install -U pip
!pip install pandas matplotlib seaborn openpyxl

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Blanket filter: every warning category is silenced from here on.
# NOTE(review): this also hides pandas/seaborn deprecation notices.
warnings.filterwarnings(action='ignore')

# Plot defaults: white-grid seaborn style, figures rendered at 100 DPI
sns.set_style("whitegrid")
plt.rcParams.update({'figure.dpi': 100})

In [None]:
# ---------------------------
# Read every worksheet of the workbook into a {sheet name: DataFrame} dict
# ---------------------------
file_path = "uber_hackathon_v2_mock_data.xlsx"
sheets = pd.read_excel(file_path, sheet_name=None)

# Required sheets: a missing one raises KeyError on lookup
earners, rides_trips, earnings_daily, incentives_weekly, cancellation_rates = (
    sheets[name]
    for name in ('earners', 'rides_trips', 'earnings_daily',
                 'incentives_weekly', 'cancellation_rates')
)

# Optional sheets: left as None when the workbook does not contain them
surge_by_hour = sheets.get('surge_by_hour')
weather_daily = sheets.get('weather_daily')
heatmap_sheet = sheets.get('heatmap')
jobs_like = sheets.get('jobs_like')

# Parse timestamp columns in place so later cells can use the .dt accessor
for _frame, _column in ((rides_trips, 'start_time'),
                        (rides_trips, 'end_time'),
                        (earnings_daily, 'date')):
    _frame[_column] = pd.to_datetime(_frame[_column])

# Short load summary
for _line in (
    f"Loaded {len(sheets)} sheets from {file_path}",
    f"\nEarners: {len(earners)} records",
    f"Rides/Trips: {len(rides_trips)} records",
    f"Daily Earnings: {len(earnings_daily)} records",
    f"Weekly Incentives: {len(incentives_weekly)} records",
):
    print(_line)

## 2. Earnings vs Working Time

In [None]:
# ---------------------------
# 2. Earnings vs Working Time
# ---------------------------
# One point per row of earnings_daily: ride minutes against net earnings.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=earnings_daily,
    x='rides_duration_mins',
    y='total_net_earnings',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Earnings vs. Working Time')
ax.set_xlabel('Rides Duration (mins)')
ax.set_ylabel('Total Net Earnings (€)')
fig.tight_layout()
plt.show()

## 3. Trips per Day Distribution

In [None]:
# ---------------------------
# 3. Trips per Day Distribution
# ---------------------------
# The histogram counts rows of earnings_daily per trips_count bin. That table
# carries both a date and an earner_id column, so each row appears to be one
# driver on one day (TODO confirm); the y-axis is therefore labelled as
# driver-days rather than drivers, since a driver can contribute many rows.
plt.figure(figsize=(8,6))
sns.histplot(earnings_daily['trips_count'], bins=15, kde=False)
plt.title('Trips per Day Distribution')
plt.xlabel('Trips per Day')
plt.ylabel('Number of Driver-Days')
plt.tight_layout()
plt.show()

## 4. Hourly Earnings Curve

In [None]:
# ---------------------------
# 4. Hourly Earnings Curve
# ---------------------------
# Tag each trip with the hour it started in. The column is written onto
# rides_trips itself, so it stays available to later cells.
rides_trips['hour'] = rides_trips['start_time'].dt.hour

# Mean per-trip net earnings for each start hour
hourly_earnings = rides_trips.groupby('hour', as_index=False)['net_earnings'].mean()

fig, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(data=hourly_earnings, x='hour', y='net_earnings', marker='o', ax=ax)
ax.set_title('Hourly Earnings Curve')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Average Net Earnings (€)')
fig.tight_layout()
plt.show()

## 5. Idle Time Analysis

In [None]:
# ---------------------------
# 5. Idle Time Analysis
# ---------------------------
# Order each driver's trips chronologically, then measure the gap between the
# end of one trip and the start of that driver's next trip. A driver's last
# trip has no successor, so its gap is NaN.
rides_trips_sorted = (
    rides_trips
    .sort_values(['driver_id', 'start_time'])
    .assign(
        next_start=lambda trips: trips.groupby('driver_id')['start_time'].shift(-1),
        idle_time_mins=lambda trips: (
            (trips['next_start'] - trips['end_time']).dt.total_seconds().div(60)
        ),
    )
)

# Keep gaps strictly between 0 and 300 minutes; this drops overlapping trips,
# zero-length gaps, very long breaks and the NaN rows.
rides_trips_filtered = rides_trips_sorted.loc[
    lambda trips: trips['idle_time_mins'].gt(0) & trips['idle_time_mins'].lt(300)
]

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=rides_trips_filtered,
    x='idle_time_mins',
    y='net_earnings',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Idle Time vs Earnings')
ax.set_xlabel('Idle Time (mins)')
ax.set_ylabel('Net Earnings (€)')
fig.tight_layout()
plt.show()

## 6. Weekly Incentives Effect

In [None]:
# ---------------------------
# 6. Weekly Incentives Effect
# ---------------------------
# Net earnings summed per earner over every date in earnings_daily. Despite
# the name, this is a per-earner total, not a per-week figure.
weekly_earnings = earnings_daily.groupby('earner_id').agg({'total_net_earnings':'sum'}).reset_index()

# Row-level join, kept unchanged for any later cell that reads it. If
# incentives_weekly holds several rows per earner (presumably one per week --
# TODO confirm), this join repeats the same earnings total on each of them, so
# it is not used for the plot below.
weekly_data = weekly_earnings.merge(incentives_weekly, on='earner_id', how='left')

# Collapse incentives to one row per earner so both axes are per-earner totals.
# min_count=1 keeps an earner whose bonuses are all missing as NaN, not 0.
bonus_per_earner = (
    incentives_weekly
    .groupby('earner_id', as_index=False)['bonus_eur']
    .sum(min_count=1)
)
incentive_effect = weekly_earnings.merge(bonus_per_earner, on='earner_id', how='left')

plt.figure(figsize=(8,6))
sns.scatterplot(x='bonus_eur', y='total_net_earnings', data=incentive_effect, alpha=0.5)
plt.title('Total Incentives vs Total Earnings (per Earner)')
plt.xlabel('Total Bonus (€)')
plt.ylabel('Total Net Earnings (€)')
plt.tight_layout()
plt.show()

## 7. Driver Rating vs Workload

In [None]:
# ---------------------------
# 7. Driver Rating vs Workload
# ---------------------------
# "Workload" in this cell is the mean trip duration of each driver.
driver_workload = rides_trips.groupby('driver_id', as_index=False)['duration_mins'].mean()

# Attach each driver's rating; drivers with no match in earners keep NaN.
rating_workload = pd.merge(
    driver_workload,
    earners[['earner_id', 'rating']],
    how='left',
    left_on='driver_id',
    right_on='earner_id',
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=rating_workload, x='duration_mins', y='rating', alpha=0.5, ax=ax)
ax.set_title('Driver Rating vs Average Workload')
ax.set_xlabel('Average Trip Duration (mins)')
ax.set_ylabel('Driver Rating')
fig.tight_layout()
plt.show()

## 8. Cancellation Rate vs Surge

In [None]:
# ---------------------------
# 8. Cancellation Rate vs Surge
# ---------------------------
# The surge sheet is optional; without it there is nothing to compare against.
if surge_by_hour is None:
    print("Surge data not available")
else:
    # Both tables are reduced to one mean value per city before joining.
    cancellation_city = (
        cancellation_rates
        .groupby('city_id', as_index=False)['cancellation_rate_pct']
        .mean()
    )
    surge_city = (
        surge_by_hour
        .groupby('city_id', as_index=False)['surge_multiplier']
        .mean()
    )

    # Left join: cities without surge data keep a NaN multiplier.
    cancellation_surge = pd.merge(cancellation_city, surge_city, on='city_id', how='left')

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(
        data=cancellation_surge,
        x='surge_multiplier',
        y='cancellation_rate_pct',
        s=100,
        ax=ax,
    )
    ax.set_title('Cancellation Rate vs Surge Multiplier (by City)')
    ax.set_xlabel('Average Surge Multiplier')
    ax.set_ylabel('Average Cancellation Rate (%)')
    fig.tight_layout()
    plt.show()

## 9. Fatigue Heatmap - Trips by Hour and Day

In [None]:
# ---------------------------
# 9. Fatigue Heatmap
# ---------------------------
# Derive both time keys here instead of relying on the 'hour' column that the
# hourly-earnings cell adds as a side effect; this cell then runs on its own.
# Re-assigning 'hour' is idempotent when that cell has already run.
rides_trips['hour'] = rides_trips['start_time'].dt.hour
rides_trips['day_of_week'] = rides_trips['start_time'].dt.day_name()

# Trip counts per (start hour, weekday). fill_value=0 makes slots with no
# trips show as 0 instead of NaN, which the heatmap would render as blanks.
heatmap_data = rides_trips.pivot_table(index='hour',
                                        columns='day_of_week',
                                        values='ride_id',
                                        aggfunc='count',
                                        fill_value=0)

# Reorder columns Monday..Sunday, keeping only the days present in the data
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_data = heatmap_data[[col for col in day_order if col in heatmap_data.columns]]

plt.figure(figsize=(12,8))
sns.heatmap(heatmap_data, cmap='YlOrRd', annot=True, fmt='g', cbar_kws={'label': 'Number of Trips'})
plt.title('Fatigue Heatmap: Trips per Hour by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Hour of Day')
plt.tight_layout()
plt.show()