# Taxi Demand Forecasting & Analysis

This notebook demonstrates demand forecasting and surge pricing optimization for the taxi service.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# Suppress every warning category so library chatter does not clutter cell output.
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour cycle on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

# pandas display limits: show every column, and up to 100 rows before truncating.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [None]:
# Read the three input tables (paths are relative to the notebook's directory)
trips = pd.read_csv('../data/trips.csv')
zones = pd.read_csv('../data/zones.csv')
weather = pd.read_csv('../data/weather_data.csv')

# Parse timestamp columns so the .dt accessor is available in later cells
for trip_ts_col in ('pickup_datetime', 'dropoff_datetime'):
    trips[trip_ts_col] = pd.to_datetime(trips[trip_ts_col])
weather['datetime'] = pd.to_datetime(weather['datetime'])

# Report how many rows each table contains
n_trips, n_zones, n_weather = len(trips), len(zones), len(weather)
print(f"Loaded {n_trips:,} trips")
print(f"Loaded {n_zones:,} zones")
print(f"Loaded {n_weather:,} weather records")

## 1. Hourly Demand Pattern Analysis

In [None]:
# Derive calendar features from the pickup timestamp (added to `trips` for later cells)
pickup_ts = trips['pickup_datetime']
trips['hour'] = pickup_ts.dt.hour
trips['day_of_week'] = pickup_ts.dt.dayofweek
trips['day_name'] = pickup_ts.dt.day_name()
trips['date'] = pickup_ts.dt.date
# dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
trips['is_weekend'] = trips['day_of_week'].isin([5, 6])

# Per-hour summary: trip volume plus mean fare, surge multiplier and wait time
hourly_demand = trips.groupby('hour').agg(
    trip_count=('trip_id', 'count'),
    total_fare=('total_fare', 'mean'),
    surge_multiplier=('surge_multiplier', 'mean'),
    wait_time=('wait_time', 'mean'),
)

# 2x2 dashboard of the hourly metrics
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_trips, ax_fare, ax_surge, ax_wait = axes.flat
hours = hourly_demand.index

ax_trips.bar(hours, hourly_demand['trip_count'], color='steelblue')
ax_fare.plot(hours, hourly_demand['total_fare'], marker='o', color='green', linewidth=2)
ax_surge.plot(hours, hourly_demand['surge_multiplier'], marker='s', color='red', linewidth=2)
# wait_time is divided by 60 for the minutes axis (presumably stored in seconds — TODO confirm)
ax_wait.bar(hours, hourly_demand['wait_time']/60, color='orange')

# All four panels share the x-axis label and grid styling; only y-label and title differ
hourly_panel_text = [
    (ax_trips, 'Number of Trips', 'Trip Demand by Hour'),
    (ax_fare, 'Average Fare ($)', 'Average Fare by Hour'),
    (ax_surge, 'Average Surge Multiplier', 'Surge Pricing by Hour'),
    (ax_wait, 'Average Wait Time (minutes)', 'Wait Time by Hour'),
]
for panel_ax, panel_ylabel, panel_title in hourly_panel_text:
    panel_ax.set_xlabel('Hour of Day')
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# The five busiest hours by trip count
peak_hours = hourly_demand.nlargest(5, 'trip_count')
print("\nPeak Demand Hours:")
print(peak_hours)

## 2. Day of Week Pattern Analysis

In [None]:
# Per-weekday summary. Grouping on (day_of_week, day_name) keeps the rows in
# calendar order (Monday first) while still carrying a readable label.
daily_demand = (
    trips.groupby(['day_of_week', 'day_name'])
    .agg({
        'trip_id': 'count',
        'total_fare': 'mean',
        'trip_distance': 'mean',
        'passenger_count': 'mean'
    })
    .rename(columns={'trip_id': 'trip_count'})
    .reset_index()  # flatten the group keys into columns for plotting
)

# 2x2 dashboard: three per-weekday panels plus a weekend/weekday comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, metric column, bar colour, y-axis label, title) for the per-weekday panels
day_panels = [
    (axes[0, 0], 'trip_count', 'steelblue', 'Number of Trips', 'Trip Volume by Day of Week'),
    (axes[0, 1], 'total_fare', 'green', 'Average Fare ($)', 'Average Fare by Day of Week'),
    (axes[1, 0], 'trip_distance', 'purple', 'Average Distance (miles)', 'Average Trip Distance by Day'),
]
for day_ax, metric, bar_color, y_label, title in day_panels:
    day_ax.bar(daily_demand['day_name'], daily_demand[metric], color=bar_color)
    day_ax.set_xlabel('Day of Week')
    day_ax.set_ylabel(y_label)
    day_ax.set_title(title)
    day_ax.tick_params(axis='x', rotation=45)

# Weekend vs Weekday comparison
weekend_comparison = trips.groupby('is_weekend').agg({
    'trip_id': 'count',
    'total_fare': 'mean',
    'surge_multiplier': 'mean'
}).rename(columns={'trip_id': 'trip_count'})

# Label rows from the boolean group key rather than by position: a positional
# assignment of ['Weekday', 'Weekend'] fails with a length mismatch when the
# data contains only weekdays or only weekends.
weekend_comparison.index = weekend_comparison.index.map({False: 'Weekday', True: 'Weekend'})

# Tie each colour to its label so a single-bar chart still uses the right one
week_part_colors = {'Weekday': '#1f77b4', 'Weekend': '#ff7f0e'}
axes[1, 1].bar(
    weekend_comparison.index,
    weekend_comparison['trip_count'],
    color=[week_part_colors[label] for label in weekend_comparison.index],
)
axes[1, 1].set_ylabel('Number of Trips')
axes[1, 1].set_title('Weekend vs Weekday Comparison')

plt.tight_layout()
plt.show()

print("\nWeekend vs Weekday Statistics:")
print(weekend_comparison)

## 3. Zone-based Demand Analysis

In [None]:
# Attach zone attributes to every trip via its pickup zone (left join keeps all trips)
trips_with_zones = pd.merge(
    trips,
    zones,
    how='left',
    left_on='pickup_zone_id',
    right_on='zone_id',
)

# Per-zone summary: trip volume plus mean fare, wait time and surge multiplier
zone_demand = trips_with_zones.groupby(['zone_name', 'area_type']).agg(
    trip_count=('trip_id', 'count'),
    total_fare=('total_fare', 'mean'),
    wait_time=('wait_time', 'mean'),
    surge_multiplier=('surge_multiplier', 'mean'),
)

# The 15 busiest zones, plus a flat copy whose columns are easy to plot
top_zones = zone_demand.nlargest(15, 'trip_count')
top_zones_reset = top_zones.reset_index()

# Trip totals per area type, for the pie chart
area_type_demand = trips_with_zones.groupby('area_type')['trip_id'].count()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
zone_volume_ax, area_share_ax = axes

# Left panel: horizontal bars, one per top zone, labelled with the zone name
bar_slots = range(len(top_zones_reset))
zone_volume_ax.barh(bar_slots, top_zones_reset['trip_count'], color='steelblue')
zone_volume_ax.set_yticks(bar_slots)
zone_volume_ax.set_yticklabels(top_zones_reset['zone_name'], fontsize=8)
zone_volume_ax.set_xlabel('Number of Trips')
zone_volume_ax.set_title('Top 15 Zones by Trip Volume')
zone_volume_ax.grid(True, alpha=0.3)

# Right panel: share of trips by area type
area_share_ax.pie(
    area_type_demand.values,
    labels=area_type_demand.index,
    autopct='%1.1f%%',
    startangle=90,
)
area_share_ax.set_title('Trip Distribution by Area Type')

plt.tight_layout()
plt.show()

print("\nTop 10 Zones by Demand:")
print(top_zones.head(10))

## 4. Weather Impact on Demand

In [None]:
# Truncate timestamps down to the start of their hour so trips and weather
# share a join key. Note this floors rather than rounds to the nearest hour.
# The lowercase 'h' alias is used because uppercase 'H' is deprecated in
# recent pandas releases.
trips['hour_rounded'] = trips['pickup_datetime'].dt.floor('h')
weather['hour_rounded'] = weather['datetime'].dt.floor('h')

# Keep exactly one weather observation per (zone, hour) — the earliest one.
# Without this, a zone with several readings in the same hour would duplicate
# every matching trip in the left join and inflate all counts and means below.
weather_hourly = (
    weather.sort_values('datetime')
    .drop_duplicates(subset=['zone_id', 'hour_rounded'], keep='first')
    .loc[:, ['zone_id', 'hour_rounded', 'weather_condition', 'temperature', 'precipitation']]
)

# Merge trips with weather data; `validate` makes pandas raise if the right
# side is ever not unique on the join keys.
trips_weather = trips.merge(
    weather_hourly,
    left_on=['pickup_zone_id', 'hour_rounded'],
    right_on=['zone_id', 'hour_rounded'],
    how='left',
    validate='many_to_one'
)

# Weather impact analysis. Trips with no matching weather row have a missing
# weather_condition and are excluded by groupby.
weather_impact = trips_weather.groupby('weather_condition').agg({
    'trip_id': 'count',
    'total_fare': 'mean',
    'surge_multiplier': 'mean',
    'wait_time': 'mean'
}).rename(columns={'trip_id': 'trip_count'})

# Plot weather impact
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
weather_impact_reset = weather_impact.reset_index()

# (axis, metric column, bar colour, y-axis label, title) for the three bar panels
weather_panels = [
    (axes[0, 0], 'trip_count', 'skyblue', 'Number of Trips', 'Trip Volume by Weather'),
    (axes[0, 1], 'total_fare', 'green', 'Average Fare ($)', 'Average Fare by Weather'),
    (axes[1, 0], 'surge_multiplier', 'red', 'Average Surge Multiplier', 'Surge Pricing by Weather'),
]
for weather_ax, metric, bar_color, y_label, title in weather_panels:
    weather_ax.bar(weather_impact_reset['weather_condition'], weather_impact_reset[metric], color=bar_color)
    weather_ax.set_xlabel('Weather Condition')
    weather_ax.set_ylabel(y_label)
    weather_ax.set_title(title)
    weather_ax.tick_params(axis='x', rotation=45)

# Temperature vs demand scatter plot: one point per distinct temperature value
temp_demand = trips_weather.groupby('temperature')['trip_id'].count().reset_index()
axes[1, 1].scatter(temp_demand['temperature'], temp_demand['trip_id'], alpha=0.5, color='orange')
axes[1, 1].set_xlabel('Temperature (°F)')
axes[1, 1].set_ylabel('Number of Trips')
axes[1, 1].set_title('Temperature vs Demand')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nWeather Impact on Demand:")
print(weather_impact)

## 5. Time Series Trend Analysis with Moving Averages

In [None]:
# Aggregate to one row per calendar day: ds = date, y = number of trips.
# groupby sorts by date, so rows are in chronological order.
daily_trips = trips.groupby('date')['trip_id'].count().reset_index()
daily_trips.columns = ['ds', 'y']
daily_trips['ds'] = pd.to_datetime(daily_trips['ds'])

# Moving-average smoothing. min_periods=1 lets the first rows use whatever
# history exists instead of producing NaN. Windows count rows, so they only
# equal 7/14 calendar days if no date is missing from the data.
daily_trips['ma_7'] = daily_trips['y'].rolling(window=7, min_periods=1).mean()
daily_trips['ma_14'] = daily_trips['y'].rolling(window=14, min_periods=1).mean()

# Plot time series
plt.figure(figsize=(15, 6))
plt.plot(daily_trips['ds'], daily_trips['y'], label='Actual', color='blue', alpha=0.7)
plt.plot(daily_trips['ds'], daily_trips['ma_7'], label='7-day MA', color='red', linewidth=2)
plt.plot(daily_trips['ds'], daily_trips['ma_14'], label='14-day MA', color='green', linewidth=2)
plt.xlabel('Date')
plt.ylabel('Number of Trips')
plt.title('Daily Trip Volume with Moving Averages')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Week-over-week trend: mean of the last 7 days vs. the 7 days before them.
# Defaults stay NaN when the comparison cannot be made, so the names are
# always defined.
last_week_avg = prev_week_avg = trend_pct = float('nan')

if len(daily_trips) >= 14:
    last_week_avg = daily_trips.tail(7)['y'].mean()
    prev_week_avg = daily_trips.tail(14).head(7)['y'].mean()
    # Guard against dividing by zero when the previous week had no trips
    if prev_week_avg > 0:
        trend_pct = ((last_week_avg - prev_week_avg) / prev_week_avg) * 100

    print(f"\nLast week average: {last_week_avg:.0f} trips/day")
    print(f"Previous week average: {prev_week_avg:.0f} trips/day")
    print(f"Week-over-week trend: {trend_pct:+.1f}%")
else:
    # With fewer than 14 days the two 7-day windows would overlap, making the
    # comparison meaningless.
    print(f"\nWeek-over-week trend needs at least 14 days of data (found {len(daily_trips)}).")

## 6. Surge Pricing Optimization Analysis

In [None]:
# Demand/supply ratio above which an hour/zone counts as a surge opportunity.
# Used for both the filter and the reference line on the chart.
SURGE_RATIO_THRESHOLD = 2

# Demand and supply per (hour, zone). Demand is the trip count; supply is the
# number of distinct drivers appearing on those trips.
# NOTE(review): drivers with no trips in the slot are not counted here, so
# supply is a proxy — confirm this is acceptable.
hourly_zone_demand = trips_with_zones.groupby(['hour', 'zone_name']).agg({
    'trip_id': 'count',
    'driver_id': 'nunique',
    'total_fare': 'mean',
    'surge_multiplier': 'mean',
    'wait_time': 'mean'
}).rename(columns={'trip_id': 'demand', 'driver_id': 'supply'})

# Demand-supply ratio; supply is floored at 1 to avoid division by zero
hourly_zone_demand['demand_supply_ratio'] = hourly_zone_demand['demand'] / hourly_zone_demand['supply'].clip(lower=1)

# Hour/zone combinations whose ratio exceeds the threshold, highest first
surge_opportunities = hourly_zone_demand[
    hourly_zone_demand['demand_supply_ratio'] > SURGE_RATIO_THRESHOLD
].sort_values('demand_supply_ratio', ascending=False)

# Top surge opportunities
top_surge = surge_opportunities.head(20).reset_index()

# Visualize surge opportunities
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Mean demand-supply ratio by hour, averaged across zones
pivot_data = hourly_zone_demand.reset_index().pivot_table(
    values='demand_supply_ratio',
    index='hour',
    aggfunc='mean'
)
# pivot_table returns a one-column DataFrame, so `.values` on it is 2-D (n, 1).
# Pass the column as a 1-D Series, which is what bar() expects for heights.
hourly_ratio = pivot_data['demand_supply_ratio']

axes[0].bar(hourly_ratio.index, hourly_ratio.values, color='coral')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Avg Demand/Supply Ratio')
axes[0].set_title('Demand-Supply Ratio by Hour')
axes[0].axhline(y=SURGE_RATIO_THRESHOLD, color='red', linestyle='--', label='Surge Threshold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Scatter plot of ratio vs surge multiplier on at most 1000 hour/zone rows.
# A fixed random_state makes the figure reproducible across runs.
sample_data = hourly_zone_demand.reset_index().sample(
    min(1000, len(hourly_zone_demand)), random_state=42
)
axes[1].scatter(sample_data['demand_supply_ratio'], sample_data['surge_multiplier'], alpha=0.5)
axes[1].set_xlabel('Demand/Supply Ratio')
axes[1].set_ylabel('Current Surge Multiplier')
axes[1].set_title('Surge Pricing vs Demand-Supply Ratio')
axes[1].grid(True, alpha=0.3)

# Linear trend line. polyfit cannot handle NaN and needs at least two distinct
# x values, so drop incomplete rows and skip the line if too little remains.
fit_data = sample_data[['demand_supply_ratio', 'surge_multiplier']].dropna()
if fit_data['demand_supply_ratio'].nunique() >= 2:
    z = np.polyfit(fit_data['demand_supply_ratio'], fit_data['surge_multiplier'], 1)
    p = np.poly1d(z)
    ratio_sorted = fit_data['demand_supply_ratio'].sort_values()
    axes[1].plot(ratio_sorted, p(ratio_sorted), "r--", alpha=0.8, label='Trend')
    axes[1].legend()

plt.tight_layout()
plt.show()

print("\nTop Surge Pricing Opportunities:")
print(top_surge[['hour', 'zone_name', 'demand', 'supply', 'demand_supply_ratio', 'surge_multiplier']].head(10))

## 7. Demand Forecast Summary & Recommendations

In [None]:
# Surge multiplier assumed for the revenue scenario in recommendation 4
TARGET_SURGE_MULTIPLIER = 1.5


def _format_multiplier(value, precision=2):
    """Format a surge multiplier such as 1.25 as '1.25x', or 'n/a' if it is missing."""
    return f"{value:.{precision}f}x" if pd.notna(value) else "n/a"


# Summary statistics
print("=" * 60)
print("DEMAND FORECASTING SUMMARY")
print("=" * 60)

# Peak hours: the three hours with the most trips
peak_hours_list = hourly_demand.nlargest(3, 'trip_count').index.tolist()
print(f"\n📊 Peak Demand Hours: {peak_hours_list}")
print(f"   - {hourly_demand.loc[peak_hours_list[0], 'trip_count']:.0f} trips at {peak_hours_list[0]}:00")

# Busiest day
busiest_day = daily_demand.nlargest(1, 'trip_count')['day_name'].values[0]
print(f"\n📅 Busiest Day: {busiest_day}")

# Top zones: zone_demand is indexed by (zone_name, area_type); take the zone name
top_zone = zone_demand.nlargest(1, 'trip_count').index[0][0]
print(f"\n📍 Highest Demand Zone: {top_zone}")

# Weather impact. A given dataset may lack some conditions, so look them up
# defensively instead of raising KeyError; missing values are shown as 'n/a'.
surge_by_weather = weather_impact['surge_multiplier']
bad_weather_surge = surge_by_weather[surge_by_weather.index.isin(['Rain', 'Heavy Rain', 'Snow'])].mean()
good_weather_surge = surge_by_weather.get('Clear', float('nan'))
print(f"\n🌧️ Weather Impact:")
print(f"   - Bad weather surge: {_format_multiplier(bad_weather_surge)}")
print(f"   - Clear weather surge: {_format_multiplier(good_weather_surge)}")

# Recommendations
print("\n" + "=" * 60)
print("RECOMMENDATIONS")
print("=" * 60)

print("\n1. SURGE PRICING:")
print(f"   - Implement dynamic surge during hours: {peak_hours_list}")
if pd.notna(bad_weather_surge):
    print(f"   - Increase surge multiplier during bad weather (target: {bad_weather_surge:.1f}x)")
else:
    print("   - No bad-weather trips in the data; cannot derive a bad-weather surge target")

print("\n2. DRIVER ALLOCATION:")
print(f"   - Position more drivers in {top_zone} area")
print(f"   - Increase driver incentives on {busiest_day}")

print("\n3. CAPACITY PLANNING:")
# Totals over the whole dataset, not per-day averages
weekend_demand = trips[trips['is_weekend']]['trip_id'].count()
weekday_demand = trips[~trips['is_weekend']]['trip_id'].count()
print(f"   - Weekend capacity: {weekend_demand:,} trips")
print(f"   - Weekday capacity: {weekday_demand:,} trips")

print("\n4. REVENUE OPTIMIZATION:")
# Compare revenue in the surge-opportunity slots today against a scenario where
# each slot is priced at TARGET_SURGE_MULTIPLIER. Multiplying every fare by a
# flat 1.5 would always report exactly +50% regardless of the data, so the
# uplift is measured against the surge already applied in each slot.
# NOTE(review): assumes total_fare already includes the surge multiplier and
# scales linearly with it, and that demand is unchanged by price — confirm.
if surge_opportunities.empty:
    potential_revenue = current_revenue = 0.0
    revenue_increase = 0.0
    print("   - No surge opportunities identified; no revenue uplift to estimate")
else:
    # Treat missing or sub-1.0 multipliers as "no surge applied"
    current_surge = surge_opportunities['surge_multiplier'].fillna(1.0).clip(lower=1.0)
    # Never model a price cut: slots already at or above the target stay as they are
    uplift_factor = (TARGET_SURGE_MULTIPLIER / current_surge).clip(lower=1.0)

    current_revenue = (surge_opportunities['demand'] * surge_opportunities['total_fare']).sum()
    potential_revenue = (surge_opportunities['demand'] * surge_opportunities['total_fare'] * uplift_factor).sum()

    if current_revenue > 0:
        revenue_increase = ((potential_revenue - current_revenue) / current_revenue) * 100
        print(f"   - Potential revenue increase with optimized surge: {revenue_increase:.1f}%")
    else:
        revenue_increase = 0.0
        print("   - Current revenue in surge-opportunity slots is zero; uplift cannot be estimated")