# In [5]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize
fake = Faker()
# Seed every random source so repeated runs produce the same dataset.
# Faker draws from its own shared generator, which random.seed() and
# np.random.seed() do not touch, so it needs an explicit Faker.seed() call.
Faker.seed(100)
random.seed(100)
np.random.seed(100)

# Constants
num_rows = 50_000  # how many booking rows to generate

# Date window the booking dates are drawn from.
start_date = datetime(year=2025, month=7, day=1)
end_date = datetime(year=2025, month=7, day=31)

# Match days; these four dates are the Sundays of July 2025.
match_days = [
    datetime(2025, 7, 6),
    datetime(2025, 7, 13),
    datetime(2025, 7, 20),
    datetime(2025, 7, 27),
]

# date.weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
weekend_days = [5, 6]

# Locations (50)
# Order matters: with a fixed seed, random.choice picks by position.
locations = [
    "Koramangala", "Indiranagar", "Whitefield", "Jayanagar",
    "HSR Layout", "BTM Layout", "Marathahalli", "Electronic City",
    "MG Road", "Rajajinagar", "Malleshwaram", "Yeshwanthpur",
    "Hebbal", "Bannerghatta", "Banashankari", "Basavanagudi",
    "Ulsoor", "Domlur", "Frazer Town", "RT Nagar",
    "Sanjay Nagar", "Vijayanagar", "KR Puram", "Bellandur",
    "Sarjapur", "Kengeri", "Nagarbhavi", "Peenya",
    "Shivajinagar", "Majestic", "Lalbagh", "JP Nagar",
    "Richmond Town", "Cunningham Road", "Cooke Town", "Hoodi",
    "Kadubeesanahalli", "Bommanahalli", "CV Raman Nagar", "HAL",
    "Hennur", "Nagawara", "Ramamurthy Nagar", "Varthur",
    "Brookefield", "Kammanahalli", "Sadashivanagar", "Thippasandra",
    "Wilson Garden", "Seshadripuram",
]

# Vehicle types
vehicle_types = [
    "Auto",
    "Prime Plus",
    "Prime Sedan",
    "Mini",
    "eBike",
    "Prime SUV",
]

# Cancellation reasons
# These strings are written verbatim into the generated dataset.
cancel_reasons_customer = [
    "Driver is not moving towards pickup location",
    "Driver asked to cancel",
    # Was "... Jnly for 4 wheelers": a garbled "Only" that leaked into the data.
    "AC is Not working Only for 4 wheelers",
    "Change of plans",
    "Wrong Address",
    # NOTE(review): this reads like a section heading rather than a customer
    # reason; kept so the number of choices (and seeded picks) is unchanged.
    "Canceled Rides by Driver"
]

cancel_reasons_driver = [
    "Personal & Car related issue",
    "Customer related issue",
    "Customer was coughing/sick",
    "More than permitted people in there",
    "Customer Demand",
    "Vehicle Breakdown",
    "Other Issue"
]

# Booking status weights: [Success, Cancelled by Customer, Cancelled by Driver, Incomplete]
# Relative weights, in the same order as the status list given to random.choices.
status_weights = [0.62, 0.07, 0.18, 0.13]

# Booking generator
def _as_date(value):
    """Return the calendar date of *value*, dropping any time-of-day part."""
    return value.date() if isinstance(value, datetime) else value


def generate_booking(i):
    """Build one synthetic booking row.

    The order of the random draws is kept stable so that seeded runs keep
    producing the same rows.

    Args:
        i: Row index; added to 1000000 to form the booking ID.

    Returns:
        A 16-item list: date ("YYYY-MM-DD"), time, booking ID, status,
        customer ID, vehicle type, pickup, drop, VTAT, CTAT, customer
        cancel reason, driver cancel reason, booking value, ride distance,
        driver rating, customer rating. Cancelled bookings carry ``None``
        for every numeric field; reason fields that do not apply are empty
        strings.
    """
    date = fake.date_between_dates(start_date, end_date)
    booking_day = _as_date(date)
    is_weekend = date.weekday() in weekend_days
    # match_days holds datetime objects while Faker returns a plain date, and
    # a date never compares equal to a datetime, so `date in match_days` was
    # always False. Compare calendar dates instead.
    is_match_day = any(_as_date(day) == booking_day for day in match_days)
    surge_day = is_weekend or is_match_day

    time = fake.time()
    booking_id = f"CNR{1000000 + i}"
    customer_id = f"CUST{random.randint(10000, 99999)}"
    vehicle = random.choice(vehicle_types)
    pickup = random.choice(locations)
    # The drop-off must differ from the pickup.
    drop = random.choice([loc for loc in locations if loc != pickup])

    status = random.choices(
        ["Success", "Cancelled by Customer", "Cancelled by Driver", "Incomplete"],
        weights=status_weights, k=1
    )[0]

    # Defaults describe a cancelled ride: no metrics and no reasons yet.
    vtat = ctat = distance = driver_rating = customer_rating = value = None
    cancel_reason_cust = ""
    cancel_reason_driver = ""

    if status == "Success":
        vtat = round(random.uniform(5, 15), 2)
        ctat = round(random.uniform(15, 40), 2)
        # Pick a fare band first (heavily skewed to cheap rides), then a
        # value inside that band.
        fare_range = random.choices(["<500", "500-1000", ">1000"], weights=[70, 28, 2])[0]
        if fare_range == "<500":
            value = random.randint(100, 499)
        elif fare_range == "500-1000":
            value = random.randint(500, 999)
        else:
            value = random.randint(1000, 1500)
        if surge_day:
            # Weekend / match-day surcharge.
            value += random.randint(50, 150)
        distance = round(random.uniform(2, 25), 2)
        driver_rating = round(random.uniform(3.0, 5.0), 1)
        customer_rating = round(random.uniform(3.0, 5.0), 1)
    elif status == "Cancelled by Customer":
        cancel_reason_cust = random.choice(cancel_reasons_customer)
    elif status == "Cancelled by Driver":
        cancel_reason_driver = random.choice(cancel_reasons_driver)
    else:  # Incomplete
        vtat = round(random.uniform(5, 15), 2)
        ctat = round(random.uniform(15, 40), 2)
        value = random.randint(150, 1200)
        if surge_day:
            # Smaller surcharge than for completed rides.
            value += random.randint(30, 100)
        distance = round(random.uniform(2, 20), 2)
        driver_rating = round(random.uniform(2.5, 5.0), 1)
        customer_rating = round(random.uniform(2.5, 5.0), 1)

    return [
        date.strftime("%Y-%m-%d"), time, booking_id, status, customer_id,
        vehicle, pickup, drop, vtat, ctat, cancel_reason_cust,
        cancel_reason_driver, value, distance, driver_rating, customer_rating
    ]

# Column names
# One header per field, in the order generate_booking() returns them.
columns = [
    "Date",
    "Time",
    "BookingID",
    "Booking Status",
    "Customer ID",
    "Vehicle Type",
    "Pickup Location",
    "Drop Location",
    "Avg VTAT",
    "Avg CTAT",
    "Reason for canceling by Customer",
    "Reason for canceling by Driver",
    "Booking Value",
    "Ride Distance",
    "Driver Ratings",
    "Customer Rating",
]

# Generate dataset
# One generate_booking() call per row index, in ascending order.
data = list(map(generate_booking, range(num_rows)))
df = pd.DataFrame(data=data, columns=columns)

# Save to Excel
output_file = "Bengaluru_Booking_Data_50K.xlsx"
df.to_excel(output_file, index=False)  # index=False: no row-number column
print(f"✅ File saved: {output_file}")


# Output: ✅ File saved: Bengaluru_Booking_Data_50K.xlsx
