In [1]:
import random
import pandas as pd

In [2]:
# Trip endpoints: well-known localities spread across Delhi.
# Kept as a list (not a set) so the order is stable between runs.
locations = [
    "Connaught Place",
    "AIIMS",
    "Karol Bagh",
    "Dwarka",
    "Saket",
    "Lajpat Nagar",
    "Rajouri Garden",
    "Chandni Chowk",
    "Rohini",
    "Pitampura",
    "Janakpuri",
    "Laxmi Nagar",
    "Sarojini Nagar",
    "Nehru Place",
    "Vasant Kunj",
    "Mayur Vihar",
    "Shahdara",
    "Tughlakabad",
    "Greater Kailash",
    "Okhla",
    "Uttam Nagar",
    "Kashmere Gate",
    "Moti Nagar",
    "Tilak Nagar",
    "Kalkaji",
]

In [5]:

# Generate random traffic & transport data for Delhi
def generate_random_data(num_samples=5000, seed=None, location_names=None):
    """Build a synthetic table of trips with per-mode time, cost and pollution figures.

    Every row is one randomly drawn trip between two distinct locations. All
    numbers are simulated from fixed ranges; nothing is measured data.

    Parameters
    ----------
    num_samples : int, default 5000
        Number of rows to generate. ``0`` yields an empty frame that still
        carries all columns.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used, so the result is
        reproducible and the global ``random`` state is left untouched. When
        ``None`` the module-level ``random`` generator is used (the original
        behaviour), so ``random.seed(...)`` set by the caller still applies.
    location_names : iterable of str or None, default None
        Pool of place names to draw start/destination from. Defaults to the
        notebook-level ``locations`` list.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows and 32 columns: ``Start``, ``Destination``,
        ``Distance_KM``, ``Num_Signals``, ``Total_Signal_Wait``,
        ``Queue_Length``, then ``Time_*``, ``Expense_*``, ``Pollution_*``,
        ``Rush_*`` and ``Pollution_Exposure_*`` for Car, Bike, Cycle, Bus and
        Metro, and finally ``Physical_Activity``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` if the location pool has fewer than two
        entries and ``num_samples`` is positive.
    """
    rng = random if seed is None else random.Random(seed)
    pool = locations if location_names is None else list(location_names)

    columns = [
        "Start", "Destination", "Distance_KM", "Num_Signals", "Total_Signal_Wait", "Queue_Length",
        "Time_Car", "Time_Bike", "Time_Cycle", "Time_Bus", "Time_Metro",
        "Expense_Car", "Expense_Bike", "Expense_Cycle", "Expense_Bus", "Expense_Metro",
        "Pollution_Car", "Pollution_Bike", "Pollution_Cycle", "Pollution_Bus", "Pollution_Metro",
        "Rush_Car", "Rush_Bike", "Rush_Cycle", "Rush_Bus", "Rush_Metro",
        "Pollution_Exposure_Car", "Pollution_Exposure_Bike", "Pollution_Exposure_Cycle",
        "Pollution_Exposure_Bus", "Pollution_Exposure_Metro", "Physical_Activity",
    ]

    # NOTE: the order of the rng calls below is kept stable on purpose so that a
    # given seed always maps to the same dataset.
    data = []
    for _ in range(num_samples):
        # sample() draws without replacement, which guarantees start != destination
        start_loc, dest_loc = rng.sample(pool, 2)

        # Random distance (5 km to 40 km); not derived from the chosen locations
        distance = rng.uniform(5, 40)

        # Traffic signals on the route (2 to 10)
        num_signals = rng.randint(2, 10)

        # Total red light delay: one draw of 10..180 per signal, summed
        total_signal_wait = sum(rng.randint(10, 180) for _ in range(num_signals))

        # Queue length (0 to 100); feeds car/bike/cycle times but not the bus time
        queue_length = rng.randint(0, 100)

        # Mode-based travel times (in minutes)
        base_speed = rng.uniform(20, 60)  # Average speed range (km/h)
        base_time = (distance / base_speed) * 60  # Convert hours to minutes
        # Floor divisions: +1 per 10 queue units, +1 per 60 units of signal wait
        signal_delay = total_signal_wait // 60
        time_car = base_time + (queue_length // 10) + signal_delay
        time_bike = time_car * 0.8
        time_cycle = time_car * 1.5
        time_bus = base_time * 1.2 + signal_delay  # Buses skip queue delays
        time_metro = rng.randint(20, 45)  # Assumed range, independent of distance

        # Travel expense (in INR); car and bike scale with distance, bus/metro are flat draws
        expense_car = distance * rng.uniform(5, 10)
        expense_bike = distance * rng.uniform(2, 4)
        expense_cycle = 0
        expense_bus = rng.randint(10, 40)
        expense_metro = rng.randint(20, 50)

        # Environmental impact: a per-km emission factor multiplied by distance,
        # so the stored value is a total for the trip, not a per-km rate.
        pollution_car = distance * rng.uniform(10, 15)
        pollution_bike = distance * rng.uniform(5, 10)
        pollution_cycle = 0
        pollution_bus = distance * rng.uniform(3, 8)
        pollution_metro = distance * rng.uniform(1, 3)

        # Rush factor in [0, 1] (higher queue length = more congestion)
        rush_car = queue_length / 100
        rush_bike = rush_car * 0.8
        rush_cycle = 0  # Modelled as unaffected by congestion
        rush_bus = 0  # Modelled as clear bus lanes
        rush_metro = rng.uniform(0.2, 0.8)  # Metro rush varies independently

        # Pollution exposure: bike and cycle are scaled up from the car draw
        pollution_exposure_car = rng.randint(150, 400)
        pollution_exposure_bike = pollution_exposure_car * 1.2
        pollution_exposure_cycle = pollution_exposure_car * 1.5
        pollution_exposure_bus = rng.randint(100, 300)
        pollution_exposure_metro = rng.randint(50, 150)

        # Physical activity flag: 1 when the cycling time exceeds 30 minutes
        physical_activity = 1 if time_cycle > 30 else 0

        # Values must stay in the same order as `columns`
        data.append([
            start_loc, dest_loc, distance, num_signals, total_signal_wait, queue_length,
            time_car, time_bike, time_cycle, time_bus, time_metro,
            expense_car, expense_bike, expense_cycle, expense_bus, expense_metro,
            pollution_car, pollution_bike, pollution_cycle, pollution_bus, pollution_metro,
            rush_car, rush_bike, rush_cycle, rush_bus, rush_metro,
            pollution_exposure_car, pollution_exposure_bike, pollution_exposure_cycle,
            pollution_exposure_bus, pollution_exposure_metro, physical_activity,
        ])

    return pd.DataFrame(data, columns=columns)

In [6]:
# Generate dataset
# Seed the global generator first so that Restart & Run All reproduces the same
# rows (and therefore the same CSV) on every run; generate_random_data draws
# from the module-level `random` generator when called like this.
SEED = 42
random.seed(SEED)
random_data = generate_random_data(5000)
print(random_data.head())

# Save to CSV (written to the current working directory, without the index column)
random_data.to_csv("delhi_travel_data.csv", index=False)

             Start      Destination  Distance_KM  Num_Signals  \
0      Tughlakabad     Lajpat Nagar    29.266322            6   
1  Connaught Place    Chandni Chowk    30.421922           10   
2           Dwarka   Sarojini Nagar     5.855791            4   
3  Connaught Place  Greater Kailash    17.108154            5   
4      Laxmi Nagar      Tilak Nagar    20.141383            4   

   Total_Signal_Wait  Queue_Length    Time_Car  Time_Bike  Time_Cycle  \
0                562            23   40.333380  32.266704   60.500070   
1               1009           100  104.809542  83.847633  157.214313   
2                358            83   28.559251  22.847401   42.838876   
3                616            56   44.444888  35.555911   66.667332   
4                254            82   43.664369  34.931495   65.496553   

     Time_Bus  ...  Rush_Bike  Rush_Cycle  Rush_Bus  Rush_Metro  \
0   44.200056  ...      0.184           0         0    0.371367   
1  110.571450  ...      0.800       