In [1]:
pip install fastavro faker mimesis

Collecting fastavro
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Collecting mimesis
  Downloading mimesis-18.0.0-py3-none-any.whl.metadata (5.7 kB)
Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mimesis-18.0.0-py3-none-any.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mimesis, fastavro, faker
Successfully installed faker-37.0.0 fastavro-1.10.0 mimesis-18.0.0


In [15]:
#DATA FEED 1: ALERTS

import json
import random
import uuid
from faker import Faker
from datetime import datetime
import fastavro
import os

# Shared Faker instance; generate_traffic_surge_alert() uses it to draw a random
# timestamp within the current year for every alert
fake = Faker()

# Zones of Madrid in which a simulated traffic event can be located
madrid_locations = [
    "Puerta del Sol, Madrid",
    "Gran Via, Madrid",
    "Plaza Mayor, Madrid",
    "Atocha, Madrid",
    "Retiro Park, Madrid",
    "Chamartín, Madrid",
    "Lavapiés, Madrid",
    "Malasaña, Madrid",
    "Moncloa, Madrid",
    "Barrio de Salamanca, Madrid",
    "Chueca, Madrid",
]

# Sampling weight of each traffic event type (weights are used by random.choices)
traffic_alert_types = {
    "accident": 0.15,
    "closed road": 0.1,
    "roadwork": 0.2,
    "weather-related": 0.05,
    "road congestion": 0.5,
}

# Sampling weight of each traffic severity level
traffic_levels = dict(low=0.4, moderate=0.3, high=0.2, severe=0.1)

# Surge multiplier attached to an alert of the given severity level
# (1.0 means no increase, 2.0 doubles the value it is applied to)
traffic_severity_multipliers = dict(low=1.0, moderate=1.2, high=1.5, severe=2.0)

# Accumulator filled by generate_traffic_surge_alert(): zone name -> list of alerts
traffic_alerts_data = dict()

# Function to generate a single traffic surge alert
def generate_traffic_surge_alert():
    """Create one random traffic surge alert and file it under its zone.

    Returns:
        dict: the alert, with keys alert_id, timestamp, zone_id, traffic_level,
        event_type and surge_multiplier.

    Side effect:
        The alert is appended to traffic_alerts_data[zone_id].
    """
    new_alert_id = str(uuid.uuid4())

    # Zone is uniform over the known locations; level and event type are weighted
    zone = random.choice(madrid_locations)
    level = random.choices(list(traffic_levels), weights=list(traffic_levels.values()))[0]
    kind = random.choices(list(traffic_alert_types), weights=list(traffic_alert_types.values()))[0]

    # The multiplier is fully determined by the severity level
    multiplier = traffic_severity_multipliers[level]

    # Random moment within the current year, stored as an ISO-8601 string
    created_at = fake.date_time_this_year().isoformat()

    alert = dict(
        alert_id=new_alert_id,
        timestamp=created_at,
        zone_id=zone,
        traffic_level=level,
        event_type=kind,
        surge_multiplier=multiplier,
    )

    # Group alerts by zone in the module-level accumulator
    traffic_alerts_data.setdefault(zone, []).append(alert)

    return alert

# Function to generate and save multiple traffic surge alerts to both JSON and AVRO formats
def save_traffic_surge_alerts(num_alerts=100):
    """Generate num_alerts traffic surge alerts and write them to disk.

    Two files are written to the current working directory:
      * traffic_surge_alerts.json - the alerts grouped by zone
        ({zone_id: [alert, ...]}).
      * traffic_surge_alerts.avro - the same alerts as a flat list of records.

    Args:
        num_alerts: number of alerts to generate in this call.
    """
    # Generate a list of traffic surge alerts
    alerts = [generate_traffic_surge_alert() for _ in range(num_alerts)]

    # Group exactly this batch by zone for the JSON file. Dumping the module-level
    # traffic_alerts_data accumulator instead would also include alerts from earlier
    # calls, so the JSON and AVRO files could describe different sets of alerts.
    alerts_by_zone = {}
    for alert in alerts:
        alerts_by_zone.setdefault(alert["zone_id"], []).append(alert)

    # Specify the paths where the files will be saved (current working directory)
    json_file_path = os.path.join(os.getcwd(), "traffic_surge_alerts.json")  # Save alerts in JSON format
    avro_file_path = os.path.join(os.getcwd(), "traffic_surge_alerts.avro")  # Save alerts in AVRO format

    # Save the alerts data to a JSON file
    with open(json_file_path, "w") as json_file:
        json.dump(alerts_by_zone, json_file, indent=4)

    # Define the schema for the AVRO file format
    traffic_surge_schema = {
        "type": "record",
        "name": "TrafficSurgeAlert",
        "fields": [
            {"name": "alert_id", "type": "string"},
            {"name": "timestamp", "type": "string"},
            {"name": "zone_id", "type": "string"},
            {"name": "traffic_level", "type": "string"},
            {"name": "event_type", "type": "string"},
            {"name": "surge_multiplier", "type": "float"}
        ]
    }

    # Save the alerts data to an AVRO file
    with open(avro_file_path, "wb") as avro_file:
        fastavro.writer(avro_file, traffic_surge_schema, alerts)

    # Output the results to the console
    print(f"{num_alerts} traffic surge alerts generated and saved!")
    print(f"Files saved at: {json_file_path} and {avro_file_path}")

# Generate 100 alerts and write traffic_surge_alerts.json / traffic_surge_alerts.avro
# into the current working directory
save_traffic_surge_alerts(100)



100 traffic surge alerts generated and saved!
Files saved at: /content/traffic_surge_alerts.json and /content/traffic_surge_alerts.avro


In [16]:
# DATA FEED 2: RIDE EVENTS (trip requests)

import json
import random
import uuid
import time
from datetime import datetime, timedelta
from faker import Faker
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from geopy.exc import GeocoderTimedOut
import numpy as np
import fastavro
import os

# Faker instance for generating fake data, and the Nominatim geocoder that
# get_coordinates() queries to turn a place name into (latitude, longitude)
fake = Faker()
geolocator = Nominatim(user_agent="ride_hailing_simulator", timeout=3)

# Memo of place name -> (latitude, longitude); filled by get_coordinates() so a
# place that was geocoded successfully is not requested from the service again
location_cache = dict()

# Madrid places used as pick-up points, drop-off points and driver positions
madrid_locations = [
    "Puerta del Sol, Madrid",
    "Gran Via, Madrid",
    "Plaza Mayor, Madrid",
    "Atocha, Madrid",
    "Retiro Park, Madrid",
    "Chamartín, Madrid",
    "Lavapiés, Madrid",
    "Malasaña, Madrid",
    "Moncloa, Madrid",
    "Barrio de Salamanca, Madrid",
    "Chueca, Madrid",
]

# Ride products on offer, and the alert labels that can be attached to ride events
uber_types = ["uber_share", "regular_uber"]
traffic_alerts = [
    "accident",
    "closed road",
    "roadwork",
    "weather-related",
    "road congestion",
]

# Pricing and simulation parameters
BASE_FEE = 3.5          # starting fare before any multiplier is applied
AVERAGE_SPEED_KMH = 50  # assumed driving speed when converting distance to time
LAMBDA = 0.1            # arrival rate; inter-arrival times are drawn with mean 1 / LAMBDA
MAX_RETRIES = 3         # attempts at finding a driver before the user gives up

# Driver pool: ids run from 1 to TOTAL_DRIVERS, everybody starts free at a random place
TOTAL_DRIVERS = 200
free_drivers = {
    driver_id: random.choice(madrid_locations)
    for driver_id in range(1, TOTAL_DRIVERS + 1)
}
busy_drivers = dict()
driver_ride_count = dict.fromkeys(range(1, TOTAL_DRIVERS + 1), 0)

# Load traffic alerts from the saved JSON file
def load_traffic_alerts():
    """Read traffic_surge_alerts.json from the working directory and return its parsed content."""
    alerts_path = "traffic_surge_alerts.json"
    with open(alerts_path, "r") as handle:
        parsed = json.load(handle)
    return parsed

# Alerts read once when this cell runs and kept at module level;
# compute_ride_duration() and calculate_dynamic_price() look zones up in this
# mapping and read each alert's "surge_multiplier"
traffic_alerts_data = load_traffic_alerts()

# Function to get coordinates (latitude, longitude) of a given location name
def get_coordinates(location_name, max_attempts=3, retry_delay=2):
    """Return (latitude, longitude) for a place name, consulting location_cache first.

    Args:
        location_name: place name handed to the geocoder.
        max_attempts: number of geocoding attempts made when requests time out.
        retry_delay: seconds to sleep between two timed-out attempts.

    Returns:
        tuple: (latitude, longitude) on success, or (None, None) when the place
        is not found or every attempt timed out. Only successful lookups are
        stored in location_cache.
    """
    # Check if the coordinates are already cached to avoid redundant API calls
    if location_name in location_cache:
        return location_cache[location_name]

    for attempt in range(1, max_attempts + 1):
        try:
            location = geolocator.geocode(location_name, timeout=3)
        except GeocoderTimedOut:
            # Back off before the next attempt; sleeping after the last one
            # would only delay the failure result
            if attempt < max_attempts:
                time.sleep(retry_delay)
            continue

        if not location:
            return (None, None)  # The geocoder answered but found nothing

        coords = (location.latitude, location.longitude)
        location_cache[location_name] = coords  # Cache the coordinates for future use
        return coords

    return (None, None)  # Every attempt timed out

# Function to determine the day of the week from a timestamp
def get_day_of_week(timestamp):
    """Return the weekday name (strftime "%A", e.g. 'Monday') of the given timestamp.

    Args:
        timestamp: a datetime (or date) object, or an ISO-8601 string that
            datetime.fromisoformat can parse.

    Returns:
        str: the weekday name of that timestamp.
    """
    # Derive the weekday from the argument itself; drawing a random date here
    # would make the weekday unrelated to the event timestamp it is stored with
    if isinstance(timestamp, str):
        timestamp = datetime.fromisoformat(timestamp)
    return timestamp.strftime("%A")

# Function to compute the ride duration based on start and end coordinates and traffic conditions
def compute_ride_duration(start_coords, end_coords, start_location, end_location):
    """Estimate the ride duration in minutes, stretched by traffic alerts.

    Args:
        start_coords: (latitude, longitude) of the pick-up point.
        end_coords: (latitude, longitude) of the drop-off point.
        start_location: zone name of the pick-up point, looked up in traffic_alerts_data.
        end_location: zone name of the drop-off point, looked up in traffic_alerts_data.

    Returns:
        float | None: duration in minutes rounded to two decimals, or None when
        either coordinate pair is missing.
    """
    # get_coordinates() reports a failed lookup as (None, None) rather than None,
    # so both shapes have to be treated as "no coordinates"
    for coords in (start_coords, end_coords):
        if coords is None or None in coords:
            return None

    # Travel time at the average speed, never shorter than 5 minutes
    distance_km = geodesic(start_coords, end_coords).kilometers
    duration = (distance_km / AVERAGE_SPEED_KMH) * 60  # Convert from hours to minutes
    base_duration = max(5, round(duration, 2))

    # The largest surge multiplier among the alerts of both zones wins;
    # 1.0 (no slowdown) applies when neither zone has an alert
    traffic_multiplier = 1.0
    for location in (start_location, end_location):
        for alert in traffic_alerts_data.get(location, []):
            traffic_multiplier = max(traffic_multiplier, alert["surge_multiplier"])

    return round(base_duration * traffic_multiplier, 2)

# Function to determine a time surcharge based on the time of day and day of the week
def get_time_surcharge(timestamp, day_of_week):
    """Return the fractional price surcharge for a moment of the week.

    Args:
        timestamp: ISO-8601 string; only its hour and minute are used.
        day_of_week: weekday name such as "Monday"; it is not derived from timestamp.

    Returns:
        float: 0.0 when no window matches, otherwise the surcharge of the
        matching window (e.g. 0.06 for a 6% increase).
    """
    moment = datetime.fromisoformat(timestamp)
    hour, minute = moment.hour, moment.minute
    in_first_half_hour = minute < 30

    # NOTE(review): the minute condition applies to every hour of a window, so
    # the Friday window covers 13:00-13:29 and 14:00-14:29 but not 13:30-13:59;
    # confirm this is the intended schedule.
    if day_of_week in ("Monday", "Tuesday", "Wednesday", "Thursday"):
        if hour in (16, 19) and in_first_half_hour:
            return 0.06
        if hour in (8, 9):
            return 0.02
        return 0.0

    if day_of_week == "Friday":
        if hour in (13, 14) and in_first_half_hour:
            return 0.06
        if hour in (20, 21):
            return 0.04
        return 0.0

    if day_of_week in ("Saturday", "Sunday"):
        if hour in (4, 5):
            return 0.07
        if hour == 23 and minute < 45:
            return 0.05
        return 0.0

    # Unknown weekday names never carry a surcharge
    return 0.0

# Function to calculate the dynamic price based on various factors like traffic, distance, driver availability, etc.
def calculate_dynamic_price(start_coords, end_coords, num_available_drivers, uber_type, traffic_severity, timestamp, day_of_week, start_location, end_location):
    """Compute the ride price from distance, traffic, driver supply, time and ride type.

    The price starts at BASE_FEE and is multiplied, in turn, by:
      * 1 + 3% per kilometre of geodesic distance,
      * the largest alert surge multiplier of the start and end zones,
      * 1 + a surcharge chosen from the number of available drivers,
      * 1 + the time surcharge from get_time_surcharge(),
      * 0.95 for "uber_share" rides.

    Args:
        start_coords: (latitude, longitude) of the pick-up point.
        end_coords: (latitude, longitude) of the drop-off point.
        num_available_drivers: number of currently free drivers.
        uber_type: ride product; "uber_share" receives the discount.
        traffic_severity: accepted for interface compatibility; not used in the computation.
        timestamp: ISO-8601 string forwarded to get_time_surcharge().
        day_of_week: weekday name forwarded to get_time_surcharge().
        start_location: zone name of the pick-up point.
        end_location: zone name of the drop-off point.

    Returns:
        float: the price rounded to two decimals.
    """
    distance_km = geodesic(start_coords, end_coords).kilometers
    price = BASE_FEE  # Start with the base fee

    # Apply distance-based pricing (3% increase per km)
    price *= (1 + 0.03 * distance_km)

    # Traffic influence: take the largest surge multiplier among the alerts of
    # both zones (the same rule compute_ride_duration uses) and actually apply
    # it - previously the multiplier was computed and then discarded
    traffic_multiplier = 1.0
    for zone in (start_location, end_location):
        for alert in traffic_alerts_data.get(zone, []):
            traffic_multiplier = max(traffic_multiplier, alert["surge_multiplier"])
    price *= traffic_multiplier

    # Driver availability surcharge based on how many drivers are available.
    # NOTE(review): the surcharge grows with the number of free drivers; surge
    # pricing usually rises with scarcity, so confirm these tiers are not inverted.
    if num_available_drivers <= 50:
        surcharge = 0.0
    elif num_available_drivers <= 100:
        surcharge = 0.03
    elif num_available_drivers <= 150:
        surcharge = 0.06
    else:
        surcharge = 0.10

    price *= (1 + surcharge)

    # Apply time-based surcharge (e.g., rush hours, weekends)
    time_surcharge = get_time_surcharge(timestamp, day_of_week)
    price *= (1 + time_surcharge)

    # Apply a discount for Uber share rides
    if uber_type == "uber_share":
        price *= 0.95  # Apply a 5% discount for shared rides

    return round(price, 2)  # Return the final price rounded to two decimal places

# Function to find the best available driver for the given start location
def find_best_driver(start_location):
    """Pick the free driver closest to start_location.

    Ties on distance are broken in favour of the driver with fewer completed
    rides (driver_ride_count).

    Args:
        start_location: place name of the pick-up point.

    Returns:
        tuple: (driver_id, driver_location), or (None, None) when the pick-up
        point cannot be geocoded or no free driver has a known position.
    """
    start_coords = get_coordinates(start_location)
    # get_coordinates() reports failure as (None, None), which is truthy, so
    # the tuple content has to be checked as well
    if not start_coords or None in start_coords:
        return None, None

    best_driver = None
    best_distance = float("inf")

    # Many drivers share one of a few locations: compute each distance once per
    # call (None marks a location whose coordinates are unknown)
    distance_by_location = {}

    for driver, location in free_drivers.items():
        if location not in distance_by_location:
            driver_coords = get_coordinates(location)
            if not driver_coords or None in driver_coords:
                distance_by_location[location] = None
            else:
                distance_by_location[location] = geodesic(start_coords, driver_coords).kilometers

        distance = distance_by_location[location]
        if distance is None:
            continue  # Skip drivers with unknown locations

        # Select the closest driver, and break ties by the least number of rides completed
        if distance < best_distance or (distance == best_distance and driver_ride_count[driver] < driver_ride_count.get(best_driver, float("inf"))):
            best_driver = driver
            best_distance = distance

    return best_driver, free_drivers.get(best_driver, None)

# Function to update the availability of drivers based on their current ride status
def update_driver_status():
    """Move every driver whose ride has ended from busy_drivers back to free_drivers.

    A driver counts as finished when the value stored in busy_drivers is not
    later than time.time(); the driver is then placed at a random location.
    """
    # NOTE(review): this compares against the wall clock, while event timestamps
    # elsewhere in this cell come from a separate simulated clock - confirm.
    now = time.time()

    # Collect first, then mutate, so the dict is not changed while iterating
    finished = []
    for driver_id, free_at in busy_drivers.items():
        if free_at <= now:
            finished.append(driver_id)

    for driver_id in finished:
        free_drivers[driver_id] = random.choice(madrid_locations)
        busy_drivers.pop(driver_id)

# Function to generate ride events
def generate_ride(retry_count=0):
    """Simulate one ride request and return the list of events it produced.

    Args:
        retry_count: when it is >= MAX_RETRIES the request is abandoned at once
            with a single "User leaves" event.

    Returns:
        list[dict]: events in chronological order - "Request" followed either by
        "Driver available", "Start car ride" and "Ride finished", or by
        "Driver not available" events and a final "User leaves". The list is
        empty when the start or end location could not be geocoded.

    Side effects:
        Advances the module-global simulated clock current_time and updates
        free_drivers, busy_drivers and driver_ride_count.
    """
    events = []  # List to store the events generated during the ride
    user_id = str(uuid.uuid4())  # Generates a unique user ID for the ride request
    ride_id = str(uuid.uuid4())  # Generates a unique ride ID for the ride
    uber_type = random.choice(uber_types)  # Randomly selects a type of Uber ride
    start_location = random.choice(madrid_locations)  # Randomly selects a start location in Madrid
    end_location = random.choice(madrid_locations)  # Randomly selects an end location in Madrid

    # Ensures the start and end locations are not the same
    while start_location == end_location:
        end_location = random.choice(madrid_locations)

    start_coords = get_coordinates(start_location)  # Fetches coordinates for the start location
    end_coords = get_coordinates(end_location)  # Fetches coordinates for the end location

    # get_coordinates() reports a failed lookup as (None, None). Without both
    # endpoints no meaningful distance, price or duration exists, so the request
    # is skipped (callers already ignore an empty event list).
    for coords in (start_coords, end_coords):
        if coords is None or None in coords:
            return events

    # Ensures 'current_time' exists as a global variable, if not, initializes it
    global current_time
    if 'current_time' not in globals():
        current_time = datetime.now()  # Sets the current time to now if not already set

    # Simulates inter-arrival time between users based on Poisson distribution (exponential)
    inter_arrival_time = np.random.exponential(1 / LAMBDA)  # Random time between requests based on LAMBDA
    current_time += timedelta(seconds=int(inter_arrival_time))  # Adds the inter-arrival time to the current time
    day_of_week = get_day_of_week(current_time)  # Weekday label stored on every event of this ride

    # Function to create and return a ride event with various attributes
    def create_event(event_type, time_offset=0, driver_id=None, driver_location=None, driver_coordinates=None,
                     traffic_alert=None, traffic_severity=None, price=None):
        """Advance the simulated clock by time_offset seconds and build one event dict."""
        global current_time  # Ensures the global current_time variable is updated

        # At pick-up and drop-off the driver is at the ride's own endpoints, so
        # both the location name and the coordinates are taken from the ride
        if event_type == "Start car ride":
            driver_location = start_location
            driver_coordinates = start_coords
        elif event_type == "Ride finished":
            driver_location = end_location
            driver_coordinates = end_coords

        # Advances the time by the provided offset (seconds since the previous event)
        current_time += timedelta(seconds=time_offset)

        # Returns a dictionary with event data
        return {
            "event_id": str(uuid.uuid4()),  # Generates a unique event ID
            "ride_id": ride_id,  # Associates the event with the ride ID
            "user_id": user_id,  # Associates the event with the user ID
            "event_type": event_type,  # Type of event (e.g., Request, Start, Finish)
            "uber_type": uber_type,  # Ride product chosen from uber_types
            "start_location": start_location,  # The ride's starting location
            "end_location": end_location,  # The ride's ending location
            "start_coordinates": start_coords,  # Coordinates for the start location
            "end_coordinates": end_coords,  # Coordinates for the end location
            "timestamp_event": current_time.isoformat(),  # The event's timestamp in ISO format
            "driver_id": driver_id,  # ID of the driver (if available)
            "driver_location": driver_location,  # Location of the driver (if available)
            "driver_coordinates": driver_coordinates,  # Coordinates of the driver (if available)
            "traffic_alert": traffic_alert,  # Traffic alerts (if any)
            "traffic_severity": traffic_severity,  # Severity of traffic (if any)
            "price": price,  # Price for the ride (if calculated)
            "day_of_week": day_of_week  # Day of the week the ride occurred
        }

    # If the maximum retry count has been reached, generate a "User leaves" event and return the events
    if retry_count >= MAX_RETRIES:
        events.append(create_event("User leaves", time_offset=45))  # User leaves after retries exceed max
        return events

    # Add a "Request" event when the user first requests a ride
    events.append(create_event("Request", time_offset=0))
    update_driver_status()  # Updates the status of available drivers

    # Initialize retry mechanism if no driver is found
    max_retry_attempts = MAX_RETRIES  # Maximum number of retries to find a driver
    retries_left = max_retry_attempts  # Keeps track of remaining retries

    # Retry logic to find a driver
    while retries_left > 0:
        # Finds the best driver for the ride based on start location
        best_driver, driver_location = find_best_driver(start_location)

        if best_driver:
            # Removes the found driver from the pool of available drivers
            free_drivers.pop(best_driver)
            driver_coordinates = get_coordinates(driver_location)  # Gets the driver's coordinates
            # Randomly decide if there's a traffic alert and its severity
            traffic_alert = random.choice(traffic_alerts) if random.random() > 0.7 else None
            traffic_severity = random.randint(1, 5) if traffic_alert else None
            # Simulate dynamic pricing based on various factors like traffic and location
            price = calculate_dynamic_price(start_coords, end_coords, len(free_drivers), uber_type, traffic_severity, current_time.isoformat(), day_of_week, start_location, end_location)

            # Calculate search time based on how many retries have been made
            search_time = 15 * (max_retry_attempts - retries_left) + 5
            user_decision_time = 10  # Time the user takes to decide after driver availability
            events.append(create_event("Driver available", search_time, best_driver, driver_location, driver_coordinates,
                                       traffic_alert, traffic_severity))

            # Create the "Start car ride" event
            events.append(create_event("Start car ride", user_decision_time, best_driver, start_location, driver_coordinates,
                                       traffic_alert, traffic_severity, price))

            # Calculate the ride duration (minutes) based on the start and end coordinates
            ride_duration = compute_ride_duration(start_coords, end_coords, start_location, end_location)
            ride_seconds = ride_duration * 60  # Convert ride duration to seconds

            # Mark the driver as busy for the duration of the ride.
            # NOTE(review): availability is tracked on the wall clock (time.time())
            # while event timestamps use the simulated clock - confirm this is intended.
            busy_drivers[best_driver] = time.time() + ride_seconds
            driver_ride_count[best_driver] += 1  # Increment the ride count for the driver

            # Create the "Ride finished" event. Offsets are seconds relative to the
            # previous event, so only the ride itself is added here: the search and
            # decision time were already applied by the two events above.
            events.append(create_event("Ride finished", ride_seconds, best_driver, end_location, driver_coordinates,
                                       traffic_alert, traffic_severity, price))

            return events  # Return the events once the ride is complete

        # If no driver is found, decrement the retry count and try again
        retries_left -= 1
        events.append(create_event("Driver not available", time_offset=15 * (max_retry_attempts - retries_left) + 5))

    # If retries are exhausted, create a "User leaves" event
    events.append(create_event("User leaves", time_offset=45))
    return events

# Function to generate and save ride requests data
def save_requests_data(num_requests=100):
    """Simulate num_requests ride requests and store every event as JSON and Avro.

    The events of all rides are flattened into one list, made serialisable, and
    written to ride_events.json and ride_events.avro in the current working
    directory.

    Args:
        num_requests: number of ride requests to simulate.
    """
    # Flatten the events of all simulated rides; rides without events are ignored
    all_events = []
    for _ in range(num_requests):
        all_events += generate_ride() or []

    # Avro stores coordinates as strings: encode each pair as JSON text and use
    # None for missing values; driver ids become strings as well
    coordinate_keys = ("start_coordinates", "end_coordinates", "driver_coordinates")
    for event in all_events:
        for key in coordinate_keys:
            value = event[key]
            event[key] = json.dumps(value) if value else None
        driver_id = event["driver_id"]
        if driver_id is not None:
            event["driver_id"] = str(driver_id)

    # Both output files live in the current working directory
    output_dir = os.getcwd()
    json_file_path = os.path.join(output_dir, "ride_events.json")
    avro_file_path = os.path.join(output_dir, "ride_events.avro")

    with open(json_file_path, "w") as json_file:
        json.dump(all_events, json_file, indent=4)

    # Avro schema: ten mandatory string fields, six nullable fields, then day_of_week
    mandatory_fields = [
        {"name": field_name, "type": "string"}
        for field_name in (
            "event_id",
            "ride_id",
            "user_id",
            "timestamp_event",
            "event_type",
            "uber_type",
            "start_location",
            "end_location",
            "start_coordinates",
            "end_coordinates",
        )
    ]
    nullable_fields = [
        {"name": field_name, "type": ["null", avro_type], "default": None}
        for field_name, avro_type in (
            ("driver_id", "string"),
            ("driver_location", "string"),
            ("driver_coordinates", "string"),
            ("traffic_alert", "string"),
            ("traffic_severity", "int"),
            ("price", "float"),
        )
    ]
    ride_request_schema = {
        "type": "record",
        "name": "RideRequest",
        "fields": mandatory_fields + nullable_fields + [{"name": "day_of_week", "type": "string"}],
    }

    with open(avro_file_path, 'wb') as avro_file:
        fastavro.writer(avro_file, ride_request_schema, all_events)

    print(f"Generated {num_requests} ride requests and saved them to {json_file_path} and {avro_file_path}.")

# Simulate 100 ride requests and write ride_events.json / ride_events.avro
# into the current working directory
save_requests_data(num_requests=100)


Generated 100 ride requests and saved them to /content/ride_events.json and /content/ride_events.avro.
