#**Milestone 1**

**Group 1:** José Tomás Burguillos, Ignacio García, Luis Gómez, Alberto Padilla, Antonino Sistac, Pablo Stoclet

##Installation of packages and loading of libraries

In [1]:
!pip install fastavro
!pip install faker

Collecting fastavro
  Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Downloading fastavro-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastavro
Successfully installed fastavro-1.10.0
Collecting faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.0.0


In [2]:

import io
import json
import random
from datetime import datetime, timedelta
from faker import Faker
from fastavro import writer, parse_schema

fake = Faker()

# Configurable parameters for the generation including num_drivers, requests and sure factor
def generate_data(num_drivers=500, num_requests=1000, surge_factor=1.0):
    data = []

    for _ in range(num_requests):
        ride_id = f"ride_{random.randint(1000, 9999)}"
        passenger_id = f"passenger_{random.randint(1, 1000)}"
        driver_id = f"driver_{random.randint(1, num_drivers)}"
        ride_status = random.choice(["ongoing", "completed", "cancelled"])

        request_time = datetime.now()
        pickup_time = request_time + timedelta(minutes=random.randint(2, 10))
        dropoff_time = pickup_time + timedelta(minutes=random.randint(10, 30)) if ride_status == "completed" else None
        ride_duration = (dropoff_time - pickup_time).total_seconds() / 60 if dropoff_time else None

        pickup_location = fake.address()
        dropoff_location = fake.address()
        distance = round(random.uniform(1, 30), 2)
        price = round(distance * surge_factor * random.uniform(1.5, 3.0), 2)
        tip = round(random.uniform(0, price * 0.2), 2)

        cancellation_reason = None
        if ride_status == "cancelled":
            cancellation_reason = random.choice(["Driver_cancelled", "Passenger_cancelled", "Other"])

        ride_event = {
            "ride_id": ride_id,
            "passenger_id": passenger_id,
            "driver_id": driver_id,
            "ride_status": ride_status,
            "request_time": request_time.timestamp(),
            "pickup_time": pickup_time.timestamp(),
            "dropoff_time": dropoff_time.timestamp() if dropoff_time else None,
            "ride_duration": ride_duration,
            "pickup_location": pickup_location,
            "dropoff_location": dropoff_location,
            "distance": distance,
            "price": price,
            "tip": tip,
            "cancellation_reason": cancellation_reason
        }

        data.append(ride_event)

    return data

# Generate ride data
ride_data = generate_data(num_drivers=500, num_requests=1000, surge_factor=1.5)

# Save data to JSON file
with open("ride_status.json", "w") as json_file:
    json.dump(ride_data, json_file, indent=4)

# Define AVRO schema
schema = {
    "type": "record",
    "name": "RideStatus",
    "fields": [
        {"name": "ride_id", "type": "string"},
        {"name": "passenger_id", "type": "string"},
        {"name": "driver_id", "type": "string"},
        {"name": "ride_status", "type": {"type": "enum", "name": "RideStatusEnum", "symbols": ["ongoing", "completed", "cancelled"]}},
        {"name": "request_time", "type": "double"},
        {"name": "pickup_time", "type": ["null", "double"], "default": None},
        {"name": "dropoff_time", "type": ["null", "double"], "default": None},
        {"name": "ride_duration", "type": ["null", "float"], "default": None},
        {"name": "pickup_location", "type": "string"},
        {"name": "dropoff_location", "type": "string"},
        {"name": "distance", "type": "float"},
        {"name": "price", "type": "float"},
        {"name": "tip", "type": "float"},
        {"name": "cancellation_reason", "type": ["null", "string"], "default": None}
    ]
}

# Serialize data in AVRO format
parsed_schema = parse_schema(schema)
with open("ride_status.avro", "wb") as avro_file:
    writer(avro_file, parsed_schema, ride_data)


##Avro Schemas

In [3]:
schema1 = {
    "doc": "A ride status recording.",
    "name": "RideStatus",
    "namespace": "acme.status",
    "type": "record",
    "fields": [
        {"name": "ride_id", "type": "string"},
        {"name": "passenger_id", "type": "string"},
        {"name": "ride_status", "type": {
            "type": "enum",
            "name": "RideStatusEnum",
            "symbols": ["ongoing", "completed", "cancelled"]
        }},
        {"name": "request_time", "type": "long"},
        {"name": "pickup_time", "type": "long"},
        {"name": "dropoff_time", "type": "long"},
        {"name": "ride_duration", "type": "float"},
        {"name": "pickup_location", "type": "string"},
        {"name": "dropoff_location", "type": "string"},
        {"name": "distance", "type": "float"},
        {"name": "price", "type": "float"},
        {"name": "tip", "type": "float"},
        {"name": "cancellation_reason", "type": [
            "null", {
                "type": "enum",
                "name": "CancellationReason",
                "symbols": ["Driver_cancelled", "Passenger_cancelled", "Other"]
            }
        ], "default": None}
    ]
}


In [4]:
schema2 = {
    "doc": "A passenger requests recording.",
    "name": "PassengerRequest",
    "namespace": "acme.requests",
    "type": "record",
    "fields": [
        {"name": "request_id", "type": "string"},
        {"name": "timestamp", "type": "long"},
        {"name": "passenger_id", "type": "string"},
        {"name": "pickup_location", "type": "string"},
        {"name": "dropoff_location", "type": "string"},
        {"name": "distance", "type": "float"},
        {"name": "status", "type": {
            "type": "enum",
            "name": "PassengerRequestStatus",
            "symbols": ["cancelled", "completed"]
        }},
        {"name": "payment_type", "type": [
            "null", {
                "type": "enum",
                "name": "Payment",
                "symbols": ["cash", "card"]
            }
        ], "default": None}
    ]
}


##Data Generator

In order to have an orientative estimate for the generation of the data, we asked ChatGPT how many rides are booked in Madrid on a casual Friday night. The prompt and outputs are as follows:

*Prompt: I want to know how many uber rides are booked on a Friday night in Madrid. You will also give me a brief summary of the sources from which you obtained the information, as well as a debrief of your calculations.*

Result:

To estimate the number of Uber rides on a Friday night in Madrid, let’s break it down using the available data:

	- Baseline Daily Average:
	The study recorded 665,977 rides over ~300 days, giving an average of 2,220 rides per day.

	- Friday Ride Volume Increase:
	Fridays generally experience higher ride demand, especially at night.
	Studies on urban ride-hailing trends suggest that weekend nights (Friday & Saturday) can see 1.5x to 2.5x the normal demand.
    
	- Estimating Friday Total Rides:
	Assuming a 1.8x increase on Fridays:
2,220 \times 1.8 = 3,996 \text{ rides}
	If Friday night (6 PM – 4 AM) accounts for ~70% of total Friday rides, then:
3,996 \times 0.7 = 2,797 \text{ rides on a typical Friday night}

Final Estimate:

On a Friday night in Madrid, approximately **2,800 Uber rides** are booked, though this number can vary due to events, weather, and surge pricing.


Sources:

1. MDPI Study on Uber Rides in Madrid
Herrera-García, L. P., Montoya-Torres, J. R., & Faulin, J. (2023). A data-driven approach for analyzing ride-hailing services: A case study of Uber in Madrid, Spain. Applied Sciences, 13(8), 5147. https://doi.org/10.3390/app13085147

2. Surge Pricing & Demand Trends in Spain
Sur in English. (2023, August 15). Bolt, Cabify and Uber triple their prices in Malaga at times of high demand. Sur in English. https://www.surinenglish.com/malaga/malaga-city/bolt-cabify-and-uber-triple-their-prices-20230815093123-nt.html

3. Urban Ride-Hailing Demand Trends (General Studies)
Various authors. (n.d.). Patterns of ride-hailing demand in urban areas. Research on transportation & mobility.


Initialise the faker and parse the schemas:

In [5]:
fake = Faker()

parsed_schema1 = parse_schema(schema1)
parsed_schema2 = parse_schema(schema2)

Define the generator functions:

In [6]:
def generate_ride_status():
    ride_id = fake.uuid4()
    passenger_id = fake.uuid4()
    ride_status = random.choice(["ongoing", "completed", "cancelled"])
    request_time = fake.unix_time()
    pickup_time = request_time + random.randint(0, 300)
    dropoff_time = pickup_time + random.randint(300, 1800) if ride_status == "completed" else 0
    ride_duration = dropoff_time - pickup_time if ride_status == "completed" else 0
    pickup_location = fake.address()
    dropoff_location = fake.address()
    distance = round(random.uniform(1.0, 20.0), 2)
    price = round(distance * random.uniform(1.5, 3.5), 2) if ride_status == "completed" else 0
    tip = round(price * 0.1, 2) if ride_status == "completed" and random.random() > 0.5 else 0
    cancellation_reason = None if ride_status != "cancelled" else random.choice(["Driver_cancelled", "Passenger_cancelled", "Other"])

    return {
        "ride_id": ride_id,
        "passenger_id": passenger_id,
        "ride_status": ride_status,
        "request_time": request_time,
        "pickup_time": pickup_time,
        "dropoff_time": dropoff_time,
        "ride_duration": ride_duration,
        "pickup_location": pickup_location,
        "dropoff_location": dropoff_location,
        "distance": distance,
        "price": price,
        "tip": tip,
        "cancellation_reason": cancellation_reason
    }


In [7]:
def generate_passenger_request():
    request_id = fake.uuid4()
    timestamp = fake.unix_time()
    passenger_id = fake.uuid4()
    pickup_location = fake.address()
    dropoff_location = fake.address()
    distance = round(random.uniform(1.0, 20.0), 2)
    status = random.choice(["cancelled", "completed"])
    payment_type = None if status == "cancelled" else random.choice(["cash", "card"])

    return {
        "request_id": request_id,
        "timestamp": timestamp,
        "passenger_id": passenger_id,
        "pickup_location": pickup_location,
        "dropoff_location": dropoff_location,
        "distance": distance,
        "status": status,
        "payment_type": payment_type
    }

Generate records:

In [8]:
ride_status_records = [generate_ride_status() for _ in range(2800)]
passenger_request_records = [generate_passenger_request() for _ in range(2800)]

Display sample records:

In [9]:
print("Sample of generated RideStatus records:")
for record in ride_status_records[:5]:
    print(record)

print("\nSample of generated PassengerRequest records:")
for record in passenger_request_records[:5]:
    print(record)

Sample of generated RideStatus records:
{'ride_id': '31b35dbd-c1af-4eb7-a962-f865eb704df2', 'passenger_id': '17b45400-ea05-44f8-956b-4aea091ee0d0', 'ride_status': 'completed', 'request_time': 1109823177.821655, 'pickup_time': 1109823257.821655, 'dropoff_time': 1109824334.821655, 'ride_duration': 1077.0, 'pickup_location': '426 Ochoa Haven Apt. 733\nSouth David, WA 38579', 'dropoff_location': '69483 Obrien Flat\nAlvaradochester, MN 79032', 'distance': 3.02, 'price': 8.03, 'tip': 0, 'cancellation_reason': None}
{'ride_id': '86c646b2-7aa4-4352-a2f7-ae266b28f363', 'passenger_id': 'fc72ca4a-2a27-4ab6-8e52-204e6db924db', 'ride_status': 'ongoing', 'request_time': 608205002.4828515, 'pickup_time': 608205034.4828515, 'dropoff_time': 0, 'ride_duration': 0, 'pickup_location': '3422 Rose Mountains\nChenborough, UT 59832', 'dropoff_location': '595 Jonathan Stravenue\nLake Scottborough, HI 61820', 'distance': 4.49, 'price': 0, 'tip': 0, 'cancellation_reason': None}
{'ride_id': 'b8628bca-0f78-4b80-87

Serialise the records:

In [10]:
with io.BytesIO() as bytes_writer:
    writer(bytes_writer, parsed_schema1, ride_status_records)
    serialized_data1 = bytes_writer.getvalue()
    print("Serialized RideStatus data (AVRO - Binary Format):", serialized_data1)

with io.BytesIO() as bytes_writer:
    writer(bytes_writer, parsed_schema2, passenger_request_records)
    serialized_data2 = bytes_writer.getvalue()
    print("Serialized PassengerRequest data (AVRO - Binary Format):", serialized_data2)

Serialized RideStatus data (AVRO - Binary Format): b'Obj\x01\x04\x14avro.codec\x08null\x16avro.schema\xee\r{"type": "record", "doc": "A ride status recording.", "name": "acme.status.RideStatus", "fields": [{"name": "ride_id", "type": "string"}, {"name": "passenger_id", "type": "string"}, {"name": "ride_status", "type": {"type": "enum", "name": "acme.status.RideStatusEnum", "symbols": ["ongoing", "completed", "cancelled"]}}, {"name": "request_time", "type": "long"}, {"name": "pickup_time", "type": "long"}, {"name": "dropoff_time", "type": "long"}, {"name": "ride_duration", "type": "float"}, {"name": "pickup_location", "type": "string"}, {"name": "dropoff_location", "type": "string"}, {"name": "distance", "type": "float"}, {"name": "price", "type": "float"}, {"name": "tip", "type": "float"}, {"default": null, "name": "cancellation_reason", "type": ["null", {"type": "enum", "name": "acme.status.CancellationReason", "symbols": ["Driver_cancelled", "Passenger_cancelled", "Other"]}]}]}\x00\x