diff --git a/.gitignore b/.gitignore index dfcfd56..da65b16 100644 --- a/.gitignore +++ b/.gitignore @@ -348,3 +348,12 @@ MigrationBackup/ # Ionide (cross platform F# VS Code tools) working folder .ionide/ + +# Airlines demo benchmark results +bench/results/*.csv +bench/results/*.txt +bench/results/*.log + +# Docker volumes and temporary files +docker/init/*.sql.gz +docker/init/*.disabled diff --git a/README.md b/README.md index fda3bbb..1c13dc4 100644 --- a/README.md +++ b/README.md @@ -46,3 +46,168 @@ The results below represent the amount of time (ns) the operation takes per iter As we can see in this comparison, Doublets are from 1746 to 15745 times faster than PostgreSQL in write operations, and from 100 to 9694 times faster in read operations. To get fresh numbers, please fork the repository and rerun benchmark in GitHub Actions. + +--- + +## Benchmark: Flight Timetable (Airlines Demo) + +A new benchmark comparing PostgreSQL 18 and Doublets on realistic airline timetable queries using the [PostgresPro Airlines demo database](https://postgrespro.ru/education/demodb). + +### What's Being Tested + +This benchmark evaluates both systems on: +- **Complex relational queries**: Multi-table joins with temporal validity checks +- **Large datasets**: 6-month to 1-year flight schedules (~500k flights) +- **Real-world operations**: Airport departures/arrivals, route searches, aggregations +- **Two durability modes**: + - **Durable** (production-like): Full ACID with WAL + - **Embedded-like**: WAL-light configuration (similar to embedded databases) + +### Queries + +The benchmark includes 9 timetable queries: +1. Departures from airport by date +2. Arrivals to airport by date +3. Next available flight on a route +4. Manual join with temporal validity checks +5. Route details with airport information +6. Flight status distribution +7. Busiest routes analysis +8. Flights by date range +9. And more... 
+ +See [`sql/10_timetable_queries.sql`](sql/10_timetable_queries.sql) for details. + +### Getting Started + +#### Prerequisites +- Docker and Docker Compose +- ~10GB free disk space (for 1-year dataset) +- Python 3 (for result analysis) + +#### Quick Start + +```bash +# 1. Start PostgreSQL 18 with Airlines demo data (6 months) +cd docker +docker compose up -d + +# Wait for database to load (~5 minutes) +docker compose logs -f pg + +# 2. Run PostgreSQL benchmarks +cd ../bench/pg +./run.sh durable 6m 10 # Durable mode +./run.sh embedded 6m 10 # Embedded-like mode + +# 3. Run Doublets benchmarks (TODO: implement) +cd ../doublets +./run.sh volatile 6m 10 +./run.sh nonvolatile 6m 10 + +# 4. Compare results +ls -lh ../results/*.csv +``` + +#### Durability Modes + +**Durable Mode** (PostgreSQL default): +- Full ACID guarantees +- WAL enabled with fsync +- Production-safe +- Baseline for comparison + +**Embedded-Like Mode** (PostgreSQL optimized): +- `fsync=off`, `synchronous_commit=off` +- `wal_level=minimal` +- Optional: UNLOGGED tables +- Trades durability for speed (matches embedded DB behavior) + +To run in embedded-like mode: +```bash +cd docker +docker compose -f docker-compose.yml -f compose.embedded.yml up -d +``` + +### Directory Structure + +``` +docker/ + docker-compose.yml # PostgreSQL 18 setup (durable mode) + compose.embedded.yml # Override for embedded-like mode + init/ + 01_download_demo.sh # Auto-download Airlines demo DB + 99_unlogged.sql # Optional: convert to UNLOGGED tables + +sql/ + 10_timetable_queries.sql # All benchmark queries + +bench/ + pg/ + run.sh # PostgreSQL benchmark script + doublets/ + run.sh # Doublets benchmark script (placeholder) + results/ # CSV output and EXPLAIN logs + schema-mapping.md # How to map Airlines schema to Doublets + +docs/ + HOWTO.md # Detailed setup and usage guide +``` + +### Documentation + +- **[HOWTO.md](docs/HOWTO.md)** - Complete setup guide, dataset options, troubleshooting +- 
**[schema-mapping.md](bench/schema-mapping.md)** - Mapping Airlines entities to Doublets links +- **[10_timetable_queries.sql](sql/10_timetable_queries.sql)** - All queries with explanations + +### Dataset Sizes + +| Size | Period | Flights | PostgreSQL | Compressed | Download Time | +|------|----------|---------|------------|------------|---------------| +| 3m | 3 months | ~125k | ~1.3 GB | 133 MB | ~2 min | +| 6m | 6 months | ~250k | ~2.7 GB | 276 MB | ~5 min | +| 1y | 1 year | ~500k | ~5.4 GB | 558 MB | ~10 min | +| 2y | 2 years | ~1M | ~11 GB | 1137 MB | ~20 min | + +Default: **6 months** (good balance of size and completeness) + +### Implementation Status + +- [x] PostgreSQL 18 Docker setup +- [x] Airlines demo database integration +- [x] Timetable queries (9 queries) +- [x] PostgreSQL benchmark script +- [x] Durability modes (durable + embedded-like) +- [x] Schema mapping documentation +- [ ] **Doublets implementation** (TODO) +- [ ] Results comparison and visualization + +### Next Steps + +To complete this benchmark: + +1. **Implement Doublets data model** (see `bench/schema-mapping.md`) + - Map Airports, Routes, Flights to links + - Handle temporal data (validity ranges) + - Support NULL values and enums + +2. **Implement equivalent queries** + - Ensure exact same result sets as PostgreSQL + - Validate with checksums + +3. **Run comparative benchmarks** + - Two durability modes + - Two dataset sizes (6m, 1y) + - 10 runs per query + +4. 
**Analyze and visualize results** + - Compare median times + - Identify bottlenecks + - Generate comparison charts + +### References + +- [PostgresPro Airlines Demo](https://postgrespro.ru/education/demodb) - Official documentation +- [PostgreSQL 18 Release Notes](https://www.postgresql.org/docs/18/) - What's new +- [Doublets Documentation](https://github.com/linksplatform/Data.Doublets) - Link storage system +- [Issue #11](https://github.com/linksplatform/Comparisons.PostgreSQLVSDoublets/issues/11) - Original requirements diff --git a/bench/doublets/run.sh b/bench/doublets/run.sh new file mode 100755 index 0000000..55744d2 --- /dev/null +++ b/bench/doublets/run.sh @@ -0,0 +1,164 @@ +#!/bin/bash +# ============================================================================ +# Doublets Airlines Demo - Benchmark Script (Placeholder) +# ============================================================================ +# This script runs timetable queries against the Doublets implementation +# of the Airlines demo database and collects timing measurements. +# +# Usage: ./run.sh <durability_mode> <dataset_size> [num_runs] +# durability_mode: volatile or nonvolatile +# dataset_size: 3m, 6m, 1y, or 2y +# num_runs: number of iterations per query (default: 10) +# +# Example: +# ./run.sh volatile 6m 10 +# ./run.sh nonvolatile 1y 20 +# +# Output: +# - CSV file: ../results/doublets_<durability_mode>_<dataset_size>_<timestamp>.csv +# +# TODO: This is a placeholder. Implement actual Doublets benchmarking logic. 
+# ============================================================================ + +set -euo pipefail + +# Configuration +DURABILITY_MODE="${1:-volatile}" +DATASET_SIZE="${2:-6m}" +NUM_RUNS="${3:-10}" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +RESULTS_DIR="../results" +OUTPUT_CSV="${RESULTS_DIR}/doublets_${DURABILITY_MODE}_${DATASET_SIZE}_${TIMESTAMP}.csv" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Create results directory +mkdir -p "${RESULTS_DIR}" + +# Initialize CSV +echo "system,durability_mode,dataset,query_id,run,rows,ms" > "${OUTPUT_CSV}" + +echo -e "${GREEN}=== Doublets Benchmark ===${NC}" +echo "Mode: ${DURABILITY_MODE}" +echo "Dataset: ${DATASET_SIZE}" +echo "Runs per query: ${NUM_RUNS}" +echo "Output: ${OUTPUT_CSV}" +echo "" + +echo -e "${YELLOW}=== TODO: Doublets Implementation ===${NC}" +echo "" +echo "This is a placeholder script. To complete the Doublets benchmark, implement:" +echo "" +echo "1. Data Loading:" +echo " - Load Airlines data from PostgreSQL or CSV export" +echo " - Convert entities to Doublets links (see bench/schema-mapping.md)" +echo " - Store in Doublets database (volatile or nonvolatile mode)" +echo "" +echo "2. Query Implementation:" +echo " - Implement equivalent queries using Doublets link API" +echo " - Ensure result sets match PostgreSQL exactly" +echo " - See bench/schema-mapping.md for query mappings" +echo "" +echo "3. Benchmark Execution:" +echo " - Warm-up: run each query once" +echo " - Measure: run each query ${NUM_RUNS} times" +echo " - Record: wall-clock time (ms) and row count" +echo " - Write results to CSV with same format as PostgreSQL benchmark" +echo "" +echo "4. 
Validation:" +echo " - Compare result sets with PostgreSQL (checksums)" +echo " - Verify performance improvements" +echo " - Report any discrepancies" +echo "" +echo "Suggested implementation approaches:" +echo "" +echo " a) Rust implementation (matching existing rust/ directory):" +echo " - Use existing Doublets Rust library" +echo " - Create Airlines data model" +echo " - Implement queries using Doublets API" +echo " - Add benchmark harness" +echo "" +echo " b) C++ implementation (matching existing cpp/ directory):" +echo " - Use existing Doublets C++ library" +echo " - Follow same approach as Rust" +echo "" +echo " c) Standalone tool:" +echo " - Create separate benchmark binary" +echo " - Load data from CSV export" +echo " - Run queries and output CSV" +echo "" +echo "Reference implementations:" +echo " - rust/benches/bench.rs - existing Doublets benchmarks" +echo " - bench/pg/run.sh - PostgreSQL benchmark (for CSV format)" +echo " - bench/schema-mapping.md - detailed mapping documentation" +echo "" +echo -e "${YELLOW}Until implementation is complete, this script generates mock data.${NC}" +echo "" + +# Generate mock data for testing the analysis pipeline +echo -e "${YELLOW}Generating mock benchmark data...${NC}" + +# Define query IDs (matching PostgreSQL) +QUERY_IDS=( + "departures_svo" + "arrivals_svo" + "next_flight_svx_wuh" + "manual_departures_svo" + "manual_arrivals_svo" + "route_details" + "status_counts" + "busiest_routes" + "date_range" +) + +# Mock: Doublets should be ~1000-10000x faster than PostgreSQL +# Generate realistic-looking performance data +for query_id in "${QUERY_IDS[@]}"; do + # Simulate row counts (would come from actual queries) + case "${query_id}" in + "next_flight_svx_wuh") + row_count=1 + ;; + "route_details") + row_count=20 + ;; + "status_counts") + row_count=5 + ;; + "busiest_routes") + row_count=10 + ;; + "date_range") + row_count=7 + ;; + *) + row_count=$((RANDOM % 100 + 10)) + ;; + esac + + # Generate ${NUM_RUNS} measurements 
with small variance + base_time=$((RANDOM % 50 + 10)) # 10-60ms for Doublets (vs seconds for PostgreSQL) + + for run in $(seq 1 "${NUM_RUNS}"); do + # Add small random variance + variance=$((RANDOM % 20 - 10)) + time=$((base_time + variance)) + [ ${time} -lt 1 ] && time=1 # Ensure positive + + echo "doublets,${DURABILITY_MODE},${DATASET_SIZE},${query_id},${run},${row_count},${time}" >> "${OUTPUT_CSV}" + done +done + +echo -e "${GREEN}Mock data generated${NC}" +echo "" + +echo -e "${GREEN}=== Benchmark Complete (Mock) ===${NC}" +echo "Mock results saved to: ${OUTPUT_CSV}" +echo "" +echo -e "${RED}WARNING: This data is MOCK data for testing purposes.${NC}" +echo -e "${RED}Implement actual Doublets queries to get real measurements.${NC}" +echo "" diff --git a/bench/pg/run.sh b/bench/pg/run.sh new file mode 100755 index 0000000..1555404 --- /dev/null +++ b/bench/pg/run.sh @@ -0,0 +1,207 @@ +#!/bin/bash +# ============================================================================ +# PostgreSQL Airlines Demo - Benchmark Script +# ============================================================================ +# This script runs timetable queries against the PostgreSQL Airlines demo +# database and collects timing measurements for benchmarking. 
+# +# Usage: ./run.sh <durability_mode> <dataset_size> [num_runs] +# durability_mode: durable or embedded +# dataset_size: 3m, 6m, 1y, or 2y +# num_runs: number of iterations per query (default: 10) +# +# Example: +# ./run.sh durable 6m 10 +# ./run.sh embedded 1y 20 +# +# Output: +# - CSV file: ../results/pg_<durability_mode>_<dataset_size>_<timestamp>.csv +# - EXPLAIN logs: ../results/pg_<durability_mode>_<dataset_size>_<timestamp>_explain.txt +# ============================================================================ + +set -euo pipefail + +# Configuration +DURABILITY_MODE="${1:-durable}" +DATASET_SIZE="${2:-6m}" +NUM_RUNS="${3:-10}" +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +RESULTS_DIR="../results" +OUTPUT_CSV="${RESULTS_DIR}/pg_${DURABILITY_MODE}_${DATASET_SIZE}_${TIMESTAMP}.csv" +EXPLAIN_LOG="${RESULTS_DIR}/pg_${DURABILITY_MODE}_${DATASET_SIZE}_${TIMESTAMP}_explain.txt" + +# Database connection +PGHOST="${PGHOST:-localhost}" +PGPORT="${PGPORT:-5432}" +PGUSER="${PGUSER:-postgres}" +PGDATABASE="${PGDATABASE:-demo}" +export PGPASSWORD="${PGPASSWORD:-postgres}" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Create results directory +mkdir -p "${RESULTS_DIR}" + +# Initialize CSV +echo "system,durability_mode,dataset,query_id,run,rows,ms" > "${OUTPUT_CSV}" + +echo -e "${GREEN}=== PostgreSQL Benchmark ===${NC}" +echo "Mode: ${DURABILITY_MODE}" +echo "Dataset: ${DATASET_SIZE}" +echo "Runs per query: ${NUM_RUNS}" +echo "Output: ${OUTPUT_CSV}" +echo "" + +# Function to execute a query and measure time +benchmark_query() { + local query_id="$1" + local query="$2" + local run="$3" + + # Execute query and measure wall-clock time + local start_ms=$(date +%s%3N) + local row_count=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \ + -t -c "${query}" | wc -l) + local end_ms=$(date +%s%3N) + local elapsed_ms=$((end_ms - start_ms)) + + # Trim whitespace from row_count + row_count=$(echo "${row_count}" | xargs) + + # Write to CSV + echo 
"pg,${DURABILITY_MODE},${DATASET_SIZE},${query_id},${run},${row_count},${elapsed_ms}" >> "${OUTPUT_CSV}" + + echo " Run ${run}: ${elapsed_ms}ms (${row_count} rows)" +} + +# Function to run EXPLAIN ANALYZE for a query +explain_query() { + local query_id="$1" + local query="$2" + + echo "" >> "${EXPLAIN_LOG}" + echo "========================================" >> "${EXPLAIN_LOG}" + echo "Query: ${query_id}" >> "${EXPLAIN_LOG}" + echo "========================================" >> "${EXPLAIN_LOG}" + echo "" >> "${EXPLAIN_LOG}" + + psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \ + -c "EXPLAIN (ANALYZE, BUFFERS, TIMING) ${query}" >> "${EXPLAIN_LOG}" 2>&1 || true + + echo "" >> "${EXPLAIN_LOG}" +} + +# Define queries +declare -A QUERIES + +QUERIES["departures_svo"]="SELECT flight_id, route_no, departure_airport, arrival_airport, scheduled_departure, scheduled_arrival, status FROM bookings.timetable WHERE departure_airport = 'SVO' AND (scheduled_departure AT TIME ZONE 'UTC')::date = DATE '2025-10-07' ORDER BY scheduled_departure;" + +QUERIES["arrivals_svo"]="SELECT flight_id, route_no, departure_airport, arrival_airport, scheduled_departure, scheduled_arrival, status FROM bookings.timetable WHERE arrival_airport = 'SVO' AND (scheduled_arrival AT TIME ZONE 'UTC')::date = DATE '2025-10-07' ORDER BY scheduled_arrival;" + +QUERIES["next_flight_svx_wuh"]="SELECT flight_id, route_no, departure_airport, arrival_airport, scheduled_departure, scheduled_arrival, status FROM bookings.timetable WHERE departure_airport = 'SVX' AND arrival_airport = 'WUH' AND scheduled_departure > bookings.now() ORDER BY scheduled_departure LIMIT 1;" + +QUERIES["manual_departures_svo"]="SELECT f.flight_id, r.route_no, r.departure_airport, r.arrival_airport, f.status, f.scheduled_departure, f.scheduled_arrival FROM bookings.flights AS f JOIN bookings.routes AS r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure WHERE r.departure_airport = 'SVO' AND 
f.scheduled_departure::date = DATE '2025-10-07' ORDER BY f.scheduled_departure;" + +QUERIES["manual_arrivals_svo"]="SELECT f.flight_id, r.route_no, r.departure_airport, r.arrival_airport, f.status, f.scheduled_departure, f.scheduled_arrival FROM bookings.flights AS f JOIN bookings.routes AS r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure WHERE r.arrival_airport = 'SVO' AND f.scheduled_arrival::date = DATE '2025-10-07' ORDER BY f.scheduled_arrival;" + +QUERIES["route_details"]="SELECT f.flight_id, f.route_no, dep.airport_code AS dep_code, arr.airport_code AS arr_code, f.scheduled_departure, f.scheduled_arrival, f.status FROM bookings.flights f JOIN bookings.routes r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure JOIN bookings.airports dep ON dep.airport_code = r.departure_airport JOIN bookings.airports arr ON arr.airport_code = r.arrival_airport WHERE f.route_no = 'PG0001' ORDER BY f.scheduled_departure LIMIT 20;" + +QUERIES["status_counts"]="SELECT f.status, COUNT(*) AS flight_count FROM bookings.flights f JOIN bookings.routes r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure WHERE r.departure_airport = 'SVO' GROUP BY f.status ORDER BY flight_count DESC;" + +QUERIES["busiest_routes"]="SELECT r.departure_airport, r.arrival_airport, COUNT(*) AS num_flights FROM bookings.flights f JOIN bookings.routes r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure WHERE r.departure_airport = 'SVO' GROUP BY r.departure_airport, r.arrival_airport ORDER BY num_flights DESC LIMIT 10;" + +QUERIES["date_range"]="SELECT DATE(f.scheduled_departure) AS flight_date, COUNT(*) AS num_flights FROM bookings.flights f JOIN bookings.routes r ON r.route_no = f.route_no AND r.validity @> f.scheduled_departure WHERE f.scheduled_departure >= DATE '2025-10-01' AND f.scheduled_departure < DATE '2025-10-08' GROUP BY DATE(f.scheduled_departure) ORDER BY flight_date;" + +# Check database connection +echo -e "${YELLOW}Checking 
database connection...${NC}" +if ! psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" -c "SELECT 1;" >/dev/null 2>&1; then + echo -e "${RED}ERROR: Cannot connect to database${NC}" + echo "Host: ${PGHOST}:${PGPORT}" + echo "Database: ${PGDATABASE}" + echo "User: ${PGUSER}" + exit 1 +fi +echo -e "${GREEN}Connected successfully${NC}" +echo "" + +# Verify database has data +echo -e "${YELLOW}Verifying database...${NC}" +FLIGHT_COUNT=$(psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" -t -c "SELECT COUNT(*) FROM bookings.flights;" | xargs) +echo "Flights in database: ${FLIGHT_COUNT}" +if [ "${FLIGHT_COUNT}" -eq 0 ]; then + echo -e "${RED}ERROR: Database has no flights${NC}" + exit 1 +fi +echo "" + +# Run EXPLAIN ANALYZE for each query (once) +echo -e "${YELLOW}Collecting EXPLAIN ANALYZE plans...${NC}" +for query_id in "${!QUERIES[@]}"; do + echo " ${query_id}..." + explain_query "${query_id}" "${QUERIES[$query_id]}" +done +echo -e "${GREEN}EXPLAIN plans saved to: ${EXPLAIN_LOG}${NC}" +echo "" + +# Warm-up: run each query once +echo -e "${YELLOW}Warming up (running each query once)...${NC}" +for query_id in "${!QUERIES[@]}"; do + echo " ${query_id}..." 
+ psql -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" \ + -t -c "${QUERIES[$query_id]}" >/dev/null 2>&1 || true +done +echo -e "${GREEN}Warm-up complete${NC}" +echo "" + +# Run benchmarks +echo -e "${YELLOW}Running benchmarks...${NC}" +for query_id in "${!QUERIES[@]}"; do + echo -e "${GREEN}Query: ${query_id}${NC}" + for run in $(seq 1 "${NUM_RUNS}"); do + benchmark_query "${query_id}" "${QUERIES[$query_id]}" "${run}" + done + echo "" +done + +echo -e "${GREEN}=== Benchmark Complete ===${NC}" +echo "Results saved to: ${OUTPUT_CSV}" +echo "EXPLAIN logs saved to: ${EXPLAIN_LOG}" +echo "" + +# Generate summary statistics +echo -e "${YELLOW}Generating summary statistics...${NC}" +python3 - < Route` relationship = link from flight instance to route instance +- Many-to-many relationships = pairs of links through junction instances + +### 3. Temporal Data + +Timestamps and date ranges are represented as: +- Numeric values (Unix timestamps or custom encoding) +- Range types (validity periods) as pairs of start/end links + +## Schema Mapping Tables + +### Airports Table + +**PostgreSQL Schema:** +```sql +CREATE TABLE bookings.airports ( + airport_code char(3) PRIMARY KEY, + airport_name jsonb NOT NULL, + city jsonb NOT NULL, + coordinates point NOT NULL, + timezone text NOT NULL +); +``` + +**Doublets Representation:** +``` +Type: Airport +Attributes: + - airport_code (string → numeric ID) + - airport_name_en (string) + - airport_name_ru (string) + - city_en (string) + - city_ru (string) + - latitude (float → integer encoding) + - longitude (float → integer encoding) + - timezone (string → numeric ID) + +Link Structure: + [Airport Type] → [Airport Instance: SVO] + [Airport Instance: SVO] → [Code] → [SVO_numeric_id] + [Airport Instance: SVO] → [Name_EN] → ["Sheremetyevo"] + [Airport Instance: SVO] → [Latitude] → [encoded_value] + [Airport Instance: SVO] → [Longitude] → [encoded_value] +``` + +### Routes Table + +**PostgreSQL Schema:** +```sql +CREATE TABLE 
bookings.routes ( + route_no char(6) PRIMARY KEY, + departure_airport char(3) REFERENCES airports, + arrival_airport char(3) REFERENCES airports, + aircraft_code char(3) REFERENCES aircraft, + duration interval NOT NULL, + validity tstzrange NOT NULL +); +``` + +**Doublets Representation:** +``` +Type: Route +Relationships: + - departure_airport → Airport instance + - arrival_airport → Airport instance + - aircraft_code → Aircraft instance +Attributes: + - route_no (string → numeric ID) + - duration_seconds (integer) + - validity_start (timestamp as integer) + - validity_end (timestamp as integer) + +Link Structure: + [Route Type] → [Route Instance: PG0001] + [Route Instance: PG0001] → [RouteNo] → [PG0001_numeric_id] + [Route Instance: PG0001] → [DepartureAirport] → [Airport Instance: SVO] + [Route Instance: PG0001] → [ArrivalAirport] → [Airport Instance: LED] + [Route Instance: PG0001] → [ValidityStart] → [timestamp_value] + [Route Instance: PG0001] → [ValidityEnd] → [timestamp_value] +``` + +### Flights Table + +**PostgreSQL Schema:** +```sql +CREATE TABLE bookings.flights ( + flight_id serial PRIMARY KEY, + route_no char(6) REFERENCES routes, + aircraft_code char(3) REFERENCES aircraft, + status varchar(20) NOT NULL, + scheduled_departure timestamptz NOT NULL, + scheduled_arrival timestamptz NOT NULL, + actual_departure timestamptz, + actual_arrival timestamptz +); +``` + +**Doublets Representation:** +``` +Type: Flight +Relationships: + - route_no → Route instance + - aircraft_code → Aircraft instance +Attributes: + - flight_id (integer, direct mapping) + - status (enum → numeric ID: Scheduled=1, OnTime=2, Delayed=3, etc.) 
+ - scheduled_departure (timestamp as integer) + - scheduled_arrival (timestamp as integer) + - actual_departure (timestamp as integer or NULL) + - actual_arrival (timestamp as integer or NULL) + +Link Structure: + [Flight Type] → [Flight Instance: 123456] + [Flight Instance: 123456] → [FlightID] → [123456] + [Flight Instance: 123456] → [Route] → [Route Instance: PG0001] + [Flight Instance: 123456] → [Status] → [Status: OnTime] + [Flight Instance: 123456] → [ScheduledDeparture] → [timestamp_value] + [Flight Instance: 123456] → [ScheduledArrival] → [timestamp_value] +``` + +### Aircraft Table + +**PostgreSQL Schema:** +```sql +CREATE TABLE bookings.aircraft ( + aircraft_code char(3) PRIMARY KEY, + model jsonb NOT NULL, + range integer NOT NULL +); +``` + +**Doublets Representation:** +``` +Type: Aircraft +Attributes: + - aircraft_code (string → numeric ID) + - model_en (string) + - model_ru (string) + - range (integer) + +Link Structure: + [Aircraft Type] → [Aircraft Instance: 773] + [Aircraft Instance: 773] → [Code] → [773_numeric_id] + [Aircraft Instance: 773] → [Model_EN] → ["Boeing 777-300"] + [Aircraft Instance: 773] → [Range] → [11100] +``` + +## Query Mapping + +### Query 1: Departures from Airport on Date + +**PostgreSQL:** +```sql +SELECT * +FROM bookings.timetable +WHERE departure_airport = 'SVO' + AND scheduled_departure::date = DATE '2025-10-07'; +``` + +**Doublets Equivalent:** +``` +1. Find all Flight instances +2. Filter by: + - Flight → Route → DepartureAirport = [Airport Instance: SVO] + - Flight → ScheduledDeparture between [date_start, date_end) +3. For each matching flight: + - Traverse Flight → Route to get departure/arrival airports + - Traverse Flight → ScheduledDeparture/Arrival for times + - Traverse Flight → Status for status +4. 
Return result set +``` + +### Query 2: Next Available Flight + +**PostgreSQL:** +```sql +SELECT * +FROM bookings.timetable +WHERE departure_airport = 'SVX' + AND arrival_airport = 'WUH' + AND scheduled_departure > bookings.now() +ORDER BY scheduled_departure +LIMIT 1; +``` + +**Doublets Equivalent:** +``` +1. Find all Flight instances where: + - Flight → Route → DepartureAirport = [Airport Instance: SVX] + - Flight → Route → ArrivalAirport = [Airport Instance: WUH] + - Flight → ScheduledDeparture > [current_model_time] +2. Sort by ScheduledDeparture (ascending) +3. Return first match +``` + +### Query 3: Manual Join with Validity Check + +**PostgreSQL:** +```sql +SELECT f.flight_id, r.route_no +FROM bookings.flights f +JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE r.departure_airport = 'SVO'; +``` + +**Doublets Equivalent:** +``` +1. Find all Flight instances +2. For each flight: + - Traverse Flight → Route + - Check Route → ValidityStart <= Flight → ScheduledDeparture + - Check Route → ValidityEnd >= Flight → ScheduledDeparture + - Check Route → DepartureAirport = [Airport Instance: SVO] +3. Return matching flights with route info +``` + +## Data Type Encoding + +### Strings +- Convert to numeric IDs using a string interning table +- Store mapping: String → Numeric ID +- Use numeric IDs in all links + +### Timestamps +- Store as Unix timestamps (seconds since epoch) +- Or use custom encoding (days since base date + seconds within day) +- Range queries use numeric comparisons + +### JSON/JSONB +- Extract relevant fields (e.g., `airport_name->>'en'`) +- Store each field as separate attribute link +- Ignore unused fields + +### NULL Values +- Option 1: Omit the attribute link (absence = NULL) +- Option 2: Create special NULL link target + +### Enums (Status, etc.) +- Map each value to numeric ID +- Store as: `[Instance] → [AttributeType] → [EnumValue_ID]` + +## Implementation Considerations + +### 1. 
Data Loading +- Parse PostgreSQL dump or query results +- Convert to Doublets links in batches +- Build indexes for common access patterns + +### 2. Query Translation +- Implement query builder that translates SQL-like operations to link traversals +- Support common patterns: filter, join, aggregate, order, limit + +### 3. Performance Optimizations +- Index on departure_airport for fast filtering +- Index on scheduled_departure for date range queries +- Consider denormalization for frequent joins (e.g., cache airport codes on flights) + +### 4. Validity Checks +- Implement range overlap efficiently: + - `validity_start <= scheduled_departure <= validity_end` +- Consider creating validity index + +### 5. Benchmark Equivalence +- Ensure result sets match PostgreSQL exactly (same rows, same order) +- Compute checksums (hash of sorted result set) +- Report any discrepancies + +## Storage Estimates + +Assuming 1-year dataset (~5.4GB in PostgreSQL): + +- **Airports**: ~100 records → ~2,000 links (20 attributes each) +- **Routes**: ~500 records → ~20,000 links (40 attributes each) +- **Flights**: ~500,000 records → ~4,000,000 links (8 attributes each) +- **Aircraft**: ~10 records → ~100 links + +**Total estimate**: ~4-5 million links + +With Doublets split storage (index + data): +- Index: ~200MB (assuming 48 bytes per link) +- Data: ~200MB +- **Total**: ~400MB (10x compression vs PostgreSQL) + +## Testing Strategy + +1. **Data Integrity**: + - Load subset of data (3 months) + - Verify all entities and relationships present + - Check attribute values match + +2. **Query Correctness**: + - Run same query on both systems + - Compare result sets (row counts, values, order) + - Report mismatches + +3. **Performance Baseline**: + - Run benchmarks on small dataset (3m) + - Verify Doublets is faster (as expected) + - Scale to larger datasets + +4. 
**Durability Modes**: + - Test Doublets volatile (in-memory) mode + - Test Doublets non-volatile (persistent) mode + - Compare with PostgreSQL durable/embedded modes + +## Open Questions + +1. **How to handle JSONB fields efficiently?** + - Extract all fields vs. only used fields + - Dynamic schema vs. fixed schema + +2. **String interning strategy?** + - Global intern table vs. per-type + - Hash-based vs. sequential IDs + +3. **Timestamp precision?** + - Seconds vs. milliseconds + - Timezone handling + +4. **Query API design?** + - SQL-like DSL vs. native link API + - Type safety vs. flexibility + +## Next Steps + +1. Implement basic entity mapping (Airport, Flight, Route) +2. Create data loader script (PostgreSQL → Doublets) +3. Implement first query (departures by airport/date) +4. Verify correctness against PostgreSQL +5. Add timing measurements +6. Iterate on remaining queries diff --git a/docker/compose.embedded.yml b/docker/compose.embedded.yml new file mode 100644 index 0000000..55330a4 --- /dev/null +++ b/docker/compose.embedded.yml @@ -0,0 +1,13 @@ +version: "3.9" +services: + pg: + command: + - "postgres" + - "-c" + - "fsync=off" + - "-c" + - "synchronous_commit=off" + - "-c" + - "full_page_writes=off" + - "-c" + - "wal_level=minimal" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..b319380 --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,22 @@ +version: "3.9" +services: + pg: + image: postgres:18 + container_name: airlines-pg18 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: postgres + ports: + - "5432:5432" + volumes: + - pgdata:/var/lib/postgresql/data + - ./init:/docker-entrypoint-initdb.d + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 20 + +volumes: + pgdata: diff --git a/docker/init/01_download_demo.sh b/docker/init/01_download_demo.sh new file mode 100755 index 0000000..2b8463a --- /dev/null 
+++ b/docker/init/01_download_demo.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# This script downloads the PostgresPro Airlines demo database. +# It will be executed automatically when the container starts (if the DB doesn't exist yet). + +set -e + +# Default to 6 months dataset (can be overridden with DEMO_SIZE env var) +DEMO_SIZE=${DEMO_SIZE:-6m} +DEMO_DATE="20250901" + +echo "Downloading PostgresPro Airlines demo database (${DEMO_SIZE})..." + +cd /tmp +wget -q "https://edu.postgrespro.ru/demo-${DEMO_DATE}-${DEMO_SIZE}.sql.gz" -O demo.sql.gz + +echo "Extracting and loading demo database..." +gunzip -c demo.sql.gz | psql -U postgres -d postgres + +echo "Demo database loaded successfully!" +rm demo.sql.gz diff --git a/docker/init/99_unlogged.sql b/docker/init/99_unlogged.sql new file mode 100644 index 0000000..84d7a24 --- /dev/null +++ b/docker/init/99_unlogged.sql @@ -0,0 +1,21 @@ +-- This script converts all tables in the bookings schema to UNLOGGED +-- for embedded-like (WAL-light) mode benchmarking. +-- UNLOGGED tables are faster but not crash-safe (truncated on crash/restart). +-- Use this only for embedded-like performance testing, not production. + +DO $$ +DECLARE + r record; +BEGIN + FOR r IN + SELECT format('%I.%I', n.nspname, c.relname) AS fqname + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'bookings' AND c.relkind = 'r' + LOOP + EXECUTE 'ALTER TABLE ' || r.fqname || ' SET UNLOGGED'; + RAISE NOTICE 'Converted % to UNLOGGED', r.fqname; + END LOOP; +END$$; + +-- To revert to durable (LOGGED) tables, replace SET UNLOGGED with SET LOGGED above diff --git a/docs/HOWTO.md b/docs/HOWTO.md new file mode 100644 index 0000000..61b48ec --- /dev/null +++ b/docs/HOWTO.md @@ -0,0 +1,378 @@ +# Airlines Demo Database - Benchmark HOWTO + +This guide explains how to set up the PostgresPro Airlines demo database, run timetable queries, and benchmark PostgreSQL vs Doublets in both durable and embedded-like modes. 
+ +## Prerequisites + +- Docker and Docker Compose installed +- At least 10GB free disk space (for 1-year dataset) +- `psql` client (optional, for manual queries) +- `gh` CLI (GitHub CLI) for CI integration + +## Quick Start + +### 1. Start PostgreSQL 18 (Durable Mode) + +```bash +cd docker +docker compose up -d +``` + +This will: +- Pull PostgreSQL 18 image +- Create a persistent volume for data +- Download and load the Airlines demo database (6 months by default) +- Start the database on port 5432 + +Wait for the database to be ready: +```bash +docker compose logs -f pg +``` + +### 2. Connect to the Database + +```bash +docker compose exec pg psql -U postgres -d demo +``` + +### 3. Verify the Setup + +```sql +-- Check model time +SELECT bookings.now(); + +-- Check table sizes +\dt+ bookings.* + +-- Check airports +SELECT COUNT(*) FROM bookings.airports; + +-- Check flights +SELECT COUNT(*) FROM bookings.flights; +``` + +## Dataset Sizes + +You can choose different dataset sizes by setting the `DEMO_SIZE` environment variable: + +- `3m` - 3 months (1.3 GB, ~133 MB compressed) +- `6m` - 6 months (2.7 GB, ~276 MB compressed) **[Default]** +- `1y` - 1 year (5.4 GB, ~558 MB compressed) +- `2y` - 2 years (11 GB, ~1137 MB compressed) + +To use a different size: + +```bash +# Option 1: Set environment variable +export DEMO_SIZE=1y +docker compose up -d + +# Option 2: Inline +DEMO_SIZE=1y docker compose up -d +``` + +## Durability Modes + +### Durable Mode (Default) + +This is the standard, production-like mode with full ACID guarantees: + +```bash +docker compose -f docker-compose.yml up -d +``` + +Configuration: +- Full WAL (Write-Ahead Logging) +- `fsync=on` (default) +- `synchronous_commit=on` (default) +- All tables are LOGGED (persistent) + +### Embedded-Like Mode (WAL-Light) + +This mode trades durability for performance, similar to embedded databases: + +```bash +docker compose -f docker-compose.yml -f compose.embedded.yml up -d +``` + +Configuration: +- 
`fsync=off` - No forced disk syncs +- `synchronous_commit=off` - Async commit +- `full_page_writes=off` - Skip full-page writes +- `wal_level=minimal` - Minimal WAL logging + +#### Optional: UNLOGGED Tables + +For maximum speed (no WAL at all for data tables), convert tables to UNLOGGED: + +```sql +-- Connect to the database +docker compose exec pg psql -U postgres -d demo + +-- Run the unlogged conversion script +\i /docker-entrypoint-initdb.d/99_unlogged.sql +``` + +**Warning**: UNLOGGED tables: +- Are truncated after crashes or unclean shutdowns +- Cannot be replicated +- Are NOT suitable for production +- Should only be used for benchmarking + +To revert to LOGGED tables: + +```sql +DO $$ +DECLARE r record; +BEGIN + FOR r IN + SELECT format('%I.%I', n.nspname, c.relname) AS fqname + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = 'bookings' AND c.relkind = 'r' + LOOP + EXECUTE 'ALTER TABLE ' || r.fqname || ' SET LOGGED'; + END LOOP; +END$$; +``` + +### Verify Durability Mode + +Check table persistence: + +```sql +SELECT relname, + CASE relpersistence + WHEN 'p' THEN 'LOGGED' + WHEN 'u' THEN 'UNLOGGED' + WHEN 't' THEN 'TEMPORARY' + END AS persistence +FROM pg_class +WHERE relnamespace = 'bookings'::regnamespace AND relkind='r' +ORDER BY relname; +``` + +Check WAL settings: + +```sql +SHOW fsync; +SHOW synchronous_commit; +SHOW full_page_writes; +SHOW wal_level; +``` + +## Custom Data Generation + +Instead of loading a pre-generated snapshot, you can generate your own data: + +1. Start the container without the download script: + +```bash +# Temporarily disable the download script +mv docker/init/01_download_demo.sh docker/init/01_download_demo.sh.disabled +docker compose up -d +``` + +2. Download the generator from PostgresPro: + +```bash +wget https://edu.postgrespro.ru/demo-generator.tar.gz +tar -xzf demo-generator.tar.gz +``` + +3. 
Install in PostgreSQL: + +```bash +docker compose exec -T pg psql -U postgres < install.sql +``` + +4. Generate data: + +```sql +-- Connect to demo database +docker compose exec pg psql -U postgres -d demo + +-- Generate 1 year of data with 4 worker processes +CALL bookings.generate(now(), now() + interval '1 year', 4); + +-- Monitor progress +SELECT bookings.busy(); + +-- Run post-generation checks +\i check.sql +``` + +## Running Timetable Queries + +All timetable queries are in `sql/10_timetable_queries.sql`. + +### From Host + +```bash +docker compose exec -T pg psql -U postgres -d demo < ../sql/10_timetable_queries.sql +``` + +### From Inside Container + +```bash +docker compose exec pg psql -U postgres -d demo +\i /path/to/sql/10_timetable_queries.sql +``` + +## Running Benchmarks + +### PostgreSQL Benchmarks + +The benchmark script runs all timetable queries with timing and EXPLAIN ANALYZE: + +```bash +cd bench/pg +./run.sh durable 6m # Durable mode, 6-month dataset +./run.sh embedded 6m # Embedded mode, 6-month dataset +``` + +Results are saved to `bench/results/pg_*.csv`. + +### Doublets Benchmarks + +```bash +cd bench/doublets +# TODO: Add Doublets benchmark instructions +``` + +Results are saved to `bench/results/doublets_*.csv`. + +## Benchmark Procedure + +1. **Prepare environment**: Start Docker with desired durability mode +2. **Warm-up**: Run each query once (results discarded) +3. **Measure**: Run each query 10 times, record wall-clock time +4. **Collect**: Save results to CSV with columns: + - `system` (pg or doublets) + - `durability_mode` (durable or embedded) + - `dataset` (3m, 6m, 1y, 2y) + - `query_id` (departures_svo, arrivals_svo, etc.) + - `run` (1-10) + - `rows` (result count) + - `ms` (wall-clock milliseconds) + +5. 
**Analyze**: Compute min/median/p95 for each query + +## Database Schema Overview + +The Airlines demo database contains: + +### Main Tables + +- `bookings.airports` - Airport codes and locations +- `bookings.aircraft` - Aircraft models and seat configurations +- `bookings.routes` - Flight routes with validity periods +- `bookings.flights` - Scheduled flights (references routes) +- `bookings.bookings` - Passenger bookings +- `bookings.tickets` - Individual tickets +- `bookings.ticket_flights` - Ticket-to-flight associations +- `bookings.boarding_passes` - Boarding pass assignments + +### Key Concepts + +- **Model Time**: `bookings.now()` returns the current "time" in the simulation +- **Validity Period**: Routes have `validity` ranges (tstzrange) that must overlap with flight scheduled times +- **Airport-Local Times**: The `bookings.timetable` view handles timezone conversions +- **Temporal Joins**: When joining routes to flights, always check `r.validity @> f.scheduled_departure` + +## Validation Checks + +### Data Consistency + +```sql +-- Count flights by status +SELECT status, COUNT(*) FROM bookings.flights GROUP BY status; + +-- Check date ranges +SELECT + MIN(scheduled_departure) AS first_flight, + MAX(scheduled_arrival) AS last_flight +FROM bookings.flights; + +-- Verify airports +SELECT COUNT(DISTINCT departure_airport) FROM bookings.routes; +SELECT COUNT(DISTINCT arrival_airport) FROM bookings.routes; +``` + +### Validity Checks + +```sql +-- All flights should have valid routes +SELECT COUNT(*) AS invalid_flights +FROM bookings.flights f +LEFT JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE r.route_no IS NULL; +-- Should return 0 +``` + +## Troubleshooting + +### Container Won't Start + +```bash +docker compose logs pg +docker compose down -v # Remove volumes +docker compose up -d +``` + +### Database Not Loading + +Check init script logs: +```bash +docker compose logs pg | grep -A 20 
"Downloading\|Extracting" ``` + +### Out of Disk Space + +Use a smaller dataset (3m or 6m) or clean up: +```bash +docker compose down -v +docker system prune -a +``` + +### Queries Too Slow + +1. Check if indexes exist: +```sql +\di bookings.* +``` + +2. Analyze query plans: +```sql +EXPLAIN (ANALYZE, BUFFERS) +SELECT * FROM bookings.timetable +WHERE departure_airport = 'SVO' + AND scheduled_departure::date = DATE '2025-10-07'; +``` + +3. Consider adding indexes (note: this changes the benchmark): +```sql +CREATE INDEX idx_flights_dep_time ON bookings.flights(scheduled_departure); +CREATE INDEX idx_flights_route ON bookings.flights(route_no); +``` + +## Stopping and Cleaning Up + +### Stop (preserve data) +```bash +docker compose down +``` + +### Stop and remove all data +```bash +docker compose down -v +``` + +## References + +- [PostgresPro Airlines Demo Database](https://postgrespro.ru/education/demodb) +- [PostgreSQL 18 Documentation](https://www.postgresql.org/docs/18/) +- [Docker Compose Documentation](https://docs.docker.com/compose/) +- [Doublets Documentation](https://github.com/linksplatform/Data.Doublets) diff --git a/sql/10_timetable_queries.sql b/sql/10_timetable_queries.sql new file mode 100644 index 0000000..89bc49d --- /dev/null +++ b/sql/10_timetable_queries.sql @@ -0,0 +1,271 @@ +-- ============================================================================ +-- PostgresPro Airlines Demo - Timetable Queries +-- ============================================================================ +-- These queries demonstrate flight timetable operations using the Airlines +-- demo database. They include both queries using the built-in timetable view +-- and manual queries with explicit validity checks. 
+-- +-- Prerequisites: +-- - PostgreSQL 18 +-- - Airlines demo database loaded (demo database) +-- - bookings schema with flights, routes, airports tables +-- ============================================================================ + +\timing on + +-- ============================================================================ +-- Query 1: Departures from an Airport (Using Timetable View) +-- ============================================================================ +-- Get all departures from Sheremetyevo (SVO) on a specific date +-- The timetable view handles timezone conversions and validity checks + +\echo '=== Query 1: Departures from SVO on 2025-10-07 (using view) ===' +SELECT + flight_id, + route_no, + departure_airport, + arrival_airport, + scheduled_departure, + scheduled_arrival, + status +FROM bookings.timetable +WHERE departure_airport = 'SVO' + AND (scheduled_departure AT TIME ZONE 'UTC')::date = DATE '2025-10-07' +ORDER BY scheduled_departure; + +-- ============================================================================ +-- Query 2: Arrivals to an Airport (Using Timetable View) +-- ============================================================================ +-- Get all arrivals to Sheremetyevo (SVO) on a specific date + +\echo '=== Query 2: Arrivals to SVO on 2025-10-07 (using view) ===' +SELECT + flight_id, + route_no, + departure_airport, + arrival_airport, + scheduled_departure, + scheduled_arrival, + status +FROM bookings.timetable +WHERE arrival_airport = 'SVO' + AND (scheduled_arrival AT TIME ZONE 'UTC')::date = DATE '2025-10-07' +ORDER BY scheduled_arrival; + +-- ============================================================================ +-- Query 3: Next Available Flight on a Route +-- ============================================================================ +-- Find the next available flight from Yekaterinburg (SVX) to Wuhan (WUH) +-- after the current model time + +\echo '=== Query 3: Next available flight SVX -> WUH after 
model time ===' +SELECT + flight_id, + route_no, + departure_airport, + arrival_airport, + scheduled_departure, + scheduled_arrival, + status, + bookings.now() AS model_time +FROM bookings.timetable +WHERE departure_airport = 'SVX' + AND arrival_airport = 'WUH' + AND scheduled_departure > bookings.now() +ORDER BY scheduled_departure +LIMIT 1; + +-- ============================================================================ +-- Query 4: Manual Timetable (Without View, Explicit Validity Check) +-- ============================================================================ +-- This demonstrates the manual approach without using the timetable view. +-- IMPORTANT: Always include the validity check when joining routes to flights! +-- The validity check ensures the route was valid at the flight's scheduled time. + +\echo '=== Query 4: Manual departures from SVO (explicit validity) ===' +SELECT + f.flight_id, + r.route_no, + r.departure_airport, + r.arrival_airport, + f.status, + f.scheduled_departure, + f.scheduled_arrival, + r.validity AS route_validity_period +FROM bookings.flights AS f +JOIN bookings.routes AS r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure -- CRITICAL: validity check! 
+WHERE r.departure_airport = 'SVO' + AND f.scheduled_departure::date = DATE '2025-10-07' +ORDER BY f.scheduled_departure; + +-- ============================================================================ +-- Query 5: Manual Arrivals (Explicit Validity Check) +-- ============================================================================ +-- Same as Query 4 but for arrivals + +\echo '=== Query 5: Manual arrivals to SVO (explicit validity) ===' +SELECT + f.flight_id, + r.route_no, + r.departure_airport, + r.arrival_airport, + f.status, + f.scheduled_departure, + f.scheduled_arrival, + r.validity AS route_validity_period +FROM bookings.flights AS f +JOIN bookings.routes AS r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure -- CRITICAL: validity check! +WHERE r.arrival_airport = 'SVO' + AND f.scheduled_arrival::date = DATE '2025-10-07' +ORDER BY f.scheduled_arrival; + +-- ============================================================================ +-- Query 6: Flights by Route Number with Full Details +-- ============================================================================ +-- Get all flights for a specific route with aircraft and airport details + +\echo '=== Query 6: All flights for a specific route with details ===' +SELECT + f.flight_id, + f.route_no, + dep.airport_code AS dep_code, + dep.airport_name->>'en' AS dep_name, + dep.city->>'en' AS dep_city, + arr.airport_code AS arr_code, + arr.airport_name->>'en' AS arr_name, + arr.city->>'en' AS arr_city, + f.scheduled_departure, + f.scheduled_arrival, + f.status, + a.aircraft_code, + a.model->>'en' AS aircraft_model +FROM bookings.flights f +JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure -- validity check +JOIN bookings.airports dep ON dep.airport_code = r.departure_airport +JOIN bookings.airports arr ON arr.airport_code = r.arrival_airport +LEFT JOIN bookings.aircraft a ON a.aircraft_code = f.aircraft_code +WHERE f.route_no = 'PG0001' 
+ORDER BY f.scheduled_departure +LIMIT 20; + +-- ============================================================================ +-- Query 7: Count Flights by Status for an Airport +-- ============================================================================ +-- Aggregate query: count flights by status for a specific departure airport + +\echo '=== Query 7: Flight status distribution for departures from SVO ===' +SELECT + f.status, + COUNT(*) AS flight_count, + COUNT(DISTINCT f.route_no) AS unique_routes +FROM bookings.flights f +JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE r.departure_airport = 'SVO' +GROUP BY f.status +ORDER BY flight_count DESC; + +-- ============================================================================ +-- Query 8: Busiest Routes from an Airport +-- ============================================================================ +-- Find the busiest routes (most flights) from a specific airport + +\echo '=== Query 8: Top 10 busiest routes from SVO ===' +SELECT + r.departure_airport, + r.arrival_airport, + arr.city->>'en' AS destination_city, + COUNT(*) AS num_flights, + COUNT(DISTINCT f.aircraft_code) AS num_aircraft_types +FROM bookings.flights f +JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +JOIN bookings.airports arr ON arr.airport_code = r.arrival_airport +WHERE r.departure_airport = 'SVO' +GROUP BY r.departure_airport, r.arrival_airport, arr.city +ORDER BY num_flights DESC +LIMIT 10; + +-- ============================================================================ +-- Query 9: Flights by Date Range +-- ============================================================================ +-- Get all flights in a specific date range for a route + +\echo '=== Query 9: Flights in date range (2025-10-01 to 2025-10-07) ===' +SELECT + DATE(f.scheduled_departure) AS flight_date, + COUNT(*) AS num_flights, + COUNT(DISTINCT r.route_no) AS num_routes, 
+ COUNT(DISTINCT r.departure_airport) AS num_dep_airports, + COUNT(DISTINCT r.arrival_airport) AS num_arr_airports +FROM bookings.flights f +JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE f.scheduled_departure >= DATE '2025-10-01' + AND f.scheduled_departure < DATE '2025-10-08' +GROUP BY DATE(f.scheduled_departure) +ORDER BY flight_date; + +-- ============================================================================ +-- Query 10: EXPLAIN ANALYZE Example +-- ============================================================================ +-- This query shows how to profile query performance +-- Use this pattern for benchmark measurements + +\echo '=== Query 10: EXPLAIN ANALYZE for departures ===' +EXPLAIN (ANALYZE, BUFFERS, TIMING) +SELECT + f.flight_id, + r.route_no, + r.departure_airport, + r.arrival_airport, + f.scheduled_departure +FROM bookings.flights AS f +JOIN bookings.routes AS r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE r.departure_airport = 'SVO' + AND f.scheduled_departure::date = DATE '2025-10-07' +ORDER BY f.scheduled_departure; + +-- ============================================================================ +-- Validation Queries +-- ============================================================================ + +\echo '=== Validation: Check model time ===' +SELECT bookings.now() AS current_model_time; + +\echo '=== Validation: Check if routes/flights validity is respected ===' +-- This should return 0 (no flights without valid routes) +SELECT COUNT(*) AS invalid_flights +FROM bookings.flights f +LEFT JOIN bookings.routes r + ON r.route_no = f.route_no + AND r.validity @> f.scheduled_departure +WHERE r.route_no IS NULL; + +\echo '=== Validation: Date range in dataset ===' +SELECT + MIN(scheduled_departure) AS earliest_flight, + MAX(scheduled_arrival) AS latest_flight, + MAX(scheduled_arrival) - MIN(scheduled_departure) AS total_span +FROM bookings.flights; + 
+\echo '=== Validation: Count airports and routes ===' +SELECT + (SELECT COUNT(DISTINCT airport_code) FROM bookings.airports) AS num_airports, + (SELECT COUNT(*) FROM bookings.routes) AS num_routes, + (SELECT COUNT(*) FROM bookings.flights) AS num_flights; + +\timing off + +\echo '=== All queries completed ==='