**How do passenger loads compare across flights, and do higher loads correlate with operational difficulty?**

In [36]:
import duckdb
# Open the project database read-only: every query in this notebook is a
# SELECT, so a read-only handle rules out accidental writes and allows other
# read-only sessions to attach to the same file at the same time.
con = duckdb.connect('../sql_databases/skyhack.duckdb', read_only=True)
# List the available tables as a sanity check before querying them.
con.execute("SHOW TABLES").df()

Unnamed: 0,name
0,PNRFlight
1,airports_cleaned
2,bagsData_cleaned
3,flights_cleaned
4,pnr_remarks_cleaned


In [None]:
# Q4: compare operational metrics across passenger-load buckets.
#
# Pipeline (one CTE per step):
#   flight_norm - distinct flights from flights_cleaned with upper-cased/trimmed
#                 key columns; rows without a scheduled departure datetime are
#                 dropped.
#   pnr_agg     - total_pax summed per flight key from pnrFlight.
#   delays      - flights LEFT JOINed to their passenger totals, plus:
#                   load_factor    = pax_total / total_seats (NULL when seats <= 0
#                                    or when the flight has no PNR rows),
#                   dep_delay_mins = actual - scheduled departure, in minutes,
#                   ground_time_diff_mins / ground_time_ratio = actual vs
#                                    scheduled ground time.
#   bins        - load_factor bucketed as LOW (<0.6), MEDIUM (0.6-0.85 inclusive),
#                 HIGH (>0.85) or UNKNOWN (NULL load_factor).
#
# The final SELECT keeps only flights with a known departure delay and reports,
# per bucket: flight count, average delay, share of flights with delay > 0,
# average scheduled/actual ground time and their difference, and the share of
# flights whose actual ground time is below the minimum turn time (computed over
# flights where both values are present).
#
# Fix: load_factor used COALESCE(p.pax_total, 0), so flights with no matching PNR
# rows were reported as 0% full and counted in the LOW bucket. They now get a
# NULL load_factor and fall into UNKNOWN, consistent with the by-carrier query.
sql_query_q4 = '''
WITH flight_norm AS (
  SELECT DISTINCT
    UPPER(TRIM(company_id))                      AS company_id,
    CAST(flight_number AS VARCHAR)               AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code))   AS arr_iata,
    COALESCE(total_seats, 0)                     AS total_seats,
    CAST(scheduled_departure_datetime_local AS TIMESTAMP) AS sched_dep_dt,
    CAST(actual_departure_datetime_local AS TIMESTAMP)    AS actual_dep_dt,
    COALESCE(scheduled_ground_time_minutes, NULL) AS scheduled_ground_time_minutes,
    COALESCE(actual_ground_time_minutes, NULL)    AS actual_ground_time_minutes,
    COALESCE(minimum_turn_minutes, NULL)          AS minimum_turn_minutes
  FROM flights_cleaned
  WHERE scheduled_departure_datetime_local IS NOT NULL
),
pnr_agg AS (
  SELECT
    UPPER(TRIM(company_id))                      AS company_id,
    CAST(flight_number AS VARCHAR)               AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code))   AS arr_iata,
    SUM(COALESCE(total_pax,0))                   AS pax_total
  FROM pnrFlight
  GROUP BY 1,2,3,4,5
),
delays AS (
  SELECT
    f.company_id,
    f.flight_number,
    f.dep_date,
    f.dep_iata,
    f.arr_iata,
    f.total_seats,
    COALESCE(p.pax_total, 0) AS pax_total,
    -- No COALESCE here: a flight with no PNR rows has an unknown load, not a 0% load.
    CASE WHEN f.total_seats > 0 THEN CAST(p.pax_total AS DOUBLE) / f.total_seats ELSE NULL END AS load_factor,
    ROUND((epoch(f.actual_dep_dt) - epoch(f.sched_dep_dt)) / 60.0, 1) AS dep_delay_mins,
    f.scheduled_ground_time_minutes,
    f.actual_ground_time_minutes,
    CASE
      WHEN f.scheduled_ground_time_minutes IS NOT NULL AND f.actual_ground_time_minutes IS NOT NULL
      THEN ROUND((f.actual_ground_time_minutes - f.scheduled_ground_time_minutes), 1)
    END AS ground_time_diff_mins,
    CASE
      WHEN f.scheduled_ground_time_minutes IS NOT NULL AND f.scheduled_ground_time_minutes != 0
      THEN ROUND((CAST(f.actual_ground_time_minutes AS DOUBLE) / f.scheduled_ground_time_minutes), 3)
    END AS ground_time_ratio,
    f.minimum_turn_minutes
  FROM flight_norm f
  LEFT JOIN pnr_agg p
    ON f.company_id = p.company_id
    AND f.flight_number = p.flight_number
    AND f.dep_date = p.dep_date
    AND f.dep_iata = p.dep_iata
    AND f.arr_iata = p.arr_iata
),
bins AS (
  SELECT *,
    CASE
      WHEN load_factor IS NULL THEN 'UNKNOWN'
      WHEN load_factor < 0.6 THEN 'LOW (<60%)'
      WHEN load_factor BETWEEN 0.6 AND 0.85 THEN 'MEDIUM (60–85%)'
      WHEN load_factor > 0.85 THEN 'HIGH (>85%)'
    END AS load_bin
  FROM delays
)
SELECT
  load_bin,
  COUNT(*) AS flight_count,
  ROUND(AVG(dep_delay_mins), 2) AS avg_delay_mins,
  ROUND(100.0 * SUM(CASE WHEN dep_delay_mins > 0 THEN 1 ELSE 0 END) / NULLIF(COUNT(dep_delay_mins),0), 2) AS pct_late,
  ROUND(AVG(scheduled_ground_time_minutes), 1) AS avg_scheduled_ground_time_mins,
  ROUND(AVG(actual_ground_time_minutes), 1)    AS avg_actual_ground_time_mins,
  ROUND(AVG(ground_time_diff_mins), 1)         AS avg_ground_time_diff_mins,
  ROUND(100.0 * SUM(CASE WHEN actual_ground_time_minutes IS NOT NULL AND minimum_turn_minutes IS NOT NULL AND actual_ground_time_minutes < minimum_turn_minutes THEN 1 ELSE 0 END) / NULLIF(SUM(CASE WHEN actual_ground_time_minutes IS NOT NULL AND minimum_turn_minutes IS NOT NULL THEN 1 ELSE 0 END),0), 2) AS pct_actual_below_min_turn
FROM bins
WHERE dep_delay_mins IS NOT NULL
GROUP BY 1
ORDER BY
  CASE load_bin
    WHEN 'LOW (<60%)' THEN 1
    WHEN 'MEDIUM (60–85%)' THEN 2
    WHEN 'HIGH (>85%)' THEN 3
    ELSE 4
  END;
'''
# pct_* columns are percentages (0-100); load_bin buckets the load factor,
# i.e. total booked passengers divided by the seats on the plane.
con.execute(sql_query_q4).df()



Unnamed: 0,load_bin,flight_count,avg_delay_mins,pct_late,avg_scheduled_ground_time_mins,avg_actual_ground_time_mins,avg_ground_time_diff_mins,pct_actual_below_min_turn
0,LOW (<60%),347,57.89,52.74,234.0,237.8,3.8,14.12
1,MEDIUM (60–85%),875,41.35,51.31,224.4,233.9,9.4,9.6
2,HIGH (>85%),6843,16.79,49.36,179.5,182.2,2.7,4.41


In [38]:
# Q4 by carrier: average departure delay per (carrier, load bucket).
#
# Pipeline (one CTE per step):
#   flight_norm - distinct flights from flights_cleaned with upper-cased/trimmed
#                 key columns plus the carrier column; rows without a scheduled
#                 departure datetime are dropped.
#   pnr_agg     - total_pax summed per flight key from pnrFlight.
#   joined      - flights LEFT JOINed to their passenger totals on the five key
#                 columns. load_factor = pax_total / total_seats when seats > 0;
#                 it is computed from the un-coalesced p.pax_total, so a flight
#                 with no matching PNR rows gets a NULL load_factor (even though
#                 the pax_total column itself is reported as 0).
#                 dep_delay_mins = actual - scheduled departure, in minutes,
#                 rounded to one decimal.
#   bins        - load_factor bucketed as LOW (<0.6), MEDIUM (0.6-0.85 inclusive),
#                 HIGH (>0.85) or UNKNOWN (NULL load_factor).
#
# The final SELECT keeps only flights with a known departure delay and returns
# the average delay and flight count per carrier and bucket, ordered by carrier
# and then LOW -> MEDIUM -> HIGH -> anything else.
sql_query_q4_by_carrier = '''
WITH flight_norm AS (
  SELECT DISTINCT
    UPPER(TRIM(company_id)) AS company_id,
    CAST(flight_number AS VARCHAR) AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code)) AS arr_iata,
    COALESCE(total_seats, 0) AS total_seats,
    CAST(scheduled_departure_datetime_local AS TIMESTAMP) AS sched_dep_dt,
    CAST(actual_departure_datetime_local AS TIMESTAMP) AS actual_dep_dt,
    carrier
  FROM flights_cleaned
  WHERE scheduled_departure_datetime_local IS NOT NULL
),
pnr_agg AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    CAST(flight_number AS VARCHAR) AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code)) AS arr_iata,
    SUM(COALESCE(total_pax,0)) AS pax_total
  FROM pnrFlight
  GROUP BY 1,2,3,4,5
),
joined AS (
  SELECT
    f.company_id,
    f.flight_number,
    f.dep_date,
    f.dep_iata,
    f.arr_iata,
    f.carrier,
    f.total_seats,
    COALESCE(p.pax_total,0) AS pax_total,
    CASE WHEN f.total_seats>0 THEN CAST(p.pax_total AS DOUBLE)/f.total_seats ELSE NULL END AS load_factor,
    ROUND((epoch(f.actual_dep_dt) - epoch(f.sched_dep_dt))/60.0,1) AS dep_delay_mins
  FROM flight_norm f
  LEFT JOIN pnr_agg p
    USING (company_id, flight_number, dep_date, dep_iata, arr_iata)
),
bins AS (
  SELECT *,
    CASE
      WHEN load_factor IS NULL THEN 'UNKNOWN'
      WHEN load_factor < 0.6 THEN 'LOW (<60%)'
      WHEN load_factor BETWEEN 0.6 AND 0.85 THEN 'MEDIUM (60–85%)'
      WHEN load_factor > 0.85 THEN 'HIGH (>85%)'
    END AS load_bin
  FROM joined
)
SELECT
  carrier,
  load_bin,
  ROUND(AVG(dep_delay_mins),2) AS avg_delay_mins,
  COUNT(*) AS flight_count
FROM bins
WHERE dep_delay_mins IS NOT NULL
GROUP BY carrier, load_bin
ORDER BY carrier,
  CASE load_bin
    WHEN 'LOW (<60%)' THEN 1
    WHEN 'MEDIUM (60–85%)' THEN 2
    WHEN 'HIGH (>85%)' THEN 3
    ELSE 4
  END;
'''
con.execute(sql_query_q4_by_carrier).df()  # run the query and display the result as a DataFrame
 


Unnamed: 0,carrier,load_bin,avg_delay_mins,flight_count
0,EXPRESS,LOW (<60%),67.06,154
1,EXPRESS,MEDIUM (60–85%),42.43,434
2,EXPRESS,HIGH (>85%),14.06,2996
3,MAINLINE,LOW (<60%),50.57,193
4,MAINLINE,MEDIUM (60–85%),40.27,441
5,MAINLINE,HIGH (>85%),18.91,3847


In [39]:
# Close the DuckDB connection opened in the first cell now that all queries have run.
con.close()