**NEW DATASET CREATION**

In [2]:
import duckdb
con = duckdb.connect('../sql_databases/skyhack.duckdb')
con.execute("SHOW TABLES").df()

Unnamed: 0,name
0,PNRFlight
1,airports_cleaned
2,bagsData_cleaned
3,flight_difficulty_ml
4,flights_cleaned
5,pnr_remarks_cleaned


In [3]:
sql_metric = '''WITH flight_metrics AS (
  SELECT
    CAST(flight_number AS VARCHAR) AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    ROUND((epoch(actual_departure_datetime_local) - epoch(scheduled_departure_datetime_local)) / 60.0, 1) AS dep_delay_mins,
    COALESCE(scheduled_ground_time_minutes, 0) AS sched_ground,
    COALESCE(actual_ground_time_minutes, 0) AS actual_ground,
    ROUND((actual_ground_time_minutes - scheduled_ground_time_minutes), 1) AS ground_diff_mins,
    ROUND((epoch(actual_departure_datetime_local) - epoch(scheduled_departure_datetime_local)) / 60.0 
          + 0.5 * (actual_ground_time_minutes - scheduled_ground_time_minutes), 1) AS difficulty_index
  FROM flights_cleaned
  WHERE scheduled_departure_datetime_local IS NOT NULL
)
SELECT
  COUNT(*) AS total_flights,
  ROUND(MIN(difficulty_index), 2) AS min_diff_index,
  ROUND(MAX(difficulty_index), 2) AS max_diff_index,
  ROUND(AVG(difficulty_index), 2) AS avg_diff_index,
  ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY difficulty_index), 2) AS p25,
  ROUND(PERCENTILE_CONT(0.33) WITHIN GROUP (ORDER BY difficulty_index), 2) AS p33,
  ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY difficulty_index), 2) AS median,
  ROUND(PERCENTILE_CONT(0.66) WITHIN GROUP (ORDER BY difficulty_index), 2) AS p66,
  ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY difficulty_index), 2) AS p75,
  ROUND(PERCENTILE_CONT(0.9)  WITHIN GROUP (ORDER BY difficulty_index), 2) AS p90
FROM flight_metrics; ''' 

con.execute(sql_metric).df()


Unnamed: 0,total_flights,min_diff_index,max_diff_index,avg_diff_index,p25,p33,median,p66,p75,p90
0,8065,-831.0,1395.5,22.95,-5.0,-1.5,4.5,13.0,22.5,84.0


In [4]:
sql_new_table = '''
CREATE OR REPLACE TABLE flight_difficulty_ml AS
WITH base AS (
  SELECT
    f.company_id,
    CAST(f.flight_number AS VARCHAR) AS flight_number,
    CAST(f.scheduled_departure_date_local AS DATE) AS dep_date,
    UPPER(TRIM(f.carrier)) AS carrier,
    COALESCE(f.total_seats, 0) AS total_seats,
    COALESCE(f.scheduled_ground_time_minutes, 0) AS sched_ground,
    COALESCE(f.actual_ground_time_minutes, 0) AS actual_ground,
    COALESCE(f.minimum_turn_minutes, 0) AS min_turn,
    ROUND((epoch(f.actual_departure_datetime_local) - epoch(f.scheduled_departure_datetime_local))/60.0,1) AS dep_delay_mins,
    ROUND((f.actual_ground_time_minutes - f.scheduled_ground_time_minutes),1) AS ground_diff_mins,
    ROUND((epoch(f.actual_departure_datetime_local) - epoch(f.scheduled_departure_datetime_local))/60.0 
          + 0.5*(f.actual_ground_time_minutes - f.scheduled_ground_time_minutes),1) AS difficulty_index
  FROM flights_cleaned f
  WHERE f.scheduled_departure_datetime_local IS NOT NULL
),
pnr AS (
  SELECT
    CAST(flight_number AS VARCHAR) AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    SUM(COALESCE(total_pax,0)) AS pax_total
  FROM pnrFlight
  GROUP BY 1,2
),
bags AS (
  SELECT
    CAST(flight_number AS VARCHAR) AS flight_number,
    CAST(scheduled_departure_date_local AS DATE) AS dep_date,
    SUM(CASE WHEN UPPER(bag_type) LIKE 'TRANSFER%' THEN 1 ELSE 0 END) AS transfer_bags,
    SUM(CASE WHEN UPPER(bag_type) LIKE 'ORIGIN%' THEN 1 ELSE 0 END) AS checked_bags
  FROM bagsData_cleaned
  GROUP BY 1,2
),
ssr AS (
  SELECT
    CAST(r.flight_number AS VARCHAR) AS flight_number,
    CAST(COALESCE(pf.scheduled_departure_date_local, r.pnr_creation_date) AS DATE) AS dep_date,
    COUNT(*) AS ssr_total
  FROM pnr_remarks_cleaned r
  LEFT JOIN pnrFlight pf 
    ON r.record_locator = pf.record_locator 
   AND r.flight_number = pf.flight_number
  GROUP BY 1,2
)
SELECT
  b.company_id,
  b.flight_number,
  b.dep_date,
  b.carrier,
  COALESCE(b.total_seats,0) AS total_seats,
  COALESCE(p.pax_total,0) AS pax_total,
  CASE WHEN b.total_seats>0 THEN CAST(p.pax_total AS DOUBLE)/b.total_seats ELSE 0 END AS pax_load_factor,
  COALESCE(s.ssr_total,0) AS ssr_total,
  COALESCE(b.min_turn,0) AS minimum_turn_minutes,
  CASE WHEN b2.checked_bags>0 THEN CAST(b2.transfer_bags AS DOUBLE)/b2.checked_bags ELSE 0 END AS transfer_to_checked_ratio,
  b.sched_ground,
  b.actual_ground,
  b.dep_delay_mins,
  b.ground_diff_mins,
  b.difficulty_index,
  CASE 
    WHEN b.difficulty_index <= -1.5 THEN 'Easy'
    WHEN b.difficulty_index <= 13 THEN 'Medium'
    ELSE 'Difficult'
  END AS difficulty_class
FROM base b
LEFT JOIN pnr p
  ON b.flight_number = p.flight_number AND b.dep_date = p.dep_date
LEFT JOIN bags b2
  ON b.flight_number = b2.flight_number AND b.dep_date = b2.dep_date
LEFT JOIN ssr s
  ON b.flight_number = s.flight_number AND b.dep_date = s.dep_date;
'''
con.execute(sql_new_table).df()



Unnamed: 0,Count
0,8065


In [5]:
con.execute("SELECT difficulty_class, COUNT(*) AS flights, ROUND(100.0*COUNT(*)/SUM(COUNT(*)) OVER(),2) AS pct FROM flight_difficulty_ml GROUP BY difficulty_class;").df()


Unnamed: 0,difficulty_class,flights,pct
0,Medium,2606,32.31
1,Easy,2759,34.21
2,Difficult,2700,33.48


In [6]:
sql_check ='''SELECT *
FROM flight_difficulty_ml
LIMIT 20;'''  
con.execute(sql_check).df()

Unnamed: 0,company_id,flight_number,dep_date,carrier,total_seats,pax_total,pax_load_factor,ssr_total,minimum_turn_minutes,transfer_to_checked_ratio,sched_ground,actual_ground,dep_delay_mins,ground_diff_mins,difficulty_index,difficulty_class
0,UA,556,2025-08-14,MAINLINE,179,123.0,0.687151,2,56,2.0,447,456,-7.0,9,-2.5,Easy
1,UA,1859,2025-08-13,MAINLINE,126,132.0,1.047619,2,43,0.793103,65,100,7.0,35,24.5,Difficult
2,G7,4387,2025-08-13,EXPRESS,50,49.0,0.98,1,29,8.5,96,54,170.0,-42,149.0,Difficult
3,UA,2278,2025-08-13,MAINLINE,200,222.0,1.11,7,62,1.975,80,102,0.0,22,11.0,Medium
4,G7,4482,2025-08-13,EXPRESS,50,50.0,1.0,1,29,12.0,40,34,-1.0,-6,-4.0,Easy
5,UA,881,2025-08-13,MAINLINE,318,227.0,0.713836,5,155,2.657143,365,376,-1.0,11,4.5,Medium
6,UA,2616,2025-08-13,MAINLINE,166,175.0,1.054217,4,51,2.409091,63,90,2.0,27,15.5,Difficult
7,G7,4429,2025-08-13,EXPRESS,50,58.0,1.16,2,29,26.0,62,67,-8.0,5,-5.5,Easy
8,UA,2665,2025-08-13,MAINLINE,179,136.0,0.759777,1,56,0.892857,72,63,0.0,-9,-4.5,Easy
9,UA,1793,2025-08-13,MAINLINE,126,132.0,1.047619,2,51,0.65625,51,56,-6.0,5,-3.5,Easy


In [7]:
con.close()