**Summarize which destinations consistently show more difficulty**

In [2]:
import duckdb
# Open the project's DuckDB database file; `con` is the shared connection
# that the query cells below run against.
con = duckdb.connect(database='../sql_databases/skyhack.duckdb')

# List the tables available in the database as a DataFrame.
con.execute("SHOW TABLES").fetchdf()

Unnamed: 0,name
0,PNRFlight
1,airports_cleaned
2,bagsData_cleaned
3,flight_difficulty_ml
4,flights_cleaned
5,pnr_remarks_cleaned


In [3]:
# Inspect the column names and types of flight_difficulty_ml before joining on it.
con.execute(
    "PRAGMA table_info('flight_difficulty_ml');"
).fetchdf()


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,company_id,VARCHAR,False,,False
1,1,flight_number,VARCHAR,False,,False
2,2,dep_date,DATE,False,,False
3,3,carrier,VARCHAR,False,,False
4,4,total_seats,BIGINT,False,,False
5,5,pax_total,HUGEINT,False,,False
6,6,pax_load_factor,DOUBLE,False,,False
7,7,ssr_total,BIGINT,False,,False
8,8,minimum_turn_minutes,BIGINT,False,,False
9,9,transfer_to_checked_ratio,DOUBLE,False,,False


In [4]:
# Build (or rebuild) the table flight_difficulty_ml_enriched: every row of
# flight_difficulty_ml, plus the origin and destination station codes taken
# from flights_cleaned.
#
# The join key is (company_id, flight_number, departure date). Both sides are
# upper-cased and trimmed, and flights_cleaned.flight_number is cast to VARCHAR,
# so the comparison does not depend on case, padding, or column type.
# It is a LEFT JOIN, so difficulty rows without a matching flight are kept
# with NULL origin/destination.
#
# NOTE(review): if flights_cleaned holds more than one row for the same
# (company_id, flight_number, date), this join duplicates the difficulty row.
# Confirm key uniqueness in flights_cleaned.
sql_query = '''--joining with flight data
CREATE OR REPLACE TABLE flight_difficulty_ml_enriched AS
SELECT
  d.company_id,
  d.flight_number,
  d.dep_date,
  f.scheduled_departure_station_code AS origin,
  f.scheduled_arrival_station_code   AS destination,
  d.carrier,
  d.total_seats,
  d.pax_total,
  d.pax_load_factor,
  d.ssr_total,
  d.minimum_turn_minutes,
  d.transfer_to_checked_ratio,
  d.sched_ground,
  d.actual_ground,
  d.dep_delay_mins,
  d.ground_diff_mins,
  d.difficulty_index,
  d.difficulty_class
FROM flight_difficulty_ml d
LEFT JOIN flights_cleaned f
  ON UPPER(TRIM(d.company_id)) = UPPER(TRIM(f.company_id))
  AND UPPER(TRIM(d.flight_number)) = UPPER(TRIM(CAST(f.flight_number AS VARCHAR)))
  AND CAST(d.dep_date AS DATE) = CAST(f.scheduled_departure_date_local AS DATE)
;''' 

# Run the statement and display what DuckDB returns for it as a DataFrame.
con.execute(sql_query).df()


Unnamed: 0,Count
0,8069


In [5]:
# Sanity-check the join that produced flight_difficulty_ml_enriched:
#   total_rows   - number of rows in the enriched table
#   matched_rows - rows whose destination is not NULL (a flight was found)
#   pct_matched  - matched_rows as a percentage of total_rows, 2 decimals
#
# NOTE(review): this detects unmatched rows only. It would not reveal rows
# duplicated by the join; comparing total_rows with the row count of
# flight_difficulty_ml would cover that.
sql_1 = '''SELECT
  COUNT(*) AS total_rows,
  SUM(CASE WHEN destination IS NOT NULL THEN 1 ELSE 0 END) AS matched_rows,
  ROUND(100.0 * SUM(CASE WHEN destination IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*),2) AS pct_matched
FROM flight_difficulty_ml_enriched;
''' 
con.execute(sql_1).df()

Unnamed: 0,total_rows,matched_rows,pct_matched
0,8069,8069.0,100.0


In [6]:
# Top 10 destinations by mean difficulty index.
#
# For each destination with a known station code and more than 10 flights,
# report: flight count, mean difficulty_index, mean load factor, mean SSR
# count, mean transfer-to-checked ratio, and the percentage of flights whose
# difficulty_class is 'Difficult'.
#
# Ordering uses the unrounded AVG(difficulty_index) with destination as a
# tiebreaker. Sorting on the 2-decimal rounded alias alone leaves ties in
# arbitrary order, so the rows kept by LIMIT 10 could change between runs.
#
# NOTE(review): a mean-based ranking can be pulled up by a few extreme
# flights; read avg_difficulty_index together with pct_difficult before
# calling a destination "consistently" difficult.
final_query = '''SELECT
  destination,
  COUNT(*) AS total_flights,
  ROUND(AVG(difficulty_index), 2) AS avg_difficulty_index,
  ROUND(AVG(pax_load_factor), 3) AS avg_load_factor,
  ROUND(AVG(ssr_total), 3) AS avg_ssr,
  ROUND(AVG(transfer_to_checked_ratio), 3) AS avg_transfer_ratio,
  ROUND(100.0 * SUM(CASE WHEN difficulty_class = 'Difficult' THEN 1 ELSE 0 END) / COUNT(*), 2) AS pct_difficult
FROM flight_difficulty_ml_enriched
WHERE destination IS NOT NULL
GROUP BY destination
HAVING COUNT(*) > 10
ORDER BY AVG(difficulty_index) DESC, destination
LIMIT 10;
'''
con.execute(final_query).df()

Unnamed: 0,destination,total_flights,avg_difficulty_index,avg_load_factor,avg_ssr,avg_transfer_ratio,pct_difficult
0,ASE,15,185.63,0.963,1.333,1.253,40.0
1,LHR,45,87.56,1.056,3.489,0.703,66.67
2,GRU,15,78.83,1.049,11.267,2.137,73.33
3,ONT,15,66.13,1.002,4.733,0.889,86.67
4,BCN,15,64.67,0.93,3.733,1.587,66.67
5,CDG,15,64.3,1.058,5.2,0.52,73.33
6,LAF,17,62.82,0.352,0.176,1.205,35.29
7,RNO,15,61.7,1.058,2.733,1.058,60.0
8,JLN,15,60.47,0.904,1.267,2.282,46.67
9,CAE,30,60.3,0.997,1.033,1.936,26.67
