In [862]:
import pandas as pd
flights = pd.read_csv('../Data/Flight Level Data.csv')
flights.shape


(8099, 15)

In [863]:
flights.info()
flights.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8099 entries, 0 to 8098
Data columns (total 15 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   company_id                          8099 non-null   object
 1   flight_number                       8099 non-null   int64 
 2   scheduled_departure_date_local      8099 non-null   object
 3   scheduled_departure_station_code    8099 non-null   object
 4   scheduled_arrival_station_code      8099 non-null   object
 5   scheduled_departure_datetime_local  8099 non-null   object
 6   scheduled_arrival_datetime_local    8099 non-null   object
 7   actual_departure_datetime_local     8099 non-null   object
 8   actual_arrival_datetime_local       8099 non-null   object
 9   total_seats                         8099 non-null   int64 
 10  fleet_type                          8099 non-null   object
 11  carrier                             8099 non-null   obje

Index(['company_id', 'flight_number', 'scheduled_departure_date_local',
       'scheduled_departure_station_code', 'scheduled_arrival_station_code',
       'scheduled_departure_datetime_local',
       'scheduled_arrival_datetime_local', 'actual_departure_datetime_local',
       'actual_arrival_datetime_local', 'total_seats', 'fleet_type', 'carrier',
       'scheduled_ground_time_minutes', 'actual_ground_time_minutes',
       'minimum_turn_minutes'],
      dtype='object')

In [864]:
# Uppercase-safe string columns
str_cols_upper = [
    'company_id',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'carrier',
    'fleet_type'
]

str_cols_strip = [
    # keep flight_number as string for joining (avoid leading zero loss)
    'flight_number'
]

date_cols    = ['scheduled_departure_date_local']  # date only
datetime_cols = [
    'scheduled_departure_datetime_local',
    'scheduled_arrival_datetime_local',
    'actual_departure_datetime_local',
    'actual_arrival_datetime_local'
]

numeric_cols = [
    'total_seats',
    'scheduled_ground_time_minutes',
    'actual_ground_time_minutes',
    'minimum_turn_minutes'
]

# Convert flight_number to string (to avoid leading zero issues)
flights['flight_number'] = flights['flight_number'].astype(str).str.strip()

# Strip & uppercase where appropriate
for col in str_cols_upper:
    if col in flights.columns:
        flights[col] = flights[col].astype(str).str.strip().str.upper()

# Strip only (no case change)
for col in str_cols_strip:
    if col in flights.columns:
        flights[col] = flights[col].astype(str).str.strip()

# Parse date-only column to date
for col in date_cols:
    if col in flights.columns:
        flights[col] = pd.to_datetime(flights[col], errors='coerce').dt.date

# Parse datetime columns
for col in datetime_cols:
    if col in flights.columns:
        flights[col] = pd.to_datetime(flights[col], errors='coerce')

# checking missing values
missing_counts = {}

for col in flights.columns:
    if (col in str_cols_upper) or (col in str_cols_strip):
        missing_counts[col] = (flights[col].isna() | (flights[col].str.len() == 0)).sum()
    else:
        missing_counts[col] = flights[col].isna().sum()

print("\nMissing values per column:")
for k, v in missing_counts.items():
    print(f"{k}: {v}")




Missing values per column:
company_id: 0
flight_number: 0
scheduled_departure_date_local: 0
scheduled_departure_station_code: 0
scheduled_arrival_station_code: 0
scheduled_departure_datetime_local: 0
scheduled_arrival_datetime_local: 0
actual_departure_datetime_local: 0
actual_arrival_datetime_local: 0
total_seats: 0
fleet_type: 0
carrier: 0
scheduled_ground_time_minutes: 0
actual_ground_time_minutes: 0
minimum_turn_minutes: 0


In [865]:
# Count of complete duplicates
full_dup_count = flights.duplicated(keep=False).sum()
print(f"Number of complete duplicate rows: {full_dup_count}")

# View the duplicate rows themselves
full_duplicates = flights[flights.duplicated(keep=False)].sort_values(list(flights.columns))
full_duplicates.head()


Number of complete duplicate rows: 0


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,fleet_type,carrier,scheduled_ground_time_minutes,actual_ground_time_minutes,minimum_turn_minutes


In [866]:
flights.shape

(8099, 15)

In [867]:
subset_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'scheduled_arrival_datetime_local'
]

#rows that are part of duplicate groups (by your subset)
dup_mask = flights.duplicated(subset=subset_cols, keep=False)

# negative ground time
neg_mask = flights['scheduled_ground_time_minutes'] < 0

# rows to drop: negatives within duplicate groups
to_drop = dup_mask & neg_mask

print("Duplicate-group rows (any):", int(dup_mask.sum()))
print("Negative ground in duplicates:", int(to_drop.sum()))       #there were duplicates with negative ground time so they are removed

# drop them
flights = flights.loc[~to_drop].reset_index(drop=True)



Duplicate-group rows (any): 46
Negative ground in duplicates: 7


In [868]:
subset_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'scheduled_arrival_datetime_local',
    'scheduled_ground_time_minutes'                  #no duplicate when we consider scheduled_ground_time_minutes
]

# Count duplicates based on these columns
subset_dup_count = flights.duplicated(subset=subset_cols, keep=False).sum()
print(f"Number of duplicate rows based on {subset_cols}: {subset_dup_count}")

# View them
subset_duplicates = flights[flights.duplicated(subset=subset_cols, keep=False)]
subset_duplicates = subset_duplicates.sort_values(subset_cols)
subset_duplicates.head(20)

Number of duplicate rows based on ['company_id', 'flight_number', 'scheduled_departure_date_local', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'scheduled_arrival_datetime_local', 'scheduled_ground_time_minutes']: 0


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,fleet_type,carrier,scheduled_ground_time_minutes,actual_ground_time_minutes,minimum_turn_minutes


In [869]:
cols_to_check = [
    'scheduled_ground_time_minutes',
    'actual_ground_time_minutes',
    'minimum_turn_minutes'
]

for col in cols_to_check:
    neg_count = (flights[col] < 0).sum()
    print(f"{col}: {neg_count} negative values")

# To inspect the rows if any negatives found:
for col in cols_to_check:
    negative_rows = flights[flights[col] < 0]
    if not negative_rows.empty:
        print(f"\nNegative values in {col}:")
        display(negative_rows[[col] + ['flight_number', 'scheduled_departure_date_local']].head(10))


scheduled_ground_time_minutes: 298 negative values
actual_ground_time_minutes: 0 negative values
minimum_turn_minutes: 0 negative values

Negative values in scheduled_ground_time_minutes:


Unnamed: 0,scheduled_ground_time_minutes,flight_number,scheduled_departure_date_local
38,-5,4546,2025-08-06
98,-20,4590,2025-08-14
105,-28,5521,2025-08-01
111,-7,5790,2025-08-01
155,-5,5309,2025-08-11
246,-26,2652,2025-08-04
264,-22,5843,2025-08-15
280,-18,1564,2025-08-07
281,-50,1030,2025-08-04
296,-25,4789,2025-08-11


In [870]:
# Replace negative scheduled_ground_time_minutes with 0
flights.loc[flights['scheduled_ground_time_minutes'] < 0, 'scheduled_ground_time_minutes'] = 0

# Verify
print("Remaining negative scheduled ground times:",
      (flights['scheduled_ground_time_minutes'] < 0).sum())


Remaining negative scheduled ground times: 0


In [871]:
flights.shape

(8092, 15)

In [872]:
#check for invalid scheduled
invalid_sched_mask = flights['scheduled_departure_datetime_local'] >= flights['scheduled_arrival_datetime_local']

invalid_sched_count = invalid_sched_mask.sum()
print(f"Number of rows with scheduled_departure >= scheduled_arrival: {invalid_sched_count}")

invalid_sched_rows = flights.loc[invalid_sched_mask, [
    'company_id', 'flight_number', 'scheduled_departure_date_local',
    'scheduled_departure_datetime_local', 'scheduled_arrival_datetime_local'
]]
print("\nExamples of invalid scheduled datetime rows:")
display(invalid_sched_rows.head(10))

#check for invalid actual
invalid_actual_mask = flights['actual_departure_datetime_local'] >= flights['actual_arrival_datetime_local']

invalid_actual_count = invalid_actual_mask.sum()
print(f"\nNumber of rows with actual_departure >= actual_arrival: {invalid_actual_count}")

invalid_actual_rows = flights.loc[invalid_actual_mask, [
    'company_id', 'flight_number', 'scheduled_departure_date_local',
    'actual_departure_datetime_local', 'actual_arrival_datetime_local'
]]
print("\nExamples of invalid actual datetime rows:")
display(invalid_actual_rows.head(10))

#dropping both
combined_invalid_mask = invalid_sched_mask | invalid_actual_mask
flights = flights.loc[~combined_invalid_mask].reset_index(drop=True)

print(f"\nShape after dropping invalid scheduled/actual datetime rows: {flights.shape}")


Number of rows with scheduled_departure >= scheduled_arrival: 17

Examples of invalid scheduled datetime rows:


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_datetime_local,scheduled_arrival_datetime_local
766,UA,2635,2025-08-01,2025-08-01 19:13:00+00:00,2025-08-01 17:18:00+00:00
1658,UA,1775,2025-08-04,2025-08-04 17:31:00+00:00,2025-08-04 15:03:00+00:00
1906,OO,5251,2025-08-11,2025-08-11 11:16:00+00:00,2025-08-10 21:44:00+00:00
2661,G7,4482,2025-08-11,2025-08-11 19:40:00+00:00,2025-08-11 17:35:00+00:00
3478,UA,2635,2025-08-01,2025-08-01 19:13:00+00:00,2025-08-01 17:18:00+00:00
3739,UA,1857,2025-08-07,2025-08-07 20:11:00+00:00,2025-08-07 15:14:00+00:00
3842,UA,1270,2025-08-10,2025-08-10 16:25:00+00:00,2025-08-10 15:05:00+00:00
4824,UA,1857,2025-08-07,2025-08-07 20:11:00+00:00,2025-08-07 15:14:00+00:00
5416,UA,2144,2025-08-12,2025-08-12 18:34:00+00:00,2025-08-12 15:46:00+00:00
5427,G7,4597,2025-08-11,2025-08-11 11:26:00+00:00,2025-08-11 10:14:00+00:00



Number of rows with actual_departure >= actual_arrival: 0

Examples of invalid actual datetime rows:


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,actual_departure_datetime_local,actual_arrival_datetime_local



Shape after dropping invalid scheduled/actual datetime rows: (8075, 15)


In [873]:
subset_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'scheduled_arrival_datetime_local'
]

# Count duplicates based on these columns
subset_dup_count = flights.duplicated(subset=subset_cols, keep=False).sum()
print(f"Number of duplicate rows based on {subset_cols}: {subset_dup_count}")

# View them
subset_duplicates = flights[flights.duplicated(subset=subset_cols, keep=False)]
subset_duplicates = subset_duplicates.sort_values(subset_cols)
subset_duplicates.head(32)     


Number of duplicate rows based on ['company_id', 'flight_number', 'scheduled_departure_date_local', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'scheduled_arrival_datetime_local']: 20


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,fleet_type,carrier,scheduled_ground_time_minutes,actual_ground_time_minutes,minimum_turn_minutes
6049,OO,5021,2025-08-04,ORD,SUX,2025-08-04 20:50:00+00:00,2025-08-04 22:40:00+00:00,2025-08-04 20:44:00+00:00,2025-08-04 22:40:00+00:00,50,CRJ-200,EXPRESS,20,668,29
6664,OO,5021,2025-08-04,ORD,SUX,2025-08-04 20:50:00+00:00,2025-08-04 22:40:00+00:00,2025-08-04 20:44:00+00:00,2025-08-04 22:40:00+00:00,50,CRJ-200,EXPRESS,120,61,29
302,OO,5565,2025-08-13,ORD,CHO,2025-08-13 14:04:00+00:00,2025-08-13 17:43:00+00:00,2025-08-13 14:15:00+00:00,2025-08-13 17:43:00+00:00,50,CRJ-550,EXPRESS,20,38,29
7106,OO,5565,2025-08-13,ORD,CHO,2025-08-13 14:04:00+00:00,2025-08-13 17:43:00+00:00,2025-08-13 14:15:00+00:00,2025-08-13 17:43:00+00:00,50,CRJ-550,EXPRESS,41,77,29
471,UA,1775,2025-08-04,ORD,FLL,2025-08-04 10:45:00+00:00,2025-08-04 17:01:00+00:00,2025-08-04 10:44:00+00:00,2025-08-04 17:01:00+00:00,179,B737-MAX9,MAINLINE,743,768,56
7714,UA,1775,2025-08-04,ORD,FLL,2025-08-04 10:45:00+00:00,2025-08-04 17:01:00+00:00,2025-08-04 10:44:00+00:00,2025-08-04 17:01:00+00:00,179,B737-MAX9,MAINLINE,20,30,56
1208,UA,1857,2025-08-07,ORD,MIA,2025-08-07 10:54:00+00:00,2025-08-07 15:44:00+00:00,2025-08-07 10:53:00+00:00,2025-08-07 15:44:00+00:00,166,B737-800,MAINLINE,20,267,51
3650,UA,1857,2025-08-07,ORD,MIA,2025-08-07 10:54:00+00:00,2025-08-07 15:44:00+00:00,2025-08-07 10:53:00+00:00,2025-08-07 15:44:00+00:00,166,B737-800,MAINLINE,75,101,51
7269,UA,2144,2025-08-12,ORD,YYZ,2025-08-12 13:01:00+00:00,2025-08-12 18:33:00+00:00,2025-08-12 13:23:00+00:00,2025-08-12 18:33:00+00:00,126,A319-100,MAINLINE,76,108,52
7613,UA,2144,2025-08-12,ORD,YYZ,2025-08-12 13:01:00+00:00,2025-08-12 18:33:00+00:00,2025-08-12 13:23:00+00:00,2025-08-12 18:33:00+00:00,126,A319-100,MAINLINE,20,1,52


In [874]:
'''it had duplicate rows with different value of actual and scheduled ground time , so to tackle this i have 
compared the relative difference between these 2 column values for the duplicate rows and have used the 
row with a lower percentage, this isnt very reliable but i think this is the best way out'''
import numpy as np

subset_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'scheduled_arrival_datetime_local',
]

# Ensure numeric types for the two ground-time columns 
gt_cols = ['scheduled_ground_time_minutes', 'actual_ground_time_minutes']
for c in gt_cols:
    flights[c] = pd.to_numeric(flights[c], errors='coerce')

# Build deviation metric (absolute % difference)
flights['ground_time_dev_pct'] = np.where(
    flights['scheduled_ground_time_minutes'] > 0,
    np.abs(flights['actual_ground_time_minutes'] - flights['scheduled_ground_time_minutes']) 
    / flights['scheduled_ground_time_minutes'],
    np.inf
)



#  Deduplicate: keep the row with the lowest deviation per subset 
before = len(flights)
flights_deduped = (
    flights
    .sort_values(subset_cols + ['ground_time_dev_pct'], ascending=True)
    .drop_duplicates(subset=subset_cols, keep='first')
    .reset_index(drop=True)
)
after = len(flights_deduped)

print(f"Deduped by {subset_cols}")
print(f"Rows before: {before:,} | after: {after:,} | removed: {before - after:,}")

flights = flights_deduped


Deduped by ['company_id', 'flight_number', 'scheduled_departure_date_local', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'scheduled_arrival_datetime_local']
Rows before: 8,075 | after: 8,065 | removed: 10


In [875]:
subset_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'scheduled_arrival_datetime_local'
]

# 1. Count duplicates based on these columns
subset_dup_count = flights.duplicated(subset=subset_cols, keep=False).sum()
print(f"Number of duplicate rows based on {subset_cols}: {subset_dup_count}")

# 2. View them
subset_duplicates = flights[flights.duplicated(subset=subset_cols, keep=False)]
subset_duplicates = subset_duplicates.sort_values(subset_cols)
subset_duplicates.head(32)      #confirmed with no duplicates

Number of duplicate rows based on ['company_id', 'flight_number', 'scheduled_departure_date_local', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'scheduled_arrival_datetime_local']: 0


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,scheduled_departure_datetime_local,scheduled_arrival_datetime_local,actual_departure_datetime_local,actual_arrival_datetime_local,total_seats,fleet_type,carrier,scheduled_ground_time_minutes,actual_ground_time_minutes,minimum_turn_minutes,ground_time_dev_pct


In [876]:
flights.shape[0]

8065

In [877]:
# Load airports dataset
airports = pd.read_csv('../Data/Airports Data.csv')

# Clean column names just in case
airports.columns = airports.columns.str.strip()

# Make a set of valid IATA codes
valid_iata_codes = set(airports['airport_iata_code'].dropna().str.strip().str.upper())
print(f"Total valid IATA codes: {len(valid_iata_codes)}")

Total valid IATA codes: 5613


In [878]:
invalid_departure_mask = ~flights['scheduled_departure_station_code'].isin(valid_iata_codes)
invalid_departure_count = invalid_departure_mask.sum()

print(f"Invalid departure codes count: {invalid_departure_count}")
if invalid_departure_count > 0:
    print(flights.loc[invalid_departure_mask, 'scheduled_departure_station_code'].value_counts().head(10))


Invalid departure codes count: 0


In [879]:
invalid_arrival_mask = ~flights['scheduled_arrival_station_code'].isin(valid_iata_codes)
invalid_arrival_count = invalid_arrival_mask.sum()

print(f"Invalid arrival codes count: {invalid_arrival_count}")
if invalid_arrival_count > 0:
    print(flights.loc[invalid_arrival_mask, 'scheduled_arrival_station_code'].value_counts().head(10))

Invalid arrival codes count: 0


In [886]:
flights.shape[0]

8065

In [880]:
import duckdb

# connecting to the database
con = duckdb.connect('../sql_databases/skyhack.duckdb')

con.register('flights_df', flights)

# creating the table
con.execute("""
    CREATE OR REPLACE TABLE flights_cleaned AS
    SELECT * FROM flights_df
""")
con.close()

In [None]:
import duckdb

con = duckdb.connect('../sql_databases/skyhack.duckdb')


In [882]:
con.execute("SHOW TABLES").df()

Unnamed: 0,name
0,PNRFlight
1,airports_cleaned
2,bagsData_cleaned
3,flights_cleaned
4,pnr_remarks_cleaned


In [883]:
sql ='''WITH p AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code)) AS arr_iata
  FROM PNRFlight
),
f AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata,
    UPPER(TRIM(scheduled_arrival_station_code)) AS arr_iata
  FROM flights_cleaned
)
SELECT
  CASE WHEN f.company_id IS NULL THEN 'left_only' ELSE 'both' END AS match_status,
  COUNT(*) AS row_count
FROM p
LEFT JOIN f
  ON  p.company_id = f.company_id
  AND p.flight_number = f.flight_number
  AND p.dep_date = f.dep_date
  AND p.dep_iata = f.dep_iata         
  AND p.arr_iata = f.arr_iata
GROUP BY 1
ORDER BY 1'''
con.execute(sql).df()     #very small error, so ignoring

# these are done to match if the tables match with the desired entries and there's no mismatch

Unnamed: 0,match_status,row_count
0,both,644551
1,left_only,97


In [884]:
sql= '''WITH r AS (
  SELECT
  TRIM(CAST(flight_number AS VARCHAR))            AS flight_number,
  FROM pnr_remarks_cleaned
),
f AS (
  SELECT
    TRIM(CAST(flight_number AS VARCHAR))            AS flight_number,
  FROM flights_cleaned
)
SELECT
  CASE WHEN r.flight_number IS NULL THEN 'left_only' ELSE 'both' END AS match_status,
  COUNT(*) AS row_count
FROM f
LEFT JOIN r
  ON r.flight_number = f.flight_number
GROUP BY 1
ORDER BY 1'''

con.execute(sql).df()    #no error


Unnamed: 0,match_status,row_count
0,both,393358


In [885]:
con.close()