In [342]:
import pandas as pd
bags = pd.read_csv('../Data/Bag+Level+Data.csv')
print("Shape of Bag dataset:", bags.shape)
bags.head()


Shape of Bag dataset: (687245, 8)


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,bag_tag_unique_number,bag_tag_issue_date,bag_type
0,UA,1068,2025-08-01,ORD,IAD,BAGTAG_418666,2025-07-27,Transfer
1,UA,622,2025-08-01,ORD,DEN,BAGTAG_418667,2025-07-28,Transfer
2,YX,3718,2025-08-01,ORD,MSN,BAGTAG_84760,2025-07-28,Transfer
3,BA,294,2025-08-01,ORD,LHR,BAGTAG_418669,2025-07-28,Transfer
4,UA,2627,2025-08-01,ORD,MSP,BAGTAG_335257,2025-07-28,Transfer


In [343]:
bags.info()
bags.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687245 entries, 0 to 687244
Data columns (total 8 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   company_id                        687245 non-null  object
 1   flight_number                     687245 non-null  int64 
 2   scheduled_departure_date_local    687245 non-null  object
 3   scheduled_departure_station_code  687245 non-null  object
 4   scheduled_arrival_station_code    687245 non-null  object
 5   bag_tag_unique_number             687245 non-null  object
 6   bag_tag_issue_date                687245 non-null  object
 7   bag_type                          687245 non-null  object
dtypes: int64(1), object(7)
memory usage: 41.9+ MB


Index(['company_id', 'flight_number', 'scheduled_departure_date_local',
       'scheduled_departure_station_code', 'scheduled_arrival_station_code',
       'bag_tag_unique_number', 'bag_tag_issue_date', 'bag_type'],
      dtype='object')

In [344]:
# Convert flight_number to string (to avoid leading zero issues)
bags['flight_number'] = bags['flight_number'].astype(str).str.strip()

# Strip & uppercase where appropriate
str_cols_upper = ['company_id', 'scheduled_departure_station_code', 'scheduled_arrival_station_code', 'bag_type','bag_tag_unique_number']

for col in str_cols_upper:
    bags[col] = bags[col].astype(str).str.strip().str.upper()


# Parse dates safely
bags['scheduled_departure_date_local'] = pd.to_datetime(
    bags['scheduled_departure_date_local'], errors='coerce'
).dt.date

bags['bag_tag_issue_date'] = pd.to_datetime(
    bags['bag_tag_issue_date'], errors='coerce'
)

# checking for missing values
missing_counts = {}
for col in bags.columns:
    if pd.api.types.is_string_dtype(bags[col]) or col in str_cols_upper:
        missing_counts[col] = (bags[col].isna() | (bags[col].str.len() == 0)).sum()
    else:
        missing_counts[col] = bags[col].isna().sum()

print("\nMissing values per column:")
for k, v in missing_counts.items():
    print(f"{k}: {v}")



Missing values per column:
company_id: 0
flight_number: 0
scheduled_departure_date_local: 0
scheduled_departure_station_code: 0
scheduled_arrival_station_code: 0
bag_tag_unique_number: 0
bag_tag_issue_date: 0
bag_type: 0


In [345]:
exact_dups = bags.duplicated().sum()
print(f"\nExact duplicate rows: {exact_dups}")

# 2. Duplicate bag tags
if 'bag_tag_unique_number' in bags.columns:
    tag_dups = bags.duplicated(subset=['bag_tag_unique_number']).sum()
    print(f"Duplicate bag_tag_unique_number entries: {tag_dups}")         #bags that were re routed, gone to wrong airport, they have same bag tags


Exact duplicate rows: 293
Duplicate bag_tag_unique_number entries: 20090


In [346]:
bags = bags.drop_duplicates().copy()

In [347]:
duplicate_tag_ids = bags['bag_tag_unique_number'][bags['bag_tag_unique_number'].duplicated()].unique()

print(f"Number of duplicate bag IDs: {len(duplicate_tag_ids)}")
duplicate_tag_rows = bags[bags['bag_tag_unique_number'].isin(duplicate_tag_ids)]   #number of duplicate bag ids

duplicate_tag_rows.head(20)

Number of duplicate bag IDs: 16896


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,bag_tag_unique_number,bag_tag_issue_date,bag_type
10,OO,5350,2025-08-02,ORD,GRR,BAGTAG_251915,2025-07-28,TRANSFER
11,OO,5470,2025-08-01,ORD,GEG,BAGTAG_251915,2025-07-28,TRANSFER
29,UA,468,2025-08-03,ORD,IAH,BAGTAG_251920,2025-07-29,TRANSFER
30,UA,1658,2025-08-03,ORD,EWR,BAGTAG_251920,2025-07-29,TRANSFER
33,OO,5868,2025-08-04,ORD,YWG,BAGTAG_335262,2025-07-29,TRANSFER
34,OO,5868,2025-08-01,ORD,YWG,BAGTAG_335262,2025-07-29,TRANSFER
51,UA,3,2025-08-04,ORD,ZRH,BAGTAG_585167,2025-07-30,TRANSFER
52,UA,3,2025-08-02,ORD,ZRH,BAGTAG_585167,2025-07-30,TRANSFER
56,UA,533,2025-08-01,ORD,OMA,BAGTAG_501993,2025-07-30,TRANSFER
57,UA,1778,2025-08-02,ORD,OMA,BAGTAG_501993,2025-07-30,TRANSFER


In [348]:
bag_company_counts = bags['company_id'].value_counts()
bag_company_counts

company_id
UA    499282
OO     97434
YX     32203
G7     29454
LH      7156
NH      4070
ET      2557
LX      2487
AC      2170
AA      2064
OS      1731
EK      1474
TK      1169
XX      1016
AV       356
CM       343
EI       261
AI       260
LF       206
AS       206
BR       188
KG       187
LO       128
FI       107
DL        88
KL        48
KE        46
TP        42
9X        34
SK        32
BA        29
CX        20
AF        16
AY        15
IB        12
SN        10
JL         8
AZ         5
AD         5
WS         5
AM         4
WN         4
EY         3
A3         3
OZ         2
HA         2
VN         2
NZ         2
LA         1
NK         1
4Y         1
XQ         1
CA         1
SQ         1
Name: count, dtype: int64

In [349]:
valid_company_ids = {'UA', 'G7', 'OO', 'YX'}
non_ua_bags = bags[~bags['company_id'].isin(valid_company_ids)].copy()
print("Total non-UA bag rows:", non_ua_bags.shape[0])
non_ua_bags['bag_type'].value_counts(dropna=False)


Total non-UA bag rows: 28579


bag_type
TRANSFER        27534
HOT TRANSFER      847
ORIGIN            198
Name: count, dtype: int64

In [350]:
# Keep all UA/Express
# Keep transfer & hot transfer for all
# Dropping non-UA origin
bags_filtered = bags[
    (bags['company_id'].isin(valid_company_ids)) |
    (~bags['company_id'].isin(valid_company_ids) & bags['bag_type'].isin(['TRANSFER', 'HOT TRANSFER']))
].copy()

print("Removed:", len(bags) - len(bags_filtered))

Removed: 198


In [351]:
non_ua_bags = bags_filtered[~bags_filtered['company_id'].isin(valid_company_ids)].copy()
print("Total non-UA bag rows:", non_ua_bags.shape[0])
non_ua_bags['bag_type'].value_counts(dropna=False)

Total non-UA bag rows: 28381


bag_type
TRANSFER        27534
HOT TRANSFER      847
Name: count, dtype: int64

In [352]:
import re

# Boolean mask for valid bag tags
valid_bagtag_mask = bags_filtered['bag_tag_unique_number'].str.match(r'^BAGTAG_\d+$')

# How many are invalid
invalid_count = (~valid_bagtag_mask).sum()
print(f"Invalid bag_tag_unique_number count: {invalid_count}")     #checking if the format is correct


Invalid bag_tag_unique_number count: 0


In [353]:

# Load airports dataset
airports = pd.read_csv('../Data/Airports Data.csv')

# Clean column names just in case
airports.columns = airports.columns.str.strip()

# Make a set of valid IATA codes
valid_iata_codes = set(airports['airport_iata_code'].dropna().str.strip().str.upper())
print(f"Total valid IATA codes: {len(valid_iata_codes)}")


Total valid IATA codes: 5613


In [354]:
invalid_departure_mask = ~bags_filtered['scheduled_departure_station_code'].isin(valid_iata_codes)
invalid_departure_count = invalid_departure_mask.sum()

print(f"Invalid departure codes count: {invalid_departure_count}")
if invalid_departure_count > 0:
    print(bags_filtered.loc[invalid_departure_mask, 'scheduled_departure_station_code'].value_counts().head(10))


Invalid departure codes count: 0


In [355]:
invalid_arrival_mask = ~bags_filtered['scheduled_arrival_station_code'].isin(valid_iata_codes)
invalid_arrival_count = invalid_arrival_mask.sum()

print(f"Invalid arrival codes count: {invalid_arrival_count}")
if invalid_arrival_count > 0:
    print(bags_filtered.loc[invalid_arrival_mask, 'scheduled_arrival_station_code'].value_counts().head(10))

if 'AUL' in bags_filtered['scheduled_arrival_station_code'].unique():
    print("AUL is in the dataset")
else:
    print("AUL is not in the dataset")

#AUL is a valid iata airport code, with ISO = MH, so adding it to the Airports Data.csv file at the end line



Invalid arrival codes count: 0
AUL is in the dataset


In [356]:
origin_mask = bags_filtered['bag_type'] == 'ORIGIN'
origin_invalid = bags_filtered[origin_mask & (bags_filtered['scheduled_departure_date_local'] < bags_filtered['bag_tag_issue_date'].dt.date)]
print(f"Number of ORIGIN rows with departure date < bag issue date: {len(origin_invalid)}")
origin_invalid.head(10)      #maybe flight delayed from the scheduled departure time


Number of ORIGIN rows with departure date < bag issue date: 14462


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,bag_tag_unique_number,bag_tag_issue_date,bag_type
25618,UA,602,2025-08-01,ORD,PHX,BAGTAG_424971,2025-08-02,ORIGIN
25619,UA,2360,2025-08-01,ORD,SNA,BAGTAG_508252,2025-08-02,ORIGIN
25620,YX,3493,2025-08-01,ORD,DTW,BAGTAG_591373,2025-08-02,ORIGIN
25621,YX,3604,2025-08-01,ORD,CMH,BAGTAG_258171,2025-08-02,ORIGIN
25622,UA,845,2025-08-01,ORD,GRU,BAGTAG_258173,2025-08-02,ORIGIN
25623,UA,845,2025-08-01,ORD,GRU,BAGTAG_91098,2025-08-02,ORIGIN
25625,UA,1756,2025-08-01,ORD,YYZ,BAGTAG_591375,2025-08-02,ORIGIN
25626,UA,2200,2025-08-01,ORD,SEA,BAGTAG_258174,2025-08-02,ORIGIN
25627,UA,938,2025-08-01,ORD,LHR,BAGTAG_91100,2025-08-02,ORIGIN
25628,UA,2106,2025-08-01,ORD,SFO,BAGTAG_341339,2025-08-02,ORIGIN


In [357]:
nonOrigin_mask = bags_filtered['bag_type'].isin(['TRANSFER', 'HOT TRANSFER'])
nonOrigin_invalid = bags_filtered[nonOrigin_mask & (bags_filtered['scheduled_departure_date_local'] < bags_filtered['bag_tag_issue_date'].dt.date)]
print(f"Number of NON ORIGIN rows with departure date < bag issue date: {len(nonOrigin_invalid)}")
nonOrigin_invalid.head(10)  #probably due to time zone differences as for connecting flights bag tag isnt changed

Number of NON ORIGIN rows with departure date < bag issue date: 185


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,bag_tag_unique_number,bag_tag_issue_date,bag_type
27121,XX,9999,2025-08-01,ORD,TQO,BAGTAG_258572,2025-08-02,TRANSFER
27433,YX,3614,2025-08-01,ORD,MCI,BAGTAG_341829,2025-08-02,TRANSFER
27438,YX,3614,2025-08-01,ORD,MCI,BAGTAG_425393,2025-08-02,TRANSFER
51509,UA,767,2025-08-02,ORD,DEN,BAGTAG_597696,2025-08-03,TRANSFER
73813,UA,1209,2025-08-03,ORD,EWR,BAGTAG_186475,2025-08-04,TRANSFER
96204,YX,3493,2025-08-04,ORD,DTW,BAGTAG_525825,2025-08-05,TRANSFER
96305,UA,2628,2025-08-04,ORD,BNA,BAGTAG_108560,2025-08-05,TRANSFER
96306,UA,2628,2025-08-04,ORD,BNA,BAGTAG_442211,2025-08-05,TRANSFER
96307,UA,2628,2025-08-04,ORD,BNA,BAGTAG_191989,2025-08-05,TRANSFER
96308,UA,2628,2025-08-04,ORD,BNA,BAGTAG_191990,2025-08-05,TRANSFER


In [358]:
bag_issue_dt = pd.to_datetime(bags_filtered['bag_tag_issue_date'], errors='coerce')
departure_dt = pd.to_datetime(bags_filtered['scheduled_departure_date_local'], errors='coerce')

bag_issue_norm = bag_issue_dt.dt.normalize()
departure_norm = departure_dt.dt.normalize()

origin_diff_days = (bag_issue_norm - departure_norm).dt.days
origin_mask = bags_filtered['bag_type'] == 'ORIGIN'
invalid_large_gap_mask = origin_mask & (origin_diff_days > 1)
invalid_large_gap = bags_filtered[invalid_large_gap_mask].copy()
invalid_large_gap['diff_days'] = origin_diff_days.loc[invalid_large_gap_mask]  # index-safe
# optional: invalid_large_gap['diff_days'] = invalid_large_gap['diff_days'].astype('Int64')

print(f"Number of ORIGIN bags with issue date more than 1 day after departure: {len(invalid_large_gap)}")
invalid_large_gap.head()
         #very small amount, most probably if the bag is misplaced by the airline and bag tag gets damaged so we have to re issue the bag tag


Number of ORIGIN bags with issue date more than 1 day after departure: 13


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,bag_tag_unique_number,bag_tag_issue_date,bag_type,diff_days
233263,UA,2378,2025-08-06,ORD,LAX,BAGTAG_641740,2025-08-11,ORIGIN,5
249085,UA,219,2025-08-10,ORD,HNL,BAGTAG_145584,2025-08-12,ORIGIN,2
265472,UA,468,2025-08-08,ORD,IAH,BAGTAG_566375,2025-08-12,ORIGIN,4
281213,OO,5021,2025-08-04,ORD,OMA,BAGTAG_153090,2025-08-13,ORIGIN,9
285769,YX,3409,2025-08-08,ORD,YOW,BAGTAG_237585,2025-08-13,ORIGIN,5


In [359]:
import duckdb
from pathlib import Path

db_path = Path('../sql_databases/skyhack.duckdb')   # creates file if not exists
con = duckdb.connect(str(db_path))

# Register the DataFrame in this session
con.register('bagsData_df', bags_filtered)

# Create/replace table in the DB file
con.execute("""
    CREATE OR REPLACE TABLE bagsData_cleaned AS
    SELECT * FROM bagsData_df
""")

con.close()


In [360]:
con = duckdb.connect('../sql_databases/skyhack.duckdb')
con.execute("SHOW TABLES").df()

Unnamed: 0,name
0,PNRFlight
1,airports_cleaned
2,bagsData_cleaned
3,flights_cleaned
4,pnr_remarks_cleaned


In [361]:
sql ='''WITH b AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM bagsData_cleaned
),
f AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM flights_cleaned
)
SELECT
  CASE WHEN f.flight_number IS NULL THEN 'left_only' ELSE 'both' END AS match_status,
  COUNT(*) AS row_count
FROM b
LEFT JOIN f
  ON b.dep_iata = f.dep_iata  
  AND b.flight_number = f.flight_number        
  AND b.dep_date = f.dep_date
GROUP BY 1
ORDER BY 1'''
con.execute(sql).df()  #looks like the bag+level dataset has flight numbers of connecting flights that is of the company id                          

Unnamed: 0,match_status,row_count
0,both,634522
1,left_only,52924


In [362]:
sql ='''WITH b AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM bagsData_cleaned
),
f AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM flights_cleaned
)
SELECT
  CASE WHEN f.flight_number IS NULL THEN 'left_only' ELSE 'both' END AS match_status,
  COUNT(*) AS row_count
FROM b
LEFT JOIN f
  ON b.dep_iata = f.dep_iata                            
  AND b.dep_date = f.dep_date
GROUP BY 1
ORDER BY 1'''
con.execute(sql).df()

Unnamed: 0,match_status,row_count
0,both,369916894


In [363]:
sql ='''WITH b AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM bagsData_cleaned
),
f AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM flights_cleaned
)
SELECT
  CASE WHEN f.flight_number IS NULL THEN 'left_only' ELSE 'both' END AS match_status,
  COUNT(*) AS row_count
FROM f
LEFT JOIN b
  ON b.dep_iata = f.dep_iata  
  AND b.flight_number = f.flight_number                            
  AND b.dep_date = f.dep_date
GROUP BY 1
ORDER BY 1'''
con.execute(sql).df() #  so all UA,G7,OO,YX present in bags+data 
# these are done to match if the tables match with the desired entries and there's no mismatch

Unnamed: 0,match_status,row_count
0,both,634522


In [364]:
sql='''WITH b AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM bagsData_cleaned
),
f AS (
  SELECT
    UPPER(TRIM(company_id)) AS company_id,
    TRIM(CAST(flight_number AS VARCHAR)) AS flight_number,
    DATE(scheduled_departure_date_local) AS dep_date,
    UPPER(TRIM(scheduled_departure_station_code)) AS dep_iata
  FROM flights_cleaned
)
SELECT 
  b.company_id,
  b.flight_number,
  b.dep_date,
  b.dep_iata,
  COUNT(*) AS unmatched_rows
FROM b
LEFT JOIN f
  ON b.dep_date     = f.dep_date
  AND b.dep_iata     = f.dep_iata
WHERE f.flight_number IS NULL
GROUP BY 1,2,3,4
ORDER BY unmatched_rows DESC
LIMIT 50;
''' 
con.execute(sql).df()

Unnamed: 0,company_id,flight_number,dep_date,dep_iata,unmatched_rows


In [365]:
con.close()