In [69]:
import pandas as pd
# Load the PNR flight-level extract into a DataFrame (path is relative to the notebook's working directory).
pnr_flights = pd.read_csv("../Data/PNR+Flight+Level+Data.csv")

In [70]:
# Print dtypes and non-null counts, then show the column index as the cell result.
pnr_flights.info()
pnr_flights.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 687878 entries, 0 to 687877
Data columns (total 12 columns):
 #   Column                            Non-Null Count   Dtype 
---  ------                            --------------   ----- 
 0   company_id                        687878 non-null  object
 1   flight_number                     687878 non-null  int64 
 2   scheduled_departure_date_local    687878 non-null  object
 3   scheduled_departure_station_code  687878 non-null  object
 4   scheduled_arrival_station_code    687878 non-null  object
 5   record_locator                    687878 non-null  object
 6   pnr_creation_date                 687878 non-null  object
 7   total_pax                         687878 non-null  int64 
 8   is_child                          687878 non-null  object
 9   basic_economy_ind                 687878 non-null  int64 
 10  is_stroller_user                  687878 non-null  object
 11  lap_child_count                   687878 non-null  int64 
dtypes:

Index(['company_id', 'flight_number', 'scheduled_departure_date_local',
       'scheduled_departure_station_code', 'scheduled_arrival_station_code',
       'record_locator', 'pnr_creation_date', 'total_pax', 'is_child',
       'basic_economy_ind', 'is_stroller_user', 'lap_child_count'],
      dtype='object')

In [71]:
# Column groups that drive the cleaning steps below.

# Text columns that are trimmed AND upper-cased.
str_cols_upper = [
    'company_id',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code'
]

# Text columns that are trimmed only (case left untouched).
str_cols_strip = [
    'flight_number',
    'record_locator',
    'is_child',
    'is_stroller_user'
]

# Columns parsed into calendar dates.
date_cols = [
    'scheduled_departure_date_local',
    'pnr_creation_date'
]

# Numeric columns (declared for reference; not used by the steps in this block).
numeric_cols = [
    'total_pax',
    'lap_child_count',
    'basic_economy_ind'
]

# Normalise every text column, including flight_number, which read_csv parsed as
# int64 and which is kept as text from here on.
# NOTE(review): any leading zeros in flight_number were already dropped when
# read_csv parsed the column as an integer; converting to str here cannot restore them.
for col in str_cols_upper + str_cols_strip:
    if col not in pnr_flights.columns:
        continue
    raw = pnr_flights[col]
    text = raw.astype(str).str.strip()
    if col in str_cols_upper:
        text = text.str.upper()
    # astype(str) turns NaN/None into the literal text "nan"/"None"; put the real
    # nulls back so later isna()-based missing-value checks can still see them.
    pnr_flights[col] = text.where(raw.notna())

# Dates are handled separately: unparseable entries are coerced to NaT, and the
# time component is dropped by keeping only the date part.
for col in date_cols:
    if col in pnr_flights.columns:
        pnr_flights[col] = pd.to_datetime(pnr_flights[col], errors='coerce').dt.date

# Expect values like: Y/N, Yes/No, True/False, T/F, 1/0 → 1/0; blanks → 0; anything else → <NA>
def to_bool01(series):
    """Map a column of yes/no style flags to a nullable 0/1 integer Series.

    Values are compared after casting to str, trimming whitespace and
    upper-casing.

    Parameters
    ----------
    series : pandas.Series
        Raw flag values (for example 'Y', 'no', True, 1, '').

    Returns
    -------
    pandas.Series
        Same index as ``series`` with dtype ``Int64``: 1 for
        Y/YES/TRUE/T/1, 0 for N/NO/FALSE/F/0 and the empty string, and
        ``<NA>`` for every value that matches neither list.
    """
    true_vals = {'Y', 'YES', 'TRUE', 'T', '1'}
    false_vals = {'N', 'NO', 'FALSE', 'F', '0', ''}
    # A dict lookup lets Series.map produce NaN for unrecognised values by itself,
    # so the function does not depend on numpy being imported in the kernel.
    lookup = {**dict.fromkeys(true_vals, 1), **dict.fromkeys(false_vals, 0)}

    s = series.astype(str).str.strip().str.upper()
    mapped = s.map(lookup)
    # Go through float first so NaN converts cleanly to <NA> in the nullable Int64 dtype.
    return mapped.astype('float').astype('Int64')

# Build nullable 0/1 indicator columns from the raw flag columns that are present.
for source_col, indicator_col in (
    ('is_child', 'is_child_ind'),
    ('is_stroller_user', 'is_stroller_user_ind'),
):
    if source_col in pnr_flights.columns:
        pnr_flights[indicator_col] = to_bool01(pnr_flights[source_col])

# Coerce basic_economy_ind to a nullable integer; entries that are not numeric become <NA>.
if 'basic_economy_ind' in pnr_flights.columns:
    economy_numeric = pd.to_numeric(pnr_flights['basic_economy_ind'], errors='coerce')
    pnr_flights['basic_economy_ind'] = economy_numeric.astype('float').astype('Int64')

# Count missing values per column. For the text columns an empty string is
# counted as missing in addition to real nulls.
text_cols = set(str_cols_upper) | set(str_cols_strip)
missing_counts = {}
for col in pnr_flights.columns:
    is_missing = pnr_flights[col].isna()
    if col in text_cols:
        is_missing = is_missing | (pnr_flights[col].str.len() == 0)
    missing_counts[col] = is_missing.sum()

print("\nMissing values per column:")
for column_name, n_missing in missing_counts.items():
    print(f"{column_name}: {n_missing}")

# Rows that are identical across every column.
full_dupes = pnr_flights.duplicated().sum()
print(f"\nFull duplicate rows: {full_dupes}")



Missing values per column:
company_id: 0
flight_number: 0
scheduled_departure_date_local: 0
scheduled_departure_station_code: 0
scheduled_arrival_station_code: 0
record_locator: 0
pnr_creation_date: 0
total_pax: 0
is_child: 0
basic_economy_ind: 0
is_stroller_user: 0
lap_child_count: 0
is_child_ind: 0
is_stroller_user_ind: 0

Full duplicate rows: 0


In [72]:
# Compare the number of rows with the number of distinct record locators to see
# how many rows each PNR contributes on average.
total_rows = pnr_flights.shape[0]
unique_record_locators = pnr_flights['record_locator'].nunique()

# Guard the ratio so an empty frame reports nan instead of raising ZeroDivisionError.
rows_per_locator = (
    total_rows / unique_record_locators if unique_record_locators else float('nan')
)

print(f"Total rows: {total_rows}")
print(f"Unique record locators: {unique_record_locators}")
# The label previously said "SSR entries", but the quantity is rows of this PNR flight table.
print(f"Average rows per record_locator: {rows_per_locator:.2f}")

Total rows: 687878
Unique record locators: 609758
Average SSR entries per record_locator: 1.13


In [73]:
# Candidate key: every source column except "is_child", "is_stroller_user", "lap_child_count".
key_cols = [
    'record_locator',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_arrival_station_code',
    'scheduled_departure_station_code',
    'company_id',
    'total_pax',
    'pnr_creation_date',
    'basic_economy_ind',
]

# keep=False flags every member of a repeated group, not only the later copies.
duplicates_mask = pnr_flights.duplicated(subset=key_cols, keep=False)
duplicates = pnr_flights.loc[duplicates_mask].sort_values(by=key_cols)

print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(20)

Number of duplicate rows (based on ['record_locator', 'flight_number', 'scheduled_departure_date_local', 'scheduled_arrival_station_code', 'scheduled_departure_station_code', 'company_id', 'total_pax', 'pnr_creation_date', 'basic_economy_ind']): 80227


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,total_pax,is_child,basic_economy_ind,is_stroller_user,lap_child_count,is_child_ind,is_stroller_user_ind
683627,UA,2481,2025-08-02,ORD,SYR,PNR_100021,2025-02-25,3,Y,1,N,0,1,0
683628,UA,2481,2025-08-02,ORD,SYR,PNR_100021,2025-02-25,3,N,1,N,0,0,0
515970,OO,5320,2025-08-13,ORD,MKE,PNR_100051,2025-07-29,2,N,0,N,0,0,0
515971,OO,5320,2025-08-13,ORD,MKE,PNR_100051,2025-07-29,2,Y,0,N,0,1,0
347166,UA,587,2025-08-02,ORD,YUL,PNR_100068,2025-07-06,2,Y,0,N,0,1,0
347167,UA,587,2025-08-02,ORD,YUL,PNR_100068,2025-07-06,2,N,0,N,0,0,0
347320,UA,909,2025-08-05,ORD,AMS,PNR_100075,2025-04-14,4,N,0,N,0,0,0
347321,UA,909,2025-08-05,ORD,AMS,PNR_100075,2025-04-14,4,Y,0,N,0,1,0
347322,UA,909,2025-08-05,ORD,AMS,PNR_100075,2025-04-14,4,Y,0,Y,0,1,1
347348,OO,4778,2025-08-01,ORD,FAR,PNR_100076,2025-07-29,2,Y,0,N,0,1,0


In [74]:
# Candidate key: every source column except "is_stroller_user" and "lap_child_count".
key_cols = [
    'record_locator',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_arrival_station_code',
    'scheduled_departure_station_code',
    'company_id',
    'total_pax',
    'pnr_creation_date',
    'basic_economy_ind',
    'is_child',
]

# keep=False flags every member of a repeated group, not only the later copies.
duplicates_mask = pnr_flights.duplicated(subset=key_cols, keep=False)
duplicates = pnr_flights.loc[duplicates_mask].sort_values(by=key_cols)

print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(20)

Number of duplicate rows (based on ['record_locator', 'flight_number', 'scheduled_departure_date_local', 'scheduled_arrival_station_code', 'scheduled_departure_station_code', 'company_id', 'total_pax', 'pnr_creation_date', 'basic_economy_ind', 'is_child']): 18080


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,total_pax,is_child,basic_economy_ind,is_stroller_user,lap_child_count,is_child_ind,is_stroller_user_ind
347321,UA,909,2025-08-05,ORD,AMS,PNR_100075,2025-04-14,4,Y,0,N,0,1,0
347322,UA,909,2025-08-05,ORD,AMS,PNR_100075,2025-04-14,4,Y,0,Y,0,1,1
348748,UA,2672,2025-08-11,ORD,PNS,PNR_100128,2025-07-17,5,Y,1,N,0,1,0
348750,UA,2672,2025-08-11,ORD,PNS,PNR_100128,2025-07-17,5,Y,1,Y,0,1,1
350462,UA,1576,2025-08-11,ORD,LGA,PNR_100202,2025-03-28,6,Y,0,N,0,1,0
350463,UA,1576,2025-08-11,ORD,LGA,PNR_100202,2025-03-28,6,Y,0,Y,0,1,1
351814,UA,556,2025-08-08,ORD,CLT,PNR_100251,2025-08-04,2,N,0,N,1,0,0
351815,UA,556,2025-08-08,ORD,CLT,PNR_100251,2025-08-04,2,N,0,N,0,0,0
591980,UA,1306,2025-08-09,ORD,TPA,PNR_10041,2025-06-26,4,Y,1,N,0,1,0
591981,UA,1306,2025-08-09,ORD,TPA,PNR_10041,2025-06-26,4,Y,1,Y,0,1,1


In [75]:
# Candidate key: every source column except "lap_child_count".
key_cols = [
    'record_locator',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_arrival_station_code',
    'scheduled_departure_station_code',
    'company_id',
    'total_pax',
    'pnr_creation_date',
    'basic_economy_ind',
    'is_stroller_user',
    'is_child',
]

# keep=False flags every member of a repeated group, not only the later copies.
duplicates_mask = pnr_flights.duplicated(subset=key_cols, keep=False)
duplicates = pnr_flights.loc[duplicates_mask].sort_values(by=key_cols)

print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(20)

Number of duplicate rows (based on ['record_locator', 'flight_number', 'scheduled_departure_date_local', 'scheduled_arrival_station_code', 'scheduled_departure_station_code', 'company_id', 'total_pax', 'pnr_creation_date', 'basic_economy_ind', 'is_stroller_user', 'is_child']): 7634


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,total_pax,is_child,basic_economy_ind,is_stroller_user,lap_child_count,is_child_ind,is_stroller_user_ind
351814,UA,556,2025-08-08,ORD,CLT,PNR_100251,2025-08-04,2,N,0,N,1,0,0
351815,UA,556,2025-08-08,ORD,CLT,PNR_100251,2025-08-04,2,N,0,N,0,0,0
191740,UA,2133,2025-08-07,ORD,EWR,PNR_100623,2025-08-04,3,N,0,N,0,0,0
191741,UA,2133,2025-08-07,ORD,EWR,PNR_100623,2025-08-04,3,N,0,N,1,0,0
193573,UA,275,2025-08-10,ORD,RAP,PNR_100685,2025-06-18,5,N,0,N,0,0,0
193575,UA,275,2025-08-10,ORD,RAP,PNR_100685,2025-06-18,5,N,0,N,1,0,0
208349,UA,1044,2025-08-13,ORD,LAX,PNR_101247,2025-08-04,2,N,0,N,0,0,0
208350,UA,1044,2025-08-13,ORD,LAX,PNR_101247,2025-08-04,2,N,0,N,1,0,0
32267,UA,660,2025-08-05,ORD,SAN,PNR_101301,2025-04-03,4,N,0,N,0,0,0
32268,UA,660,2025-08-05,ORD,SAN,PNR_101301,2025-04-03,4,N,0,N,1,0,0


In [76]:
# Candidate key: all twelve source columns.
key_cols = [
    'record_locator',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_arrival_station_code',
    'scheduled_departure_station_code',
    'company_id',
    'total_pax',
    'pnr_creation_date',
    'basic_economy_ind',
    'is_stroller_user',
    'is_child',
    'lap_child_count',
]

# keep=False flags every member of a repeated group, not only the later copies.
duplicates_mask = pnr_flights.duplicated(subset=key_cols, keep=False)
duplicates = pnr_flights.loc[duplicates_mask].sort_values(by=key_cols)

print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(20)

Number of duplicate rows (based on ['record_locator', 'flight_number', 'scheduled_departure_date_local', 'scheduled_arrival_station_code', 'scheduled_departure_station_code', 'company_id', 'total_pax', 'pnr_creation_date', 'basic_economy_ind', 'is_stroller_user', 'is_child', 'lap_child_count']): 0


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,total_pax,is_child,basic_economy_ind,is_stroller_user,lap_child_count,is_child_ind,is_stroller_user_ind


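Observation: a booking on a given flight appears in more than one row whenever its rows differ in "is_child", "is_stroller_user" or "lap_child_count"; once all three columns are added to the key, no duplicates remain.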

In [77]:
# Collapse the table to one row per PNR / flight: group by the identifying
# columns and keep the maximum of the child / stroller flags and of lap_child_count.
key_cols = [
    'company_id',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_departure_station_code',
    'scheduled_arrival_station_code',
    'record_locator',
    'pnr_creation_date',
    'basic_economy_ind'
]

# Y/N → 1/0 helper columns; anything other than Y or N is treated as 0.
yn_to_int = {'Y': 1, 'N': 0}
pnr_flights['is_child_flag'] = (
    pnr_flights['is_child'].astype(str).str.upper().map(yn_to_int).fillna(0).astype(int)
)
pnr_flights['is_stroller_flag'] = (
    pnr_flights['is_stroller_user'].astype(str).str.upper().map(yn_to_int).fillna(0).astype(int)
)

pnr_collapsed = (
    pnr_flights
    .groupby(key_cols, as_index=False)
    .agg({
        # The duplicate checks earlier in the notebook show that the repeated rows
        # of a PNR carry the same total_pax, so summing would multiply the
        # passenger count by the number of repeated rows; take the max instead.
        'total_pax': 'max',
        'is_child_flag': 'max',
        'is_stroller_flag': 'max',
        'lap_child_count': 'max'
    })
)


In [78]:
# Show the columns of the collapsed frame (group keys followed by the aggregated columns).
pnr_collapsed.columns

Index(['company_id', 'flight_number', 'scheduled_departure_date_local',
       'scheduled_departure_station_code', 'scheduled_arrival_station_code',
       'record_locator', 'pnr_creation_date', 'basic_economy_ind', 'total_pax',
       'is_child_flag', 'is_stroller_flag', 'lap_child_count'],
      dtype='object')

In [79]:
# Re-run the duplicate probe on the collapsed frame to confirm the repeated rows are gone.
key_cols = [
    'record_locator',
    'flight_number',
    'scheduled_departure_date_local',
    'scheduled_arrival_station_code',
    'scheduled_departure_station_code',
    'company_id',
    'total_pax',
    'pnr_creation_date',
    'basic_economy_ind',
]

# keep=False flags every member of a repeated group, not only the later copies.
duplicates_mask = pnr_collapsed.duplicated(subset=key_cols, keep=False)
duplicates = pnr_collapsed.loc[duplicates_mask].sort_values(by=key_cols)

print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(20)

Number of duplicate rows (based on ['record_locator', 'flight_number', 'scheduled_departure_date_local', 'scheduled_arrival_station_code', 'scheduled_departure_station_code', 'company_id', 'total_pax', 'pnr_creation_date', 'basic_economy_ind']): 0


Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,basic_economy_ind,total_pax,is_child_flag,is_stroller_flag,lap_child_count


In [80]:
# Preview the first ten collapsed rows.
pnr_collapsed.head(10)

Unnamed: 0,company_id,flight_number,scheduled_departure_date_local,scheduled_departure_station_code,scheduled_arrival_station_code,record_locator,pnr_creation_date,basic_economy_ind,total_pax,is_child_flag,is_stroller_flag,lap_child_count
0,G7,4173,2025-08-01,ORD,AVL,PNR_101120,2025-07-24,0,2,0,0,0
1,G7,4173,2025-08-01,ORD,AVL,PNR_101566,2025-07-01,0,1,0,0,0
2,G7,4173,2025-08-01,ORD,AVL,PNR_111719,2025-07-11,0,1,0,0,0
3,G7,4173,2025-08-01,ORD,AVL,PNR_155770,2025-07-23,0,1,0,0,0
4,G7,4173,2025-08-01,ORD,AVL,PNR_188141,2025-04-23,0,1,0,0,0
5,G7,4173,2025-08-01,ORD,AVL,PNR_213016,2025-07-31,0,1,0,0,0
6,G7,4173,2025-08-01,ORD,AVL,PNR_218849,2025-07-02,0,1,0,0,0
7,G7,4173,2025-08-01,ORD,AVL,PNR_228233,2025-05-19,0,1,0,0,0
8,G7,4173,2025-08-01,ORD,AVL,PNR_275644,2025-07-04,0,1,0,0,0
9,G7,4173,2025-08-01,ORD,AVL,PNR_34739,2025-06-08,0,2,0,0,0


In [81]:
# Read the airports reference table and trim stray whitespace from its header names.
airports = pd.read_csv('../Data/Airports Data.csv')
airports.columns = airports.columns.str.strip()

# Collect the distinct, trimmed, upper-cased IATA codes (nulls excluded) for membership tests.
iata_codes = airports['airport_iata_code'].dropna()
iata_codes = iata_codes.str.strip().str.upper()
valid_iata_codes = set(iata_codes)

print(f"Total valid IATA codes: {len(valid_iata_codes)}")

Total valid IATA codes: 5613


In [82]:
# Flag departure station codes that are absent from the airports reference set.
departure_is_known = pnr_collapsed['scheduled_departure_station_code'].isin(valid_iata_codes)
invalid_departure_mask = ~departure_is_known
invalid_departure_count = invalid_departure_mask.sum()

print(f"Invalid departure codes count: {invalid_departure_count}")
if invalid_departure_count > 0:
    # List the ten most frequent unknown codes.
    unknown_departures = pnr_collapsed.loc[invalid_departure_mask, 'scheduled_departure_station_code']
    print(unknown_departures.value_counts().head(10))

Invalid departure codes count: 0


In [83]:
# Flag arrival station codes that are absent from the airports reference set.
arrival_is_known = pnr_collapsed['scheduled_arrival_station_code'].isin(valid_iata_codes)
invalid_arrival_mask = ~arrival_is_known
invalid_arrival_count = invalid_arrival_mask.sum()

print(f"Invalid arrival codes count: {invalid_arrival_count}")
if invalid_arrival_count > 0:
    # List the ten most frequent unknown codes.
    unknown_arrivals = pnr_collapsed.loc[invalid_arrival_mask, 'scheduled_arrival_station_code']
    print(unknown_arrivals.value_counts().head(10))

Invalid arrival codes count: 0


In [84]:
# Number of rows left after collapsing.
len(pnr_collapsed)

644315

In [85]:
import duckdb
from pathlib import Path

# Target database file; make sure its folder exists before connecting.
db_path = Path('../sql_databases/skyhack.duckdb')
db_path.parent.mkdir(parents=True, exist_ok=True)

con = duckdb.connect(str(db_path))
try:
    # Register the DataFrame in this session so SQL can refer to it by name.
    con.register('PNRFlight_df', pnr_collapsed)

    # Create/replace the table in the DB file from the registered DataFrame.
    con.execute("""
    CREATE OR REPLACE TABLE PNRFlight AS
    SELECT * FROM PNRFlight_df
""")
finally:
    # Always release the connection, even when the export fails part-way.
    con.close()
