In [78]:
import pandas as pd
# Load the PNR remark-level CSV; the path is relative to the notebook's working directory.
pnr_remarks_path = '../Data/PNR Remark Level Data.csv'
pnr_remarks = pd.read_csv(pnr_remarks_path)

# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(pnr_remarks.shape)
pnr_remarks.head()


(51698, 4)


Unnamed: 0,record_locator,pnr_creation_date,flight_number,special_service_request
0,PNR_153489,2025-07-04,308,Airport Wheelchair
1,PNR_153489,2025-07-04,2182,Airport Wheelchair
2,PNR_153508,2025-07-10,2649,Airport Wheelchair
3,PNR_153508,2025-07-10,2100,Airport Wheelchair
4,PNR_153521,2025-03-01,970,Unaccompanied Minor


In [79]:
# Text columns: trimmed, stored as pandas StringDtype, with placeholder values turned into <NA>
str_cols = ['record_locator', 'flight_number', 'special_service_request']
null_tokens = ['nan', 'none', 'null']

for col in str_cols:
    cleaned = pnr_remarks[col].astype('string').str.strip()
    # Empty strings (after trimming) become <NA>
    cleaned = cleaned.mask(cleaned.str.len() == 0)
    # Textual stand-ins for a missing value, in any letter case, become <NA>
    cleaned = cleaned.mask(cleaned.str.lower().isin(null_tokens))
    # Only the SSR labels are upper-cased; the other text columns keep their case
    pnr_remarks[col] = cleaned.str.upper() if col == 'special_service_request' else cleaned

# The date column is parsed on its own; values that cannot be parsed are coerced to NaT
pnr_remarks['pnr_creation_date'] = pd.to_datetime(pnr_remarks['pnr_creation_date'], errors='coerce')

# Missing-value count per column (three text columns first, then the date column)
missing = {
    col: pnr_remarks[col].isna().sum()
    for col in str_cols + ['pnr_creation_date']
}

for col, count in missing.items():
    print(f"{col:10} : {count}")

# Preview the cleaned frame as the cell output
pnr_remarks.head()

record_locator : 0
flight_number : 0
special_service_request : 0
pnr_creation_date : 0


Unnamed: 0,record_locator,pnr_creation_date,flight_number,special_service_request
0,PNR_153489,2025-07-04,308,AIRPORT WHEELCHAIR
1,PNR_153489,2025-07-04,2182,AIRPORT WHEELCHAIR
2,PNR_153508,2025-07-10,2649,AIRPORT WHEELCHAIR
3,PNR_153508,2025-07-10,2100,AIRPORT WHEELCHAIR
4,PNR_153521,2025-03-01,970,UNACCOMPANIED MINOR


In [80]:
# Compare the number of rows with the number of distinct record locators
total_rows = len(pnr_remarks)
unique_record_locators = pnr_remarks['record_locator'].nunique()

# A ratio above 1 means record locators repeat across rows
print(
    f"Total rows: {total_rows}",
    f"Unique record locators: {unique_record_locators}",
    f"Average SSR entries per record_locator: {total_rows / unique_record_locators:.2f}",
    sep='\n',
)

Total rows: 51698
Unique record locators: 16850
Average SSR entries per record_locator: 3.07


In [81]:
# Count the distinct flight numbers present in the remarks (missing values are not counted)
flight_numbers = pnr_remarks['flight_number']
unique_flights = flight_numbers.nunique()
print(f"Unique flights: {unique_flights}")

Unique flights: 4678


In [82]:
# Rows that repeat an earlier row on every column
is_repeated_row = pnr_remarks.duplicated()
duplicate_rows = pnr_remarks.loc[is_repeated_row]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")

# Note: several rows can share one record locator, since passengers on the same booking
# share the same booking reference number.


Number of duplicate rows: 0


In [83]:
key_cols = ['record_locator', 'pnr_creation_date']

# keep=False flags every row of a repeated key group, not just the later occurrences
duplicates_mask = pnr_remarks.duplicated(subset=key_cols, keep=False)
duplicates = (
    pnr_remarks
    .loc[duplicates_mask]
    .sort_values(by=key_cols)
)

# Author's reading of the preview: rows sharing a locator and creation date differ by
# flight number, i.e. different flights going to different destinations.
print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(10)

Number of duplicate rows (based on ['record_locator', 'pnr_creation_date']): 48348


Unnamed: 0,record_locator,pnr_creation_date,flight_number,special_service_request
21375,PNR_100007,2025-06-17,1528,AIRPORT WHEELCHAIR
21376,PNR_100007,2025-06-17,2011,AIRPORT WHEELCHAIR
21377,PNR_100007,2025-06-17,1118,AIRPORT WHEELCHAIR
21378,PNR_100007,2025-06-17,62,AIRPORT WHEELCHAIR
21379,PNR_100007,2025-06-17,844,AIRPORT WHEELCHAIR
21380,PNR_100081,2025-06-18,4707,AIRPORT WHEELCHAIR
21381,PNR_100081,2025-06-18,2378,AIRPORT WHEELCHAIR
21383,PNR_100187,2025-07-12,1898,AIRPORT WHEELCHAIR
21384,PNR_100187,2025-07-12,5772,AIRPORT WHEELCHAIR
21385,PNR_100187,2025-07-12,617,AIRPORT WHEELCHAIR


In [84]:
key_cols = ['record_locator', 'pnr_creation_date', 'flight_number']

# keep=False flags every row of a repeated key group, not just the later occurrences
duplicates_mask = pnr_remarks.duplicated(subset=key_cols, keep=False)
duplicates = (
    pnr_remarks
    .loc[duplicates_mask]
    .sort_values(by=key_cols)
)

# Author's reading: most probably several family members on the same booking travelling
# on the same flight to the same destination.
print(f"Number of duplicate rows (based on {key_cols}): {len(duplicates)}")
duplicates.head(10)

Number of duplicate rows (based on ['record_locator', 'pnr_creation_date', 'flight_number']): 5350


Unnamed: 0,record_locator,pnr_creation_date,flight_number,special_service_request
21404,PNR_100211,2025-07-12,2226,AIRPORT WHEELCHAIR
21408,PNR_100211,2025-07-12,2226,MANUAL WHEELCHAIR
21409,PNR_100211,2025-07-12,2306,MANUAL WHEELCHAIR
21410,PNR_100211,2025-07-12,2306,AIRPORT WHEELCHAIR
21405,PNR_100211,2025-07-12,441,AIRPORT WHEELCHAIR
21406,PNR_100211,2025-07-12,441,MANUAL WHEELCHAIR
21407,PNR_100211,2025-07-12,539,MANUAL WHEELCHAIR
21411,PNR_100211,2025-07-12,539,AIRPORT WHEELCHAIR
21420,PNR_100388,2025-05-20,2389,AIRPORT WHEELCHAIR
21421,PNR_100388,2025-05-20,2389,MANUAL WHEELCHAIR


In [85]:
import re

# Expected locator format: the prefix "PNR_" followed only by digits
pattern = r'^PNR_\d+$'

# True where the locator matches the pattern; missing values count as non-matching (na=False)
mask_valid_locator = pnr_remarks['record_locator'].str.match(pattern, na=False)

# Every row is either valid or invalid, so the invalid count is the remainder
valid_count = mask_valid_locator.sum()
invalid_count = len(mask_valid_locator) - valid_count

print(f"Valid record_locators: {valid_count}")
print(f"Invalid record_locators: {invalid_count}")

Valid record_locators: 51698
Invalid record_locators: 0


In [86]:
# Frequency of each SSR value; dropna=False keeps missing values as their own bucket
ssr_column = pnr_remarks['special_service_request']
ssr_counts = ssr_column.value_counts(dropna=False)
print(ssr_counts)

special_service_request
AIRPORT WHEELCHAIR     45738
MANUAL WHEELCHAIR       3641
UNACCOMPANIED MINOR     1706
ELECTRIC WHEELCHAIR      613
Name: count, dtype: Int64


In [87]:
# Disabled experiment, kept for reference only. The code below sits inside a bare
# triple-quoted string literal, so none of it executes; because the string is the cell's
# last expression, the notebook echoes it back as the cell output.
''' ssr_map = {
    'AIRPORT WHEELCHAIR': 'WHEELCHAIR',
    'MANUAL WHEELCHAIR': 'WHEELCHAIR',              # substituting all wheelchair variants with wheelchair
    'ELECTRIC WHEELCHAIR': 'WHEELCHAIR',
    'UNACCOMPANIED MINOR': 'UNACCOMPANIED_MINOR',
}

pnr_remarks['special_service_request'] = (
    pnr_remarks['special_service_request']
    .replace(ssr_map)
) '''

# We could have mapped every wheelchair variant to a single WHEELCHAIR value, but further
# research indicated that passengers with different wheelchair types need different
# resources and services, so the variants are deliberately kept separate.


" ssr_map = {\n    'AIRPORT WHEELCHAIR': 'WHEELCHAIR',\n    'MANUAL WHEELCHAIR': 'WHEELCHAIR',              # substituting all wheelchair variants with wheelchair\n    'ELECTRIC WHEELCHAIR': 'WHEELCHAIR',\n    'UNACCOMPANIED MINOR': 'UNACCOMPANIED_MINOR',\n}\n\npnr_remarks['special_service_request'] = (\n    pnr_remarks['special_service_request']\n    .replace(ssr_map)\n) "

In [88]:
# Display the frame's (rows, columns) shape as the cell output
pnr_remarks.shape

(51698, 4)

In [89]:
import duckdb

# Persist the cleaned frame as the table `pnr_remarks_cleaned` in the project DuckDB file.
# CREATE OR REPLACE overwrites any existing table of that name, so the cell can be re-run.
create_table_sql = """
    CREATE OR REPLACE TABLE pnr_remarks_cleaned AS
    SELECT * FROM pnr_remarks_df
"""

# connecting to the database
con = duckdb.connect('../sql_databases/skyhack.duckdb')
try:
    # Expose the DataFrame to SQL under the name the query selects from
    con.register('pnr_remarks_df', pnr_remarks)

    # creating the table
    con.execute(create_table_sql)
finally:
    # Always release the connection, even if registering or creating the table fails;
    # otherwise the database file stays open in the kernel after an error.
    con.close()