In [43]:
import pandas as pd
airports = pd.read_csv('../Data/Airports Data.csv')
airports = airports.map(lambda x: x.strip() if isinstance(x, str) else x)
print("airports shape: ",airports.shape)
airports.head()


airports shape:  (5613, 2)


Unnamed: 0,airport_iata_code,iso_country_code
0,DAU,PG
1,MDU,PG
2,MXH,PG
3,MIS,PG
4,TIZ,PG


In [44]:
airports['airport_iata_code'].nunique()  # all unique iata codes


5613

In [45]:
duplicate_rows = airports[airports.duplicated()]    #check for duplicate rows
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")

Number of duplicate rows: 0


In [46]:
(airports == '').sum()  # check for empty strings


airport_iata_code    0
iso_country_code     0
dtype: int64

In [47]:
airports.isnull().sum()  #check for null values
# got 15 null values in iso_country_code column 

airport_iata_code     0
iso_country_code     15
dtype: int64

In [48]:
# check for invalid iata codes format
mask_len = airports['airport_iata_code'].str.len() != 3     # Check length != 3

mask_case = airports['airport_iata_code'] != airports['airport_iata_code'].str.upper()     # Check not uppercase 

mask_alpha = ~airports['airport_iata_code'].str.match(r'^[A-Z]{3}$', na=False)   # Check non-alphabetic characters 

mask_iata_invalid = mask_len | mask_case | mask_alpha

invalid_iata_rows = airports[mask_iata_invalid]
print(f"Number of invalid IATA codes: {invalid_iata_rows.shape[0]}")    #all iato codes are of valid format

Number of invalid IATA codes: 0


In [49]:
airports[airports['iso_country_code'].isnull()]  # got the rows with null iso code

Unnamed: 0,airport_iata_code,iso_country_code
321,OND,
322,ERS,
323,WDH,
2628,ADI,
2629,GOG,
2630,GFY,
2631,MPA,
2632,KMP,
2633,LUD,
2634,OKU,


In [50]:
'''checked online, all these missing airports for which iso code is missing are in namibia, for which iso code is NA, 
didnt want to remove them as they are valid airports and make this analysis generalizable outside this 2 week data given to us'''

airports.loc[airports['iso_country_code'].isnull(), 'iso_country_code'] = 'NA'

In [51]:
# check for invalid iso code format
mask_len_iso = airports['iso_country_code'].str.len() != 2
mask_case_iso = airports['iso_country_code'] != airports['iso_country_code'].str.upper()
mask_alpha_iso = ~airports['iso_country_code'].str.match(r'^[A-Z]{2}$', na=False)

mask_iso_invalid = mask_len_iso | mask_case_iso | mask_alpha_iso

invalid_iso_rows = airports[mask_iso_invalid]
print(f"Number of invalid ISO codes: {invalid_iso_rows.shape[0]}")

Number of invalid ISO codes: 0


In [52]:
import duckdb
from pathlib import Path

db_path = Path('../sql_databases/skyhack.duckdb')   # creates file if not exists
con = duckdb.connect(str(db_path))

# Register the DataFrame in this session
con.register('airports_df', airports)

# Create/replace table in the DB file
con.execute("""
    CREATE OR REPLACE TABLE airports_cleaned AS
    SELECT * FROM airports_df
""")

con.close()


In [53]:
if 'AUL' in airports['airport_iata_code'].unique():   # this was added while cleaning the bagData as AUL was a valid airport which was there in that dataset, so i manually added this entry in the csv file.
    print("AUL is in the dataset")
else:
    print("AUL is not in the dataset")

AUL is in the dataset
