In [1]:
import psycopg2
import pandas as pd 

In [2]:
def create_database():
    """Recreate the ``openflight`` database and return a cursor/connection to it.

    Connects to the default ``postgres`` database with autocommit enabled
    (PostgreSQL refuses ``DROP DATABASE`` / ``CREATE DATABASE`` inside a
    transaction block), drops any existing ``openflight`` database, creates
    a fresh empty one, and then opens a new connection to it.

    Returns:
        tuple: ``(cur, conn)`` - a cursor and its psycopg2 connection, both
        bound to the new ``openflight`` database. The caller is responsible
        for closing them.

    Raises:
        psycopg2.Error: if connecting, dropping or creating the database fails.
    """
    # NOTE(review): credentials are hardcoded for a local development server;
    # move them to environment variables before sharing this notebook.
    server_args = dict(
        host="127.0.0.1",
        user="postgres",
        password="password123",
    )

    # connect to default database
    admin_conn = psycopg2.connect(dbname="postgres", **server_args)
    try:
        admin_conn.set_session(autocommit=True)
        admin_cur = admin_conn.cursor()
        # create new database
        admin_cur.execute("DROP DATABASE IF EXISTS openflight;")
        admin_cur.execute("CREATE DATABASE openflight;")
    finally:
        # close connection to default database, even when DROP/CREATE fails,
        # so a failed run does not leave a session open on the server
        admin_conn.close()

    # connect to new database
    conn = psycopg2.connect(dbname="openflight", **server_args)
    cur = conn.cursor()
    return cur, conn

In [3]:
# def drop_tables(cur, conn):
#     for query in drop_table_queries:
#         cur.execute(query)
#         conn.commit()

# def create_tables(cur, conn):
#     for query in create_table_queries:
#         cur.execute(query)
#         conn.commit()

In [4]:
# Load the airline reference data. The path uses a forward slash so it
# resolves on Windows, macOS and Linux alike (a backslash is only a path
# separator on Windows).
airlines = (
    pd.read_csv("open-flight-data/airlines.csv")
    .drop(columns="Alias")
    # Encode the Y/N flag as booleans; any other value becomes missing (NaN).
    .assign(Active=lambda frame: frame["Active"].map({"Y": True, "N": False}))
)

In [5]:
# Preview the first rows: Alias is gone and Active now holds True/False.
airlines.head()

Unnamed: 0,Airline ID,Name,IATA,ICAO,Callsign,Country,Active
0,-1,Unknown,-,,\N,\N,True
1,1,Private flight,-,,,,True
2,2,135 Airways,,GNL,GENERAL,United States,False
3,3,1Time Airline,1T,RNX,NEXTIME,South Africa,True
4,4,2 Sqn No 1 Elementary Flying Training School,,WYT,,United Kingdom,False


In [6]:
# Check column dtypes before ingestion. Active shows up as object rather than
# bool - presumably some rows are neither "Y" nor "N" and map to NaN (verify).
airlines.dtypes

Airline ID     int64
Name          object
IATA          object
ICAO          object
Callsign      object
Country       object
Active        object
dtype: object

In [7]:
# Load only the route columns that are ingested below. The path uses a forward
# slash so it resolves on Windows, macOS and Linux alike (a backslash is only
# a path separator on Windows).
routes = pd.read_csv(
    "open-flight-data/routes.csv",
    usecols=[
        'Airline ID',
        'Source airport ID',
        'Destination airport ID',
        'Stops',
        'Equipment'
    ]
)

In [8]:
# The id columns mark missing values with the literal string "\N".
# Replace that marker with None in each id column so the value is treated as
# missing. (Alternatives considered: dropping those rows, or encoding the
# columns with a nullable Int dtype.)
for id_column in ("Airline ID", "Source airport ID", "Destination airport ID"):
    is_null_marker = routes[id_column] == "\\N"
    routes.loc[is_null_marker, id_column] = None

In [9]:
# Materialise the row position as an "index" column; it is the first value of
# each row and therefore becomes route.id on insert. The guard makes the cell
# safe to re-run: an unguarded second reset_index() would add a "level_0"
# column and a third would raise.
if "index" not in routes.columns:
    routes = routes.reset_index()

In [10]:
# Preview the first rows, including the new "index" column.
routes.head()

Unnamed: 0,index,Airline ID,Source airport ID,Destination airport ID,Stops,Equipment
0,0,410,2965,2990,0,CR2
1,1,410,2966,2990,0,CR2
2,2,410,2966,2962,0,CR2
3,3,410,2968,2990,0,CR2
4,4,410,2968,4078,0,CR2


In [11]:
# Check column dtypes; the id columns are object because they hold strings
# (and None where the "\N" marker was replaced above).
routes.dtypes

index                      int64
Airline ID                object
Source airport ID         object
Destination airport ID    object
Stops                      int64
Equipment                 object
dtype: object

In [12]:
# Load the airport reference data and drop the columns that are not ingested.
# The path uses a forward slash so it resolves on Windows, macOS and Linux
# alike (a backslash is only a path separator on Windows).
airports = pd.read_csv("open-flight-data/airports.csv").drop(
    columns=["Tz database time zone", "Type", "Source"]
)

In [13]:
# Timezone marks missing values with the literal string "\N". Blank those
# entries out (NaN) and convert the column to float.
# An equality mask is used instead of .str.replace because:
#   * the default of str.replace's `regex` argument differs between pandas
#     versions, and "\N" is not a valid regular expression;
#   * .str only works on string data, so the original cell failed when re-run
#     after the column had already been converted to float.
timezone_raw = airports["Timezone"]
airports["Timezone"] = timezone_raw.mask(timezone_raw == "\\N").astype("float")

In [14]:
# Preview the first rows; Timezone is now numeric.
airports.head()

Unnamed: 0,Airport ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10.0,U
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10.0,U
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10.0,U
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10.0,U
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10.0,U


#### Ingest data 

In [15]:
# Drops and recreates the openflight database, then returns a cursor and
# connection bound to it. Every cell below uses these two objects.
cur, conn = create_database()

In [16]:
# DDL for the airline table. The column order mirrors the column order of the
# `airlines` DataFrame, because the insert statement further down passes values
# positionally (no column list).
airline_table_create = """
CREATE TABLE IF NOT EXISTS airline(
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    iata VARCHAR,
    icao VARCHAR,
    callsign VARCHAR,
    country VARCHAR,
    active BOOLEAN
);
"""
cur.execute(airline_table_create)
# The connection is not in autocommit mode, so the DDL must be committed.
conn.commit()

In [17]:
# DDL for the airport table. The column order mirrors the column order of the
# `airports` DataFrame, because the insert statement further down passes values
# positionally (no column list).
airport_table_create = """
CREATE TABLE IF NOT EXISTS airport(
    id INTEGER PRIMARY KEY,
    name VARCHAR,
    city VARCHAR,
    country VARCHAR,
    iata VARCHAR,
    icao VARCHAR,
    lat FLOAT,
    long FLOAT,
    altitude INTEGER,
    timezone FLOAT,
    dst VARCHAR
)
"""
cur.execute(airport_table_create)
# The connection is not in autocommit mode, so the DDL must be committed.
conn.commit()

In [18]:
# DDL for the route table. It references airline(id) and airport(id), so the
# airline and airport tables must already exist when this cell runs. The
# foreign keys also mean a route row is rejected when it points at an airline
# or airport id that was not loaded.
route_table_create = """
CREATE TABLE IF NOT EXISTS route(
    id INTEGER PRIMARY KEY,
    airline_id INTEGER REFERENCES airline(id),
    source_airport_id INTEGER REFERENCES airport(id),
    destination_airport_id INTEGER REFERENCES airport(id),
    stops INTEGER,
    equipment VARCHAR
);
"""
cur.execute(route_table_create)
# The connection is not in autocommit mode, so the DDL must be committed.
conn.commit()

In [19]:
# Swap missing Active flags for None, which psycopg2 sends as SQL NULL
# (a float NaN cannot be stored in the BOOLEAN column).
active_is_missing = airlines["Active"].isna()
airlines.loc[active_is_missing, "Active"] = None

In [20]:
# Insert every airline in a single transaction: either all rows are committed
# or, on the first error, the whole batch is rolled back and the error printed.
airline_table_insert = """
INSERT INTO airline 
VALUES (%s, %s, %s, %s, %s, %s, %s);
"""
try:
    for row_label, row in airlines.iterrows():
        # pandas represents missing cells as float NaN, which psycopg2 sends as
        # the floating-point value 'NaN' (ending up as the text "NaN" in the
        # VARCHAR columns). Convert missing cells to None so they become NULL.
        values = [None if pd.isna(value) else value for value in row]
        cur.execute(airline_table_insert, values)
    conn.commit()
except Exception as exc:
    # Roll back the transaction
    conn.rollback()
    # Log the exception
    print("Error occurred during transaction: ", exc)
    print("Type of exception: ", type(exc))

In [21]:
# Insert every airport in a single transaction: either all rows are committed
# or, on the first error, the whole batch is rolled back and the error printed.
airport_table_insert = """
INSERT INTO airport 
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""
try:
    for row_label, row in airports.iterrows():
        # pandas represents missing cells as float NaN, which psycopg2 sends as
        # the floating-point value 'NaN' instead of NULL (e.g. a missing
        # Timezone, or the text "NaN" in VARCHAR columns). Convert missing
        # cells to None so they are stored as NULL.
        values = [None if pd.isna(value) else value for value in row]
        cur.execute(airport_table_insert, values)
    conn.commit()
except Exception as exc:
    # Roll back the transaction
    conn.rollback()
    # Log the exception (same reporting as the airline insert)
    print("Error occurred during transaction: ", exc)
    print("Type of exception: ", type(exc))

In [22]:
# Insert routes one row per transaction. Rows whose airline or airport id is
# not present in the referenced tables violate a foreign key; those rows are
# skipped and collected in `dropped_rows` for later inspection.
route_table_insert = """
INSERT INTO route 
VALUES (%s, %s, %s, %s, %s, %s);
"""
dropped_rows = []
for row_label, row in routes.iterrows():
    # pandas represents missing cells as float NaN, which psycopg2 sends as the
    # floating-point value 'NaN' (ending up as the text "NaN" in VARCHAR
    # columns such as equipment). Convert missing cells to None -> NULL.
    values = [None if pd.isna(value) else value for value in row]
    try:
        cur.execute(route_table_insert, values)
        conn.commit()
    except psycopg2.errors.ForeignKeyViolation as exc:
        # Roll back the transaction
        conn.rollback()
        # Log the exception
        print("Error: ", exc)
        print("Skipped no reference record:", row)
        dropped_rows.append(row)
# Drawback of this approach: PostgreSQL aborts the whole transaction on any
# error, so the failed transaction must be rolled back before the next insert;
# committing row by row keeps the rows that were already accepted.


Error:  insert or update on table "route" violates foreign key constraint "route_destination_airport_id_fkey"
DETAIL:  Key (destination_airport_id)=(7167) is not present in table "airport".

Skipped no reference record: index                      170
Airline ID                 146
Source airport ID         3531
Destination airport ID    7167
Stops                        0
Equipment                  BNI
Name: 170, dtype: object
Error:  insert or update on table "route" violates foreign key constraint "route_destination_airport_id_fkey"
DETAIL:  Key (destination_airport_id)=(7168) is not present in table "airport".

Skipped no reference record: index                      171
Airline ID                 146
Source airport ID         3531
Destination airport ID    7168
Stops                        0
Equipment                  BNI
Name: 171, dtype: object
Error:  insert or update on table "route" violates foreign key constraint "route_destination_airport_id_fkey"
DETAIL:  Key (destination_ai

In [23]:
# Number of routes skipped because of a foreign-key violation.
len(dropped_rows)

476

In [24]:
# Sanity check: count the rows that were actually stored in the route table.
cur.execute("""
SELECT COUNT(1)
FROM route
;
""")
# fetchone() returns the single result row as a tuple.
results = cur.fetchone()

In [25]:
# One-element tuple holding the route row count.
results

(67187,)