# Flightly

Modelling 2015 US commercial airline flight queries using SQL and Neo4j

### Setup Neo4j Query Object Wrapper
This wrapper returns the result of a query and also measures how long a given number of repeated executions of that query takes.

In [85]:
# Number of lines of flights.csv read by the Neo4j and Postgres load queries.
LINE_LIMIT = 10000

In [43]:
from neo4j import GraphDatabase
import timeit

# (bolt URL, user, password) unpacked by NeoQuery.__init__ to open the driver.
# NOTE(review): credentials are hardcoded here; consider reading them from the environment.
neoconn = ('bolt://localhost:7687', 'neo4j', 'password')

class NeoQuery(object):
    """Run one Cypher query against Neo4j and time repeated executions of it.

    Each instance opens its own driver from the module-level ``neoconn``
    tuple. The driver is closed at the end of ``query``, so an instance is
    intended for a single ``query`` call.
    """

    def __init__(self, query):
        """Open a driver for ``neoconn`` and remember the Cypher text.

        Args:
            query: Cypher statement to execute.
        """
        url, user, password = neoconn
        self._driver = GraphDatabase.driver(url, auth=(user, password))
        self._query = query

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def query(self, times=1):
        """Execute the query once for its results, then time ``times`` more runs.

        The statement therefore runs ``times + 1`` times in total; pass
        ``times=0`` to run it exactly once (e.g. for statements that write).

        Args:
            times: Number of timed executions handed to ``timeit.timeit``.

        Returns:
            Tuple ``(results, elapsed)``: the list of records from the first
            run and the total time reported by ``timeit`` for the timed runs.
        """
        try:
            results = self._run_cypher()
            elapsed = timeit.timeit(self._run_cypher, number=times)
        finally:
            # Release the driver even when the query raises.
            self.close()
        return results, elapsed

    def _run_cypher(self):
        """Run the stored query in a fresh session and return its records as a list."""
        with self._driver.session() as session:
            # Materialise the records inside the ``with`` block: the result is
            # tied to the session and may not be readable once it is closed.
            return list(session.run(self._query))

### Setup Postgres Query Wrapper

In [44]:
import os

# Working directory of the notebook; the Postgres COPY statements build their
# CSV file paths from it. Uses os.getcwd() rather than the IPython-only `!pwd`.
current_path = os.getcwd()

import psycopg2
from psycopg2 import Error
import timeit

# (user, password, host, port, database) unpacked by PostgresQuery.__init__.
# NOTE(review): connection settings are hardcoded for one local setup.
pgconn = ('kevin', '', '127.0.0.1', '5432', 'flightly')

class PostgresQuery(object):
    """Run one SQL statement against Postgres and time repeated executions of it.

    Each instance opens its own connection and cursor from the module-level
    ``pgconn`` tuple. Both are closed at the end of ``query``, so an instance
    is intended for a single ``query`` call.
    """

    def __init__(self, query):
        """Connect using ``pgconn`` and remember the SQL text.

        A connection failure is printed rather than raised; ``query`` then
        reports the missing connection and returns ``None``.

        Args:
            query: SQL text to execute.
        """
        user, password, host, port, database = pgconn
        self._query = query
        # Defined up front so a failed connect leaves a well-formed object
        # instead of one that raises AttributeError later.
        self._connection = None
        self._cursor = None
        try:
            self._connection = psycopg2.connect(
                user=user,
                password=password,
                host=host,
                port=port,
                database=database,
            )
            self._cursor = self._connection.cursor()
        except Exception as error:
            print("Failed to connect to PostgreSQL DB", error)

    def query(self, times=1):
        """Execute the statement once for its rows, then time ``times`` more runs.

        The statement therefore runs ``times + 1`` times in total; pass
        ``times=0`` to run it exactly once. The transaction is committed
        after the timed runs, and the cursor and connection are always closed.

        Args:
            times: Number of timed executions handed to ``timeit.timeit``.

        Returns:
            Tuple ``(results, elapsed)`` on success. ``None`` if there is no
            open connection or execution fails (the error is printed).
        """
        if self._cursor is None:
            print("Error while executing SQL command", "no open PostgreSQL connection")
            self._release()
            return None
        try:
            results = self._run_sql()
            elapsed = timeit.timeit(self._run_sql, number=times)
            self._connection.commit()
            return results, elapsed
        except Exception as error:
            print("Error while executing SQL command", error)
            return None
        finally:
            self._release()

    def _release(self):
        """Close the cursor and the connection if they were opened."""
        if self._cursor is not None:
            self._cursor.close()
        if self._connection is not None:
            self._connection.close()

    def _run_sql(self):
        """Execute the stored SQL and return its rows, or ``[]`` if it yields none."""
        self._cursor.execute(self._query)
        # ``description`` is None when the last statement produced no result
        # set (DDL, COPY, ...), so there is nothing to fetch.
        if self._cursor.description is None:
            return []
        return self._cursor.fetchall()

### Initialization queries
Delete all of the existing nodes and relationships to start with a clean slate.
Then load all of the CSV files into the database.

In [45]:
# Cypher statements that reset the graph and load the three CSV files.
# They run in insertion order: wipe first, then airlines, airports, flights.
init_queries = {
    'delete_all': """
                   MATCH (n)
                   DETACH DELETE n
                   """,
    'load_airlines': """
            LOAD CSV FROM 'file:///flightly/airlines.csv' AS line
            CREATE (:Airline { name: line[1], iata: line[0]})
            """,
    'load_airports': """
            LOAD CSV FROM 'file:///flightly/airports.csv' AS line
            CREATE (:Airport { name: line[1], iata: line[0], city: line[2], state: line[3], country: line[4], latitude: line[5], longitude: line[6]})
            """,
    # One Flight node per CSV line. Tail nodes are MERGEd so each tail number
    # exists once; a CREATE per line would duplicate them and multiply the
    # USES relationships built later. Lines without a tail number are skipped
    # for the Tail step because MERGE rejects a null property value.
    'load_flights': f"""
            USING PERIODIC COMMIT 1000
            LOAD CSV FROM 'file:///flightly/flights.csv' AS line
            WITH line LIMIT {LINE_LIMIT}
            CREATE (:Flight {{ 
                year: line[0],
                month: line[1],
                day: line[2],
                day_of_week: line[3],
                airline: line[4],
                flight_number: line[5],
                tail_number: line[6],
                origin_airport: line[7],
                destination_airport: line[8],
                scheduled_departure: line[9],
                departure_time: line[10],
                departure_delay: line[11],
                taxi_out: line[12],
                wheels_off: line[13],
                scheduled_time: line[14],
                elapsed_time: line[15],
                air_time: line[16],
                distance: line[17],
                wheels_on: line[18],
                taxi_in: line[19],
                scheduled_arrival: line[20],
                arrival_time: line[21],
                arrival_delay: line[22],
                diverted: line[23],
                cancelled: line[24],
                cancellation_reason: line[25],
                air_system_delay: line[26],
                security_delay: line[27],
                airline_delay: line[28],
                late_aircraft_delay: line[29],
                weather_delay: line[30]
            }})
            WITH line
            WHERE line[6] IS NOT NULL
            MERGE (:Tail {{
                number: line[6]
            }})
            """
}

# query(0) executes each statement exactly once (no timed repetitions).
for cypher in init_queries.values():
    NeoQuery(cypher).query(0)

In [46]:
# SQL statements that recreate the public schema, create the three tables
# (every column stored as text) and bulk-load the CSV files with COPY.
# They run in insertion order.
init_queries = {
    'drop_all_tables': """
        DROP SCHEMA public CASCADE;
        CREATE SCHEMA public;
        
        GRANT ALL ON SCHEMA public TO postgres;
        GRANT ALL ON SCHEMA public TO public;
        """,
    'create_airlines_table': """
        CREATE TABLE IF NOT EXISTS airlines (
          iata text,
          name text);
        """,
    'create_airports_table': """
        CREATE TABLE IF NOT EXISTS airports (
          iata text,
          name text,
          city text,
          state text,
          country text,
          latitude text,
          longitude text
          );
        """,
    'create_flights_table': """
        CREATE TABLE IF NOT EXISTS flights (
            year text,
            month text,
            day text,
            day_of_week text,
            airline text,
            flight_number text,
            tail_number text,
            origin_airport text,
            destination_airport text,
            scheduled_departure text,
            departure_time text,
            departure_delay text,
            taxi_out text,
            wheels_off text,
            scheduled_time text,
            elapsed_time text,
            air_time text,
            distance text,
            wheels_on text,
            taxi_in text,
            scheduled_arrival text,
            arrival_time text,
            arrival_delay text,
            diverted text,
            cancelled text,
            cancellation_reason text,
            air_system_delay text,
            security_delay text,
            airline_delay text,
            late_aircraft_delay text,
            weather_delay text
        );
        """,
    'load_airlines_csv': f"""
        COPY airlines
        FROM '{current_path}/csv/airlines.csv' DELIMITER ',' HEADER CSV;
        """,
    'load_airports_csv': f"""
        COPY airports
        FROM '{current_path}/csv/airports.csv' DELIMITER ',' HEADER CSV;
        """,
    'load_flights_csv': f"""
        COPY flights
        FROM PROGRAM 'head -{LINE_LIMIT} {current_path}/csv/flights.csv' DELIMITER ',' HEADER CSV;
        """
}

# Passing 0 makes each statement execute once, with no timed repetitions.
for sql in init_queries.values():
    PostgresQuery(sql).query(0)

### Match queries
Create the relationships that connect each flight to its origin airport, destination airport, operating airline, and tail number.

In [47]:
# Cypher statements that turn the foreign-key-like Flight properties into
# relationships: Airport-HAS_DEPARTURE->Flight, Flight-FLIES_TO->Airport,
# Flight-OPERATED_BY->Airline and Flight-USES->Tail.
match_queries = {
    'match_origin_airport': """
            MATCH (fl:Flight),(ap:Airport)
            WHERE fl.origin_airport = ap.iata
            CREATE (ap)-[r:HAS_DEPARTURE]->(fl)
            RETURN type(r)
            """,

    'match_destination_airport': """
            MATCH (fl:Flight),(ap:Airport)
            WHERE fl.destination_airport = ap.iata
            CREATE (fl)-[:FLIES_TO]->(ap)
            """,
    'match_airline': """
            MATCH (fl:Flight),(al:Airline)
            WHERE fl.airline = al.iata
            CREATE (fl)-[:OPERATED_BY]->(al)
            """,
    'match_tail': """
            MATCH (fl:Flight),(tl:Tail)
            WHERE fl.tail_number = tl.number
            CREATE (fl)-[:USES]->(tl)
            """
}

# Passing 0 makes each statement execute once, with no timed repetitions.
for cypher in match_queries.values():
    NeoQuery(cypher).query(0)

### Single table queries

In [119]:
# all flights out of ORD

nresults, ntime = NeoQuery(
    """
    MATCH (fl:Flight {origin_airport: 'ORD'})
    RETURN fl.flight_number
    """
).query(100)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights where origin_airport = 'ORD'
    """
).query(100)

# Total time of the 100 timed runs per database, then the first column of
# each result set.
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

1.1077358109996567
0.4192824809997546


[]

In [126]:
# all flights with a delay of more than 100 minutes
# (delays are stored as text in both databases, hence the casts)

nresults, ntime = NeoQuery(
    """
    MATCH (fl:Flight)
    WHERE toInteger(fl.departure_delay) > 100
    RETURN fl.flight_number
    """
).query(100)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE CAST (departure_delay AS INTEGER) > 100
    """
).query(100)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

1.8421962289994553
0.9709363669990125
['1057', '824', '5976', '5547', '247', '1205', '943', '6420', '1492', '1966', '2522', '3432', '1207', '595', '705', '1195', '531', '1109', '2393', '110', '5242', '1499', '2567', '240', '4978', '3047', '2413', '6283', '337', '6458', '395', '974', '416', '567', '992', '258', '5518', '164', '749', '4712', '6323', '1623', '442', '6279', '4992', '169', '2591', '56', '3200', '1590', '5387', '43', '5315', '3492', '1532', '1221', '570', '5561', '5529', '1422', '2413', '2055', '239', '597', '4671', '1410', '5365', '123', '2393', '4479', '2389', '6183', '1531', '452', '4619', '4556', '1683', '5160', '1189', '1218', '3274', '306', '5919', '2317', '510', '1081', '2072', '7', '5384', '4264', '2773', '1437', '4540', '1605', '3214', '170', '5502', '5163', '5578', '6161', '820', '395', '1180', '2417', '4479', '5', '4706', '5565', '2470', '399', '4447', '5315', '6011', '5183', '1391', '1263', '785', '489', '690', '5421', '1697', '821', '5997', '5204', '4654', '2773

[]

### Using Relationships

In [130]:
# flights departing from Atlanta and arriving in Charlotte

nresults, ntime = NeoQuery(
    """
    MATCH (atl:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(clt:Airport {iata: 'CLT'})
    RETURN fl.flight_number
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE origin_airport = 'ATL' AND destination_airport = 'CLT'
    """
).query(200)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

0.47477876500124694
0.8188420369988307
['329', '847', '1136', '1267', '335', '343', '331', '337', '463', '317', '325', '689', '711', '345', '741', '315']


[]

In [142]:
# flights departing from Chicago and arriving in Los Angeles operated by United

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(ua:Airline {iata: 'UA'})
    RETURN fl.flight_number
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE origin_airport = 'ORD' AND destination_airport = 'LAX' AND airline = 'UA'
    """
).query(200)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

0.5618747160006023
0.8553509030007262
['1004', '1058', '1151', '1665', '661', '521', '865']


[]

In [None]:
# flights departing from Chicago and arriving in Los Angeles operated by United with Tail Number N33209

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(ua:Airline {iata: 'UA'}),
          (fl)-[:USES]->(tl:Tail {number: 'N33209'})
    RETURN DISTINCT fl.flight_number
    """
).query(200)

# DISTINCT mirrors the Cypher query so both databases do the same work.
presults, ptime = PostgresQuery(
    """
    SELECT DISTINCT flight_number from flights WHERE origin_airport = 'ORD' AND destination_airport = 'LAX'
    AND airline = 'UA' AND tail_number = 'N33209'
    """
).query(200)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

### Joining across tables

In [148]:
# get names of the airlines operating flights from Chicago to Los Angeles

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(al:Airline)
    RETURN DISTINCT al.name
    """
).query(200)

# DISTINCT mirrors the Cypher query; without it Postgres returns one row per
# flight instead of one row per airline.
presults, ptime = PostgresQuery(
    """
    SELECT DISTINCT name from airlines INNER JOIN flights ON (airlines.iata = flights.airline)
    WHERE origin_airport = 'ORD' AND destination_airport = 'LAX'
    """
).query(200)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

0.6086883070001932
0.9530343480000738
['American Airlines Inc.', 'Spirit Air Lines', 'United Air Lines Inc.', 'Virgin America']


[]

In [152]:
# get the names of destination airports from all flights originating in Hawaii

nresults, ntime = NeoQuery(
    """
    MATCH (hi:Airport {state: 'HI'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(ap:Airport)
    RETURN DISTINCT ap.name
    """
).query(200)

# airports is joined twice: once to filter on the origin's state and once to
# read the destination's name. Joining only on origin_airport would return
# the names of the Hawaiian origin airports instead of the destinations.
presults, ptime = PostgresQuery(
    """
    SELECT DISTINCT dest.name from flights
    INNER JOIN airports AS origin ON (origin.iata = flights.origin_airport)
    INNER JOIN airports AS dest ON (dest.iata = flights.destination_airport)
    WHERE origin.state = 'HI'
    """
).query(200)

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Symmetric difference: values returned by only one of the two databases.
# An empty list means both result sets contain the same values.
list(set(presults) ^ set(nresults))

0.5896564880004007
1.1598360319985659
['Hilo International Airport', 'Guam International Airport', 'Washington Dulles International Airport', 'John F. Kennedy International Airport\xa0(New York International Airport)', 'Kahului Airport', 'Seattle-Tacoma International Airport', 'Phoenix Sky Harbor International Airport', 'Sacramento International Airport', 'Honolulu International Airport', 'San Francisco International Airport', 'McCarran International Airport', 'Portland International Airport', "Chicago O'Hare International Airport", 'Norman Y. Mineta San José International Airport', 'Los Angeles International Airport', 'Bellingham International Airport', 'Kona International Airport at Keahole', 'Lihue Airport', 'Oakland International Airport', 'San Diego International Airport\xa0(Lindbergh Field)']


[]

In [27]:
# Flight nodes that depart from ATL and are operated by AA (Neo4j only).
atl_aa_cypher = """
    MATCH (ord:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:OPERATED_BY]->(al:Airline {iata: 'AA'})
    RETURN fl
    """
results, time = NeoQuery(atl_aa_cypher).query(500)

print(time)
[row['fl']['flight_number'] for row in results]

2.3094601739999234


['356', '1473', '1652', '1635', '314', '17', '1513', '1455', '194']

In [26]:
# Flight nodes that depart from ATL and fly to CLT (Neo4j only).
atl_clt_cypher = """
    MATCH (:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(ar:Airport {iata: 'CLT'})
    RETURN fl
    """
results, time = NeoQuery(atl_clt_cypher).query(500)

print(time)
[row['fl']['flight_number'] for row in results]

1.3964611820001664


['1808', '1095', '461', '889', '1999', '2065', '1441', '2133']

Query for all flights with a specific tail number, matched through the `USES` relationship to a `Tail` node

In [93]:
# Flights of tail N129DL, found by following USES to the Tail node.
tail_rel_cypher = """
    MATCH (ar:Airport)-[:HAS_DEPARTURE]->(fl:Flight)-[:USES]->(tl:Tail {number: 'N129DL'})
    RETURN distinct fl
    """
results, time = NeoQuery(tail_rel_cypher).query(200)

print(time)
list(results)

1.1943601919992943


[<Record fl=<Node id=466784 labels={'Flight'} properties={'wheels_off': '1422', 'taxi_in': '7', 'distance': '2139', 'year': '2015', 'scheduled_time': '269', 'scheduled_arrival': '2139', 'diverted': '0', 'taxi_out': '12', 'tail_number': 'N129DL', 'elapsed_time': '258', 'air_time': '239', 'airline': 'DL', 'departure_time': '1410', 'day': '1', 'departure_delay': '0', 'day_of_week': '4', 'arrival_time': '2128', 'wheels_on': '2121', 'flight_number': '61', 'origin_airport': 'SFO', 'destination_airport': 'ATL', 'month': '1', 'cancelled': '0', 'scheduled_departure': '1410', 'arrival_delay': '-11'}>>,
 <Record fl=<Node id=460084 labels={'Flight'} properties={'wheels_off': '1100', 'taxi_in': '8', 'distance': '2139', 'year': '2015', 'scheduled_time': '330', 'scheduled_arrival': '1310', 'diverted': '0', 'taxi_out': '17', 'tail_number': 'N129DL', 'elapsed_time': '302', 'air_time': '277', 'airline': 'DL', 'departure_time': '1043', 'day': '1', 'departure_delay': '3', 'day_of_week': '4', 'arrival_time

Query for all flights with a specific tail number by filtering on the flight's `tail_number` property instead of the `Tail` relationship; this version is slower

In [94]:
# Flights of tail N129DL, found by filtering on the Flight.tail_number property.
tail_prop_cypher = """
    MATCH (ar:Airport)-[:HAS_DEPARTURE]->(fl:Flight {tail_number: 'N129DL'})
    RETURN fl
    """
results, time = NeoQuery(tail_prop_cypher).query(200)

print(time)
list(results)

3.127421900000627


[<Record fl=<Node id=460084 labels={'Flight'} properties={'wheels_off': '1100', 'taxi_in': '8', 'distance': '2139', 'year': '2015', 'scheduled_time': '330', 'scheduled_arrival': '1310', 'diverted': '0', 'taxi_out': '17', 'tail_number': 'N129DL', 'elapsed_time': '302', 'air_time': '277', 'airline': 'DL', 'departure_time': '1043', 'day': '1', 'departure_delay': '3', 'day_of_week': '4', 'arrival_time': '1245', 'wheels_on': '1237', 'flight_number': '1680', 'origin_airport': 'ATL', 'destination_airport': 'SFO', 'month': '1', 'cancelled': '0', 'scheduled_departure': '1040', 'arrival_delay': '-25'}>>,
 <Record fl=<Node id=466784 labels={'Flight'} properties={'wheels_off': '1422', 'taxi_in': '7', 'distance': '2139', 'year': '2015', 'scheduled_time': '269', 'scheduled_arrival': '2139', 'diverted': '0', 'taxi_out': '12', 'tail_number': 'N129DL', 'elapsed_time': '258', 'air_time': '239', 'airline': 'DL', 'departure_time': '1410', 'day': '1', 'departure_delay': '0', 'day_of_week': '4', 'arrival_ti