# Flightly

Modelling 2015 US commercial airline flight queries using SQL and Neo4j

### Setup Neo4j Query Object Wrapper
This wrapper returns the result of a query and also measures how long it takes to run that query a given number of times.

In [1]:
# Cap on the number of flights.csv lines read by the flight-loading queries below.
LINE_LIMIT = 20000

In [2]:
from neo4j import GraphDatabase
import timeit

# Neo4j connection settings as (bolt URL, user, password); NeoQuery.__init__ unpacks them in this order.
# NOTE(review): the password is hardcoded here — consider reading it from the environment instead.
neoconn = ('bolt://localhost:7687', 'neo4j', 'password')

class NeoQuery(object):
    """Run one Cypher statement against Neo4j and time repeated executions.

    Connection settings are read from the module-level ``neoconn`` tuple
    (bolt URL, user, password). Every instance opens its own driver, which
    :meth:`query` closes again once it is done.
    """

    def __init__(self, query):
        """Open a driver and remember the Cypher text to execute.

        Args:
            query: Cypher statement that :meth:`query` will run.
        """
        url, user, password = neoconn
        self._driver = GraphDatabase.driver(url, auth=(user, password))
        self._query = query

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def query(self, times=1):
        """Run the statement once for its records, then time ``times`` more runs.

        Args:
            times: Number of timed executions passed to ``timeit``. With ``0``
                the statement still runs exactly once (the untimed run).

        Returns:
            A ``(records, seconds)`` tuple: the list of records from the
            untimed run and the total time ``timeit`` measured for the
            ``times`` timed runs.
        """
        try:
            results = self._run_cypher()
            elapsed = timeit.timeit(self._run_cypher, number=times)
        finally:
            # Release the driver even when the statement raises.
            self.close()
        return results, elapsed

    def _run_cypher(self):
        """Execute the statement in a fresh session and return its records as a list."""
        with self._driver.session() as session:
            # A Result is tied to its session, so read every record while the
            # session is still open; depending on the driver version, unread
            # records are discarded or become inaccessible once it closes.
            return list(session.run(self._query))

### Setup Postgres Query Wrapper

In [3]:
import os

# Working directory of the notebook kernel; the PostgreSQL COPY statements
# below build their CSV paths from it. Pure Python replaces the IPython-only
# `!pwd` shell capture so the cell no longer depends on a shell.
current_path = os.getcwd()

import psycopg2
from psycopg2 import Error
import timeit

# PostgreSQL connection settings as (user, password, host, port, database);
# PostgresQuery.__init__ unpacks them in this order.
pgconn = ('kevin', '', '127.0.0.1', '5432', 'flightly')

class PostgresQuery(object):
    """Run one SQL statement against PostgreSQL and time repeated executions.

    Connection settings are read from the module-level ``pgconn`` tuple
    (user, password, host, port, database). Every instance opens its own
    connection and cursor, both of which :meth:`query` closes when it is done.
    """

    def __init__(self, query):
        """Connect to PostgreSQL and remember the SQL text to execute.

        A failed connection is reported with ``print`` rather than raised; the
        instance is then left without a cursor and :meth:`query` returns
        ``None``.

        Args:
            query: SQL statement that :meth:`query` will run.
        """
        user, password, host, port, database = pgconn
        self._query = query
        # Defined before connecting so query() can always inspect them, even
        # when the connection attempt below fails.
        self._connection = None
        self._cursor = None
        try:
            self._connection = psycopg2.connect(
                user=user,
                password=password,
                host=host,
                port=port,
                database=database,
            )
            self._cursor = self._connection.cursor()
        except Error as error:
            print("Failed to connect to PostgreSQL DB", error)

    def query(self, times=1):
        """Run the statement once for its rows, then time ``times`` more runs.

        Args:
            times: Number of timed executions passed to ``timeit``. With ``0``
                the statement still runs exactly once (the untimed run).

        Returns:
            A ``(rows, seconds)`` tuple on success. ``None`` when there is no
            open connection or the statement fails; the error is printed.
        """
        try:
            if self._cursor is None:
                print("Error while executing SQL command", "no open PostgreSQL connection")
                return None
            results = self._run_sql()
            elapsed = timeit.timeit(self._run_sql, number=times)
            self._connection.commit()
            return results, elapsed
        except Exception as error:
            # psycopg2.DatabaseError derives from Exception, so this single
            # clause covers everything the original tuple did.
            print("Error while executing SQL command", error)
            return None
        finally:
            # Either handle may be missing if the connection attempt failed.
            if self._cursor is not None:
                self._cursor.close()
            if self._connection is not None:
                self._connection.close()

    def _run_sql(self):
        """Execute the statement and return its rows, or ``[]`` if it yields none."""
        self._cursor.execute(self._query)
        # cursor.description is None for statements that return no result set
        # (DDL, COPY, ...); calling fetchall() on those would raise.
        if self._cursor.description is None:
            return []
        return self._cursor.fetchall()

### Initialization queries
Delete all existing nodes and relationships to start with a clean slate.
Then load all the CSV files into the database.

In [4]:
# Cypher statements that reset the graph and load the three CSV files.
# They are executed in insertion order by the loop at the end of the cell.
init_queries = {
    # Remove every node together with its relationships.
    'delete_all': """
                   MATCH (n)
                   DETACH DELETE n
                   """,
    # NOTE(review): LOAD CSV without WITH HEADERS treats every line as data,
    # while the PostgreSQL loaders pass HEADER — confirm whether the Neo4j
    # copies of these files still contain a header line.
    'load_airlines': """
            LOAD CSV FROM 'file:///flightly/airlines.csv' AS line
            CREATE (:Airline { name: line[1], iata: line[0]})
            """,
    'load_airports': """
            LOAD CSV FROM 'file:///flightly/airports.csv' AS line
            CREATE (:Airport { name: line[1], iata: line[0], city: line[2], state: line[3], country: line[4], latitude: line[5], longitude: line[6]})
            """,
    # One :Flight node and one :Tail node are created per CSV line, so the
    # same tail number ends up on several :Tail nodes.
    'load_flights': f"""
            USING PERIODIC COMMIT 1000
            LOAD CSV FROM 'file:///flightly/flights.csv' AS line
            WITH line LIMIT {LINE_LIMIT}
            CREATE (:Flight {{ 
                year: line[0],
                month: line[1],
                day: line[2],
                day_of_week: line[3],
                airline: line[4],
                flight_number: line[5],
                tail_number: line[6],
                origin_airport: line[7],
                destination_airport: line[8],
                scheduled_departure: line[9],
                departure_time: line[10],
                departure_delay: line[11],
                taxi_out: line[12],
                wheels_off: line[13],
                scheduled_time: line[14],
                elapsed_time: line[15],
                air_time: line[16],
                distance: line[17],
                wheels_on: line[18],
                taxi_in: line[19],
                scheduled_arrival: line[20],
                arrival_time: line[21],
                arrival_delay: line[22],
                diverted: line[23],
                cancelled: line[24],
                cancellation_reason: line[25],
                air_system_delay: line[26],
                security_delay: line[27],
                airline_delay: line[28],
                late_aircraft_delay: line[29],
                weather_delay: line[30]
            }})
            CREATE (:Tail {{
                number: line[6]
            }})
            """
}

# query(0) performs only the single untimed execution of each statement.
for cypher in init_queries.values():
    NeoQuery(cypher).query(0)

In [5]:
# SQL statements that reset the public schema, create the three tables and
# bulk-load the CSV files. They are executed in insertion order by the loop
# at the end of the cell. Every column is stored as text.
init_queries = {
    # NOTE(review): the first GRANT assumes a role named "postgres" exists — confirm.
    'drop_all_tables': """
        DROP SCHEMA public CASCADE;
        CREATE SCHEMA public;
        
        GRANT ALL ON SCHEMA public TO postgres;
        GRANT ALL ON SCHEMA public TO public;
        """,
    'create_airlines_table': """
        CREATE TABLE IF NOT EXISTS airlines (
          iata text,
          name text);
        """,
    'create_airports_table': """
        CREATE TABLE IF NOT EXISTS airports (
          iata text,
          name text,
          city text,
          state text,
          country text,
          latitude text,
          longitude text
          );
        """,
    'create_flights_table': """
        CREATE TABLE IF NOT EXISTS flights (
            year text,
            month text,
            day text,
            day_of_week text,
            airline text,
            flight_number text,
            tail_number text,
            origin_airport text,
            destination_airport text,
            scheduled_departure text,
            departure_time text,
            departure_delay text,
            taxi_out text,
            wheels_off text,
            scheduled_time text,
            elapsed_time text,
            air_time text,
            distance text,
            wheels_on text,
            taxi_in text,
            scheduled_arrival text,
            arrival_time text,
            arrival_delay text,
            diverted text,
            cancelled text,
            cancellation_reason text,
            air_system_delay text,
            security_delay text,
            airline_delay text,
            late_aircraft_delay text,
            weather_delay text
        );
        """,
    'load_airlines_csv': f"""
        COPY airlines
        FROM '{current_path}/csv/airlines.csv' DELIMITER ',' HEADER CSV;
        """,
    'load_airports_csv': f"""
        COPY airports
        FROM '{current_path}/csv/airports.csv' DELIMITER ',' HEADER CSV;
        """,
    # head emits LINE_LIMIT + 1 lines: the header line plus LINE_LIMIT rows.
    'load_flights_csv': f"""
        COPY flights
        FROM PROGRAM 'head -{LINE_LIMIT+1} {current_path}/csv/flights.csv' DELIMITER ',' HEADER CSV;
        """
}

# query(0) performs only the single untimed execution of each statement.
for sql in init_queries.values():
    PostgresQuery(sql).query(0)

### Match queries
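Create the relationships that link each flight to its origin airport, destination airport, operating airline and aircraft tail number.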

In [6]:
# Cypher statements that connect the loaded nodes by comparing a Flight
# property with the matching property on the other node.
match_queries = {
    # (:Airport)-[:HAS_DEPARTURE]->(:Flight)
    'match_origin_airport': """
            MATCH (fl:Flight),(ap:Airport)
            WHERE fl.origin_airport = ap.iata
            CREATE (ap)-[r:HAS_DEPARTURE]->(fl)
            RETURN type(r)
            """,

    # (:Flight)-[:FLIES_TO]->(:Airport)
    'match_destination_airport': """
            MATCH (fl:Flight),(ap:Airport)
            WHERE fl.destination_airport = ap.iata
            CREATE (fl)-[:FLIES_TO]->(ap)
            """,
    # (:Flight)-[:OPERATED_BY]->(:Airline)
    'match_airline': """
            MATCH (fl:Flight),(al:Airline)
            WHERE fl.airline = al.iata
            CREATE (fl)-[:OPERATED_BY]->(al)
            """,
    # (:Flight)-[:USES]->(:Tail)
    'match_tail': """
            MATCH (fl:Flight),(tl:Tail)
            WHERE fl.tail_number = tl.number
            CREATE (fl)-[:USES]->(tl)
            """
}

# query(0) performs only the single untimed execution of each statement.
for cypher in match_queries.values():
    NeoQuery(cypher).query(0)

### Single table queries

In [7]:
# all flights out of ORD

nresults, ntime = NeoQuery(
    """
    MATCH (fl:Flight {origin_airport: 'ORD'})
    RETURN fl.flight_number
    """
).query(100)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights where origin_airport = 'ORD'
    """
).query(100)

# time and results: keep only the first column of every record/row

print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

2.8112205250000013
0.9193670300000001
['602', '1500', '409', '1167', '5498', '1012', '224', '977', '1256', '654', '5370', '5764', '1748', '729', '424', '1061', '296', '761', '725', '5299', '661', '4546', '1967', '731', '1070', '1524', '1001', '4498', '5622', '1440', '1610', '1620', '23', '5556', '1454', '6407', '6379', '6389', '709', '4401', '4993', '454', '1665', '4158', '815', '6378', '1161', '1688', '523', '1477', '792', '395', '5446', '549', '121', '6459', '245', '538', '3940', '6119', '5349', '5248', '382', '1272', '4284', '1684', '4712', '691', '761', '5447', '5626', '2259', '1185', '893', '195', '231', '632', '384', '2224', '3212', '1463', '139', '357', '6200', '2931', '3242', '1538', '3437', '1933', '1582', '4756', '6122', '2855', '5298', '1590', '1168', '810', '1551', '2485', '2839', '331', '2315', '2872', '3256', '3415', '906', '1437', '1534', '2813', '3138', '1703', '1169', '1412', '1759', '2246', '4722', '5904', '5694', '219', '2183', '2784', '3624', '5479', '4980', '5431',

[]

In [8]:
# all flights with a delay of more than 100 minutes

nresults, ntime = NeoQuery(
    """
    MATCH (fl:Flight)
    WHERE toInteger(fl.departure_delay) > 100
    RETURN DISTINCT fl.flight_number
    """
).query(100)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE CAST (departure_delay AS INTEGER) > 100
    """
).query(100)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

5.520864553000003
1.3462678709999985
['1057', '824', '5976', '5547', '247', '1205', '943', '6420', '1492', '1966', '2522', '3432', '1207', '595', '705', '1195', '531', '1109', '2393', '110', '5242', '1499', '2567', '240', '4978', '3047', '2413', '6283', '337', '6458', '395', '974', '416', '567', '992', '258', '5518', '164', '749', '4712', '6323', '1623', '442', '6279', '4992', '169', '2591', '56', '3200', '1590', '5387', '43', '5315', '3492', '1532', '1221', '570', '5561', '5529', '1422', '2055', '239', '597', '4671', '1410', '5365', '123', '4479', '2389', '6183', '1531', '452', '4619', '4556', '1683', '5160', '1189', '1218', '3274', '306', '5919', '2317', '510', '1081', '2072', '7', '5384', '4264', '2773', '1437', '4540', '1605', '3214', '170', '5502', '5163', '5578', '6161', '820', '1180', '2417', '5', '4706', '5565', '2470', '399', '4447', '6011', '5183', '1391', '1263', '785', '489', '690', '5421', '1697', '821', '5997', '5204', '4654', '551', '305', '5171', '398', '386', '221', '2

[]

### Using Relationships

In [9]:
# flights departing from Atlanta and arriving in Charlotte

nresults, ntime = NeoQuery(
    """
    MATCH (atl:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(clt:Airport {iata: 'CLT'})
    RETURN fl.flight_number
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE origin_airport = 'ATL' AND destination_airport = 'CLT'
    """
).query(200)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

1.104647173
2.1296228699999986
['889', '447', '1805', '1808', '1808', '461', '1744', '1999', '2052', '1441', '2065', '1095', '2133', '1076', '461', '2251', '1095', '2065', '658']


[]

In [10]:
# flights departing from Chicago and arriving in Los Angeles operated by United

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(ua:Airline {iata: 'UA'})
    RETURN fl.flight_number
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE origin_airport = 'ORD' AND destination_airport = 'LAX' AND airline = 'UA'
    """
).query(200)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

2.190588023
2.6001490329999974
['531', '1004', '661', '1168', '1256', '1058', '1037', '521', '1151', '1665', '1591', '661', '1037', '1760', '865']


[]

In [11]:
# flights departing from Chicago and arriving in Los Angeles operated by United with Tail Number N33209

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(ua:Airline {iata: 'UA'}),
          (fl)-[:USES]->(tl:Tail {number: 'N33209'})
    RETURN DISTINCT fl.flight_number
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT flight_number from flights WHERE origin_airport = 'ORD' AND destination_airport = 'LAX'
    AND airline = 'UA' AND tail_number = 'N33209'
    """
).query(200)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

3.345743089999999
2.7668735359999985
['1058']


[]

### Joining across tables

In [12]:
# get names of the airlines operating flights from Chicago to Los Angeles

nresults, ntime = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ORD'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(lax:Airport {iata: 'LAX'}),
          (fl)-[:OPERATED_BY]->(al:Airline)
    RETURN DISTINCT al.name
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT name from airlines INNER JOIN flights ON (airlines.iata = flights.airline)
    WHERE origin_airport = 'ORD' AND destination_airport = 'LAX'
    """
).query(200)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

1.7107797620000014
3.2483375779999974
['American Airlines Inc.', 'Spirit Air Lines', 'United Air Lines Inc.', 'Virgin America']


[]

In [20]:
# get the names of destination airports from all flights originating in Wyoming

nresults, ntime = NeoQuery(
    """
    MATCH (hi:Airport {state: 'WY'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(ap:Airport)
    RETURN DISTINCT ap.name
    """
).query(200)

presults, ptime = PostgresQuery(
    """
    SELECT name from airports
    JOIN flights ON (airports.iata = flights.destination_airport)
    where flights.origin_airport IN (SELECT iata from airports WHERE airports.state = 'WY')
    """
).query(200)

# keep only the first column of every record/row
print(ntime)
nresults = [record[0] for record in nresults]

print(ptime)
presults = [record[0] for record in presults]

print(nresults)

# Values returned by exactly one of the two databases; an empty list means
# both agree. A one-way difference (Postgres minus Neo4j) would not notice
# values that only Neo4j returns.
list(set(presults) ^ set(nresults))

0.3012527409999848
2.547119858999963
['San Francisco International Airport', 'Rock Springs-Sweetwater County Airport', 'Gillette-Campbell County Airport', 'Salt Lake City International Airport', "Chicago O'Hare International Airport", 'Los Angeles International Airport', 'Dallas/Fort Worth International Airport', 'Minneapolis-Saint Paul International Airport', 'Hartsfield-Jackson Atlanta International Airport', 'Newark Liberty International Airport', 'George Bush Intercontinental Airport', 'Denver International Airport']


[]

In [69]:
# get possible connection airports on a flight from Alabama to Florida
#
# Both scheduled times are HHMM strings, converted to minutes since midnight.
# The layover is the difference of those two values, so it is only meaningful
# when both legs fly on the same calendar date; the second leg is therefore
# pinned to the first leg's year, month and day.

nresults, ntime = NeoQuery(
    """
    MATCH (ap1:Airport {state: 'AL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(ap:Airport),
          (ap)-[:HAS_DEPARTURE]->(fl2:Flight)-[:FLIES_TO]->(ap2:Airport {state: 'FL'})
          
    WITH fl, fl2, ap, ap1, ap2,
         toInteger(left(fl.scheduled_arrival, 2)) * 60 + toInteger(right(fl.scheduled_arrival, 2)) as fl_arrival,
         toInteger(left(fl2.scheduled_departure, 2)) * 60 + toInteger(right(fl2.scheduled_departure, 2)) as fl2_departure  
    
    WHERE (fl2_departure - fl_arrival) > 60 AND (fl2_departure - fl_arrival) < 120 AND ap.state <> 'FL' AND fl.day = '1'
          AND fl2.year = fl.year AND fl2.month = fl.month AND fl2.day = fl.day
    
    RETURN DISTINCT ap1.name, fl.scheduled_departure, fl.scheduled_arrival, ap.name,
                    fl2.scheduled_departure, fl2.scheduled_arrival, ap2.name, (fl2_departure - fl_arrival) as layover_time
    """
).query(10)

print(ntime)
# Record columns follow the RETURN clause: 0 origin name, 1 leg-1 departure,
# 2 leg-1 arrival, 3 connection airport, 4 leg-2 departure, 5 leg-2 arrival,
# 6 destination name, 7 layover in minutes.
for record in nresults:
    print(f"""
    Flight departs from {record[0]} at {record[1]}.
    Arrival at {record[2]} in {record[3]}
    {record[7]} minutes layover time.
    Next leg departs at {record[4]} and lands at {record[5]} in {record[6]}
    """)

1.1603698410008292

    Flight departs from Birmingham-Shuttlesworth International Airport at 0645.
    Arrival at 0900 in Dallas/Fort Worth International Airport
    80 minutes layover time.
    Next leg departs at 1020 and lands at 1400 in Fort Lauderdale-Hollywood International Airport
    

    Flight departs from Birmingham-Shuttlesworth International Airport at 0645.
    Arrival at 0900 in Dallas/Fort Worth International Airport
    115 minutes layover time.
    Next leg departs at 1055 and lands at 1435 in Miami International Airport
    

    Flight departs from Birmingham-Shuttlesworth International Airport at 0645.
    Arrival at 0900 in Dallas/Fort Worth International Airport
    95 minutes layover time.
    Next leg departs at 1035 and lands at 1213 in Pensacola International Airport (Pensacola Gulf Coast Regional Airport)
    

    Flight departs from Birmingham-Shuttlesworth International Airport at 0645.
    Arrival at 0900 in Dallas/Fort Worth International Airport
    

In [14]:
# Flights leaving ATL that are operated by airline 'AA', timed over 500 runs.
aa_from_atl = NeoQuery(
    """
    MATCH (ord:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:OPERATED_BY]->(al:Airline {iata: 'AA'})
    RETURN fl
    """
)
results, time = aa_from_atl.query(500)

print(time)
# Pull the flight number out of each returned Flight node.
[flight['flight_number'] for flight in (record['fl'] for record in results)]

6.334895058999997


['314',
 '17',
 '356',
 '1513',
 '1652',
 '276',
 '1635',
 '1513',
 '1473',
 '194',
 '1455',
 '232',
 '1455',
 '349',
 '1635',
 '1652',
 '17',
 '125']

In [15]:
# Flights leaving ATL and landing at CLT, timed over 500 runs.
atl_to_clt = NeoQuery(
    """
    MATCH (:Airport {iata: 'ATL'})-[:HAS_DEPARTURE]->(fl:Flight)-[:FLIES_TO]->(ar:Airport {iata: 'CLT'})
    RETURN fl
    """
)
results, time = atl_to_clt.query(500)

print(time)
# Pull the flight number out of each returned Flight node.
[flight['flight_number'] for flight in (record['fl'] for record in results)]

4.48958657


['889',
 '447',
 '1805',
 '1808',
 '1808',
 '461',
 '1744',
 '1999',
 '2052',
 '1441',
 '2065',
 '1095',
 '2133',
 '1076',
 '461',
 '2251',
 '1095',
 '2065',
 '658']

Query for all flights with a specific tail number, matched through the `USES` relationship to a `Tail` node

In [16]:
# Flights reached through the USES relationship to Tail 'N129DL', timed over 200 runs.
by_tail_node = NeoQuery(
    """
    MATCH (ar:Airport)-[:HAS_DEPARTURE]->(fl:Flight)-[:USES]->(tl:Tail {number: 'N129DL'})
    RETURN distinct fl
    """
)
results, time = by_tail_node.query(200)

print(time)
list(results)

2.9167485919999976


[<Record fl=<Node id=466784 labels={'Flight'} properties={'wheels_off': '1422', 'taxi_in': '7', 'distance': '2139', 'year': '2015', 'scheduled_time': '269', 'scheduled_arrival': '2139', 'diverted': '0', 'taxi_out': '12', 'tail_number': 'N129DL', 'elapsed_time': '258', 'air_time': '239', 'airline': 'DL', 'departure_time': '1410', 'day': '1', 'departure_delay': '0', 'day_of_week': '4', 'arrival_time': '2128', 'wheels_on': '2121', 'flight_number': '61', 'origin_airport': 'SFO', 'destination_airport': 'ATL', 'month': '1', 'cancelled': '0', 'scheduled_departure': '1410', 'arrival_delay': '-11'}>>,
 <Record fl=<Node id=494253 labels={'Flight'} properties={'wheels_off': '0729', 'taxi_in': '7', 'distance': '1590', 'year': '2015', 'scheduled_time': '260', 'scheduled_arrival': '0940', 'diverted': '0', 'taxi_out': '11', 'tail_number': 'N129DL', 'elapsed_time': '238', 'air_time': '220', 'airline': 'DL', 'departure_time': '0718', 'day': '2', 'departure_delay': '-2', 'day_of_week': '5', 'arrival_tim

Query for all flights with a specific tail number by filtering on the `tail_number` property of `Flight` directly; this query is slower

In [17]:
# Flights selected by their own tail_number property ('N129DL'), timed over 200 runs.
by_tail_property = NeoQuery(
    """
    MATCH (ar:Airport)-[:HAS_DEPARTURE]->(fl:Flight {tail_number: 'N129DL'})
    RETURN fl
    """
)
results, time = by_tail_property.query(200)

print(time)
list(results)

9.947114249000009


[<Record fl=<Node id=460084 labels={'Flight'} properties={'wheels_off': '1100', 'taxi_in': '8', 'distance': '2139', 'year': '2015', 'scheduled_time': '330', 'scheduled_arrival': '1310', 'diverted': '0', 'taxi_out': '17', 'tail_number': 'N129DL', 'elapsed_time': '302', 'air_time': '277', 'airline': 'DL', 'departure_time': '1043', 'day': '1', 'departure_delay': '3', 'day_of_week': '4', 'arrival_time': '1245', 'wheels_on': '1237', 'flight_number': '1680', 'origin_airport': 'ATL', 'destination_airport': 'SFO', 'month': '1', 'cancelled': '0', 'scheduled_departure': '1040', 'arrival_delay': '-25'}>>,
 <Record fl=<Node id=494253 labels={'Flight'} properties={'wheels_off': '0729', 'taxi_in': '7', 'distance': '1590', 'year': '2015', 'scheduled_time': '260', 'scheduled_arrival': '0940', 'diverted': '0', 'taxi_out': '11', 'tail_number': 'N129DL', 'elapsed_time': '238', 'air_time': '220', 'airline': 'DL', 'departure_time': '0718', 'day': '2', 'departure_delay': '-2', 'day_of_week': '5', 'arrival_t