# 13. ETL Project

## Project Report

At the end of the week, your team will submit a Final Report that describes the following:

* **E**xtract: your original data sources and how the data was formatted (CSV, JSON, a PostgreSQL database accessed through pgAdmin 4, etc.).

* **T**ransform: what data cleaning or transformation was required.

* **L**oad: the final database and its tables/collections, and why each was chosen.

In [1]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

import pandas as pd

from config import pwd #pwd for mysql stored

In [2]:
# Connect to the MySQL database `transportation` on 127.0.0.1 as user root.
# The password is percent-encoded before it is placed in the URL so that
# characters such as '@', ':', '/' or '%' cannot corrupt the connection string.
from urllib.parse import quote

engine = create_engine(f"mysql://root:{quote(pwd, safe='')}@127.0.0.1/transportation")

In [3]:
# Create a schema inspector for the engine; a later cell reuses it to list columns.
inspector = inspect(engine)
# Last expression of the cell: the notebook displays the table names found in the database.
inspector.get_table_names()

['airlines', 'airports', 'flights_2015_01_01_300']

In [4]:
# Print the name and reflected SQL type of every column in the `airports` table.
columns = inspector.get_columns('airports')
for column in columns:
    print(column['name'], column['type'])

IATA_CODE TEXT
AIRPORT TEXT
CITY TEXT
STATE TEXT
COUNTRY TEXT
LATITUDE DOUBLE
LONGITUDE DOUBLE
ID INTEGER(11)


In [5]:
# Reflect the existing database tables into automapped ORM classes.
Base = automap_base()
# NOTE(review): `reflect=True` is deprecated since SQLAlchemy 1.4; the documented
# replacement is `Base.prepare(autoload_with=engine)`. Confirm the installed
# SQLAlchemy version before switching.
Base.prepare(engine, reflect=True)

In [6]:
# Bind the automapped class for table `flights_2015_01_01_300` to a shorter name used by the query cells.
Flights_2015 = Base.classes.flights_2015_01_01_300

In [7]:
# Bind the automapped class for table `airports` to the name used by the query cells.
Airports = Base.classes.airports

In [8]:
# Bind the automapped class for table `airlines` to the name used by the query cells.
Airlines = Base.classes.airlines

In [9]:
# Open an ORM session bound to the engine; the query cells below run through it.
# NOTE(review): no session.close() is visible in this part of the notebook — confirm it is closed later.
session = Session(engine)

In [10]:
# Preview the first five rows of the flights table as a DataFrame.
# LIMIT is applied in SQL so only those rows are fetched, instead of loading
# the whole table into pandas and discarding all but five.
pd.read_sql(session.query(Flights_2015).limit(5).statement, engine)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,ID
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,-22,0,0,,,,,,,1
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,-9,0,0,,,,,,,2
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,5,0,0,,,,,,,3
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,-9,0,0,,,,,,,4
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,-21,0,0,,,,,,,5


In [11]:
# Preview the first five rows of the airports table as a DataFrame.
# LIMIT is applied in SQL so only those rows are fetched, instead of loading
# the whole table into pandas and discarding all but five.
pd.read_sql(session.query(Airports).limit(5).statement, engine)

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,ID
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404,1
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819,2
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919,3
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183,4
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447,5


In [12]:
# Preview the first five rows of the airlines table as a DataFrame.
# LIMIT is applied in SQL so only those rows are fetched, instead of loading
# the whole table into pandas and discarding all but five.
pd.read_sql(session.query(Airlines).limit(5).statement, engine)

Unnamed: 0,IATA_CODE,AIRLINE,ID
0,UA,United Air Lines Inc.,1
1,AA,American Airlines Inc.,2
2,US,US Airways Inc.,3
3,F9,Frontier Airlines Inc.,4
4,B6,JetBlue Airways,5


# Flight Delay

In [42]:
# Average departure delay per airline, largest average first.
# Flights are matched to airlines on the carrier's IATA code.
# The displayed Airlines.AIRLINE column is included in GROUP BY so the
# statement is valid when MySQL runs with ONLY_FULL_GROUP_BY and the name
# shown for each group is deterministic. Grouping on the flight's carrier
# code as well keeps one result row per code (assumes IATA codes are unique
# in `airlines` — TODO confirm).
avg_delay = func.avg(Flights_2015.DEPARTURE_DELAY)
(
    session.query(Airlines.AIRLINE, avg_delay)
    .filter(Flights_2015.AIRLINE == Airlines.IATA_CODE)
    .group_by(Flights_2015.AIRLINE, Airlines.AIRLINE)
    .order_by(avg_delay.desc())
    .all()
)

[('Spirit Air Lines', Decimal('9.0000')),
 ('American Airlines Inc.', Decimal('8.1707')),
 ('Skywest Airlines Inc.', Decimal('7.2647')),
 ('United Air Lines Inc.', Decimal('7.1750')),
 ('Atlantic Southeast Airlines', Decimal('5.7778')),
 ('Delta Air Lines Inc.', Decimal('5.6216')),
 ('JetBlue Airways', Decimal('1.7273')),
 ('Frontier Airlines Inc.', Decimal('1.6000')),
 ('Southwest Airlines Co.', Decimal('0.3750')),
 ('US Airways Inc.', Decimal('-0.2353')),
 ('American Eagle Airlines Inc.', Decimal('-3.8000')),
 ('Hawaiian Airlines Inc.', Decimal('-6.4286')),
 ('Alaska Airlines Inc.', Decimal('-7.2632'))]

In [43]:
# Average departure delay per origin airport, largest average first.
# Flights are matched to airports on the origin airport's IATA code.
# The displayed Airports.AIRPORT column is included in GROUP BY so the
# statement is valid when MySQL runs with ONLY_FULL_GROUP_BY and the name
# shown for each group is deterministic. Grouping on the origin code as well
# keeps one result row per code (assumes IATA codes are unique in
# `airports` — TODO confirm).
avg_delay = func.avg(Flights_2015.DEPARTURE_DELAY)
(
    session.query(Airports.AIRPORT, avg_delay)
    .filter(Flights_2015.ORIGIN_AIRPORT == Airports.IATA_CODE)
    .group_by(Flights_2015.ORIGIN_AIRPORT, Airports.AIRPORT)
    .order_by(avg_delay.desc())
    .all()
)

[('Buffalo Niagara International Airport', Decimal('104.0000')),
 ('Atlantic City International Airport', Decimal('102.0000')),
 ('Savannah/Hilton Head International Airport', Decimal('86.0000')),
 ('McClellan-Palomar Airport', Decimal('54.0000')),
 ('Phoenix Sky Harbor International Airport', Decimal('47.5556')),
 ('Ontario International Airport', Decimal('40.6667')),
 ('Luis Muñoz Marín International Airport', Decimal('23.8571')),
 ('Boise Airport\xa0(Boise Air Terminal)', Decimal('22.0000')),
 ('Dallas/Fort Worth International Airport', Decimal('21.5000')),
 ('LaGuardia Airport (Marine Air Terminal)', Decimal('18.2500')),
 ('Hartsfield-Jackson Atlanta International Airport', Decimal('17.5000')),
 ('Cleveland Hopkins International Airport', Decimal('17.5000')),
 ('Baltimore-Washington International Airport', Decimal('17.0000')),
 ('Erie International Airport', Decimal('16.0000')),
 ('Denver International Airport', Decimal('15.3750')),
 ('George Bush Intercontinental Airport', Decimal

In [40]:
# Average arrival delay per airline, largest average first.
# Flights are matched to airlines on the carrier's IATA code.
# The displayed Airlines.AIRLINE column is included in GROUP BY so the
# statement is valid when MySQL runs with ONLY_FULL_GROUP_BY and the name
# shown for each group is deterministic. Grouping on the flight's carrier
# code as well keeps one result row per code (assumes IATA codes are unique
# in `airlines` — TODO confirm).
avg_delay = func.avg(Flights_2015.ARRIVAL_DELAY)
(
    session.query(Airlines.AIRLINE, avg_delay)
    .filter(Flights_2015.AIRLINE == Airlines.IATA_CODE)
    .group_by(Flights_2015.AIRLINE, Airlines.AIRLINE)
    .order_by(avg_delay.desc())
    .all()
)

[('Skywest Airlines Inc.', Decimal('8.6471')),
 ('Spirit Air Lines', Decimal('8.4118')),
 ('American Airlines Inc.', Decimal('6.2927')),
 ('Atlantic Southeast Airlines', Decimal('3.8889')),
 ('United Air Lines Inc.', Decimal('2.6500')),
 ('JetBlue Airways', Decimal('-0.8182')),
 ('US Airways Inc.', Decimal('-1.0000')),
 ('Hawaiian Airlines Inc.', Decimal('-2.7143')),
 ('Frontier Airlines Inc.', Decimal('-3.4000')),
 ('Delta Air Lines Inc.', Decimal('-6.1351')),
 ('American Eagle Airlines Inc.', Decimal('-6.4000')),
 ('Southwest Airlines Co.', Decimal('-11.5000')),
 ('Alaska Airlines Inc.', Decimal('-13.7895'))]

In [44]:
# Average arrival delay per destination airport, largest average first.
# Flights are matched to airports on the destination airport's IATA code.
# The displayed Airports.AIRPORT column is included in GROUP BY so the
# statement is valid when MySQL runs with ONLY_FULL_GROUP_BY and the name
# shown for each group is deterministic. Grouping on the destination code as
# well keeps one result row per code (assumes IATA codes are unique in
# `airports` — TODO confirm).
avg_delay = func.avg(Flights_2015.ARRIVAL_DELAY)
(
    session.query(Airports.AIRPORT, avg_delay)
    .filter(Flights_2015.DESTINATION_AIRPORT == Airports.IATA_CODE)
    .group_by(Flights_2015.DESTINATION_AIRPORT, Airports.AIRPORT)
    .order_by(avg_delay.desc())
    .all()
)

[('Southwest Florida International Airport', Decimal('86.0000')),
 ('Bradley International Airport', Decimal('49.0000')),
 ('Trenton Mercer Airport', Decimal('24.0000')),
 ('William P. Hobby Airport', Decimal('22.0000')),
 ('Luis Muñoz Marín International Airport', Decimal('19.0000')),
 ('Honolulu International Airport', Decimal('15.0000')),
 ('Los Angeles International Airport', Decimal('14.1000')),
 ('LaGuardia Airport (Marine Air Terminal)', Decimal('12.5000')),
 ('Denver International Airport', Decimal('12.4348')),
 ('Newark Liberty International Airport', Decimal('9.9333')),
 ('Miami International Airport', Decimal('9.5500')),
 ('George Bush Intercontinental Airport', Decimal('8.2083')),
 ('Kansas City International Airport', Decimal('6.0000')),
 ('Phoenix Sky Harbor International Airport', Decimal('5.7500')),
 ('Dallas/Fort Worth International Airport', Decimal('4.1818')),
 ('Orlando International Airport', Decimal('4.0000')),
 ('Salt Lake City International Airport', Decimal('0.