# ETL PROJECT by Huy and Soyoung

Object: To collect data around airports, airlines and the web and perform ETL into a usuable database

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect, func
from bs4 import BeautifulSoup as bs
from sqlalchemy.orm import Session
from sqlalchemy import Column, Integer, String, Float
import string
from pwd import pwd

# Extraction (Part 1 - Airport details data)

In [2]:
# Airport Details data from https://www.kaggle.com/open-flights/airports-train-stations-and-ferry-terminals
csv_file = "airports-extended.csv"
airport_data_df = pd.read_csv(csv_file, encoding="latin-1")
airport_data_df.head()

Unnamed: 0,ID,Facility Name,City,Country,IATA_Code,ICAO_Code,Lat,Lng,Alt(Feet),Hours__from UTC,DLT,TimeZone,Type,Source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


# Transform (Part 1 - Airport details data)

In [3]:
# Cleaning up Airport Details data
NEW_airport_data_df = airport_data_df[['Facility Name','City', 'Country', 'IATA_Code', 'ICAO_Code', 'Lat', 'Lng','Alt(Feet)','Hours__from UTC', 'TimeZone','Type']].copy()
NEW_airport_data_df = NEW_airport_data_df.loc[NEW_airport_data_df['Type'] == 'airport']
NEW_airport_data_df = NEW_airport_data_df.loc[NEW_airport_data_df['Country'] == 'United States']
NEW_airport_data_df.head()

Unnamed: 0,Facility Name,City,Country,IATA_Code,ICAO_Code,Lat,Lng,Alt(Feet),Hours__from UTC,TimeZone,Type
3223,Barter Island LRRS Airport,Barter Island,United States,BTI,PABA,70.134003,-143.582001,2,-9,America/Anchorage,airport
3225,Cape Lisburne LRRS Airport,Cape Lisburne,United States,LUR,PALU,68.875099,-166.110001,16,-9,America/Anchorage,airport
3226,Point Lay LRRS Airport,Point Lay,United States,PIZ,PPIZ,69.732903,-163.005005,22,-9,America/Anchorage,airport
3227,Hilo International Airport,Hilo,United States,ITO,PHTO,19.721399,-155.048004,38,-10,Pacific/Honolulu,airport
3228,Orlando Executive Airport,Orlando,United States,ORL,KORL,28.5455,-81.332901,113,-5,America/New_York,airport


In [4]:
NEW_airport_data_df.dtypes

Facility Name       object
City                object
Country             object
IATA_Code           object
ICAO_Code           object
Lat                float64
Lng                float64
Alt(Feet)            int64
Hours__from UTC     object
TimeZone            object
Type                object
dtype: object

In [5]:
#NEW_airport_data_df['City'].map(lambda x: x.encode("utf-8"))

In [6]:
NEW_airport_data_df.head()

Unnamed: 0,Facility Name,City,Country,IATA_Code,ICAO_Code,Lat,Lng,Alt(Feet),Hours__from UTC,TimeZone,Type
3223,Barter Island LRRS Airport,Barter Island,United States,BTI,PABA,70.134003,-143.582001,2,-9,America/Anchorage,airport
3225,Cape Lisburne LRRS Airport,Cape Lisburne,United States,LUR,PALU,68.875099,-166.110001,16,-9,America/Anchorage,airport
3226,Point Lay LRRS Airport,Point Lay,United States,PIZ,PPIZ,69.732903,-163.005005,22,-9,America/Anchorage,airport
3227,Hilo International Airport,Hilo,United States,ITO,PHTO,19.721399,-155.048004,38,-10,Pacific/Honolulu,airport
3228,Orlando Executive Airport,Orlando,United States,ORL,KORL,28.5455,-81.332901,113,-5,America/New_York,airport


In [7]:
#still cleaning up Airport Details data
#NEW_airport_data_df['Facility Name'] = NEW_airport_data_df['Facility Name'].str.encode('utf-8')
#NEW_airport_data_df['City'] = NEW_airport_data_df['City'].str.encode('utf-8')
#NEW_airport_data_df['City'] = NEW_airport_data_df['City'].astype(str)
#NEW_airport_data_df['Country'] = NEW_airport_data_df['Country'].str.encode('utf-8')
#NEW_airport_data_df['IATA_Code'] = NEW_airport_data_df['IATA_Code'].str.encode('utf-8')
#NEW_airport_data_df['ICAO_Code'] = NEW_airport_data_df['ICAO_Code'].str.encode('utf-8')
NEW_airport_data_df['Lat'] = pd.to_numeric(NEW_airport_data_df['Lat'], errors="coerce")
NEW_airport_data_df['Lng'] = pd.to_numeric(NEW_airport_data_df['Lng'], errors="coerce")
NEW_airport_data_df['Alt(Feet)'] = pd.to_numeric(NEW_airport_data_df['Alt(Feet)'], errors="coerce")
NEW_airport_data_df['Hours__from UTC'] = pd.to_numeric(NEW_airport_data_df['Hours__from UTC'], errors="coerce")
NEW_airport_data_df = NEW_airport_data_df.replace(r'\\N','', regex=True) 

#Export new clean data to CSV
NEW_airport_data_df.to_csv('airport_details_clean.csv', index=False)


# Load (Part 1 - Airport details data)

In [8]:
engine = create_engine(f'mysql+mysqldb://root:{pwd}@127.0.0.1/flights_db')

In [9]:
# Confirm tables
#engine.table_names()
# Inspect
inspector = inspect(engine)
inspector.get_table_names()

['airlines', 'airport_details', 'flight_history_2015', 'trips_scraped']

In [10]:
NEW_airport_data_df.to_sql(name='airport_details', con=engine, if_exists='append', index=False)

# Extraction (Part 2 - Scraping)

In [11]:
from bs4 import BeautifulSoup
import requests
import time
from splinter import Browser
from datetime import datetime


In [12]:
browser = Browser("chrome", executable_path="chromedriver", headless=False)

url = "https://www.kayak.com/flights/SFO-SGN/2019-12-20/2020-01-04?sort=bestflight_a"
browser.visit(url)
time.sleep(35)

html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [13]:
# Create container object of information to scrape
resultInner = soup.find_all("div",class_="resultInner")

In [14]:
# Create file to write to
filename = "trips.csv"
f = open(filename, "w")

headers = "Departure_Airline, Return_Airline, Depature_time, Duration, Price, Date_Checked\n"
f.write(headers)

#Create dataframe to load results

dep_air = []
ret_air =[]
dep_time = []
cost = []
durat = []
Date = []



# Transform (part 2 - Scraping)


In [15]:
#loop through html container
for container in resultInner:
    departure = container.findAll("div",{"class":"bottom"})
    departure_airline = departure[0].text
    return_airline = departure[3].text
    dep_air.append(departure_airline)
    ret_air.append(return_airline)
    
    departure_time_container = container.findAll("span",{"class":"time-pair"})
    departure_time = departure_time_container[0].text.strip()
    departure_time = str.replace(departure_time,'\n','')
    dep_time.append(departure_time)
    
    price_container = container.findAll("span",{"class":"price option-text"})
    price = price_container[0].text
    price = str.replace(price, '\n','')
    price = str.replace(price, '$','')
    cost.append(price)
    
    duration_container = container.findAll("div",{"class":"top"})
    duration_time = duration_container[2].text
    durat.append(duration_time)
    
    date = datetime.today().strftime('%Y-%m-%d, %H:%M %p')
    Date.append(date)
    
    f.write(departure_airline +  "," + return_airline + "," + departure_time + "," + duration_time + "," + price + "," + date + "\n" )
    
f.close()
    


In [16]:
    trips_dict = {'Departure_Airline':dep_air,
              'Return_Airline':ret_air,
              'Departure_time':dep_time,
              'Price':cost,
              'Duration':durat,
              'Date_Checked':Date
             }

In [21]:
len("trips.csv")

9

In [18]:
trips_df = pd.DataFrame.from_dict(trips_dict)

In [19]:
trips_df['Price'] = trips_df['Price'].astype(float)
trips_df

Unnamed: 0,Departure_Airline,Return_Airline,Departure_time,Price,Duration,Date_Checked
0,Multiple Airlines,Cathay Pacific,12:20 pm,1712.0,18h 40m,"2019-05-09, 12:16 PM"
1,Japan Airlines,EVA Air,8:30 am,1353.0,30h 20m,"2019-05-09, 12:16 PM"
2,China Airlines,China Airlines,12:05 am,1520.0,18h 55m,"2019-05-09, 12:16 PM"
3,Vietnam Airlines,Vietnam Airlines,12:05 am,1514.0,19h 20m,"2019-05-09, 12:16 PM"
4,China Airlines,China Airlines,12:05 am,1520.0,18h 55m,"2019-05-09, 12:16 PM"
5,China Airlines,China Airlines,12:05 am,1562.0,19h 20m,"2019-05-09, 12:16 PM"
6,Korean Air,Korean Air,11:30 am,1554.0,20h 15m,"2019-05-09, 12:16 PM"
7,EVA Air,EVA Air,12:20 pm,1583.0,18h 40m,"2019-05-09, 12:16 PM"
8,EVA Air,EVA Air,12:20 pm,1583.0,18h 40m,"2019-05-09, 12:16 PM"
9,China Airlines,Cathay Pacific,12:05 am,1671.0,18h 55m,"2019-05-09, 12:16 PM"


# Load (part 2 - Scraping)

In [None]:
engine = create_engine(f'mysql+mysqldb://root:{pwd}@127.0.0.1/flights_db')

In [None]:
#Load scaped trips df into flights_db
trips_df.to_sql(name='trips_scraped', con=engine, if_exists='append', index=False)

# Extraction (Part 3 - Airport details data)


In [None]:
#Uploads flight delay data
#upload airline Data
