# ETL PROJECT by Huy and Soyoung

Object: To collect data around airports, airlines and the web and perform ETL into a usuable database

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect, func
from bs4 import BeautifulSoup as bs
from sqlalchemy.orm import Session
from sqlalchemy import Column, Integer, String, Float
import string
from pwd import pwd

ModuleNotFoundError: No module named 'pwd'

# Extraction (Part 1 - Airport details data)

In [None]:
# Airport Details data from https://www.kaggle.com/open-flights/airports-train-stations-and-ferry-terminals
csv_file = "airports-extended.csv"
airport_data_df = pd.read_csv(csv_file, encoding="latin-1")
airport_data_df.head()

# Transform (Part 1 - Airport details data)

In [None]:
# Cleaning up Airport Details data
NEW_airport_data_df = airport_data_df[['Facility Name','City', 'Country', 'IATA_Code', 'ICAO_Code', 'Lat', 'Lng','Alt(Feet)','Hours__from UTC', 'TimeZone','Type']].copy()
NEW_airport_data_df = NEW_airport_data_df.loc[NEW_airport_data_df['Type'] == 'airport']
NEW_airport_data_df = NEW_airport_data_df.loc[NEW_airport_data_df['Country'] == 'United States']
NEW_airport_data_df.head()

In [None]:
NEW_airport_data_df.dtypes

In [None]:
#NEW_airport_data_df['City'].map(lambda x: x.encode("utf-8"))

In [None]:
NEW_airport_data_df.head()

In [None]:
#still cleaning up Airport Details data
#NEW_airport_data_df['Facility Name'] = NEW_airport_data_df['Facility Name'].str.encode('utf-8')
#NEW_airport_data_df['City'] = NEW_airport_data_df['City'].str.encode('utf-8')
#NEW_airport_data_df['City'] = NEW_airport_data_df['City'].astype(str)
#NEW_airport_data_df['Country'] = NEW_airport_data_df['Country'].str.encode('utf-8')
#NEW_airport_data_df['IATA_Code'] = NEW_airport_data_df['IATA_Code'].str.encode('utf-8')
#NEW_airport_data_df['ICAO_Code'] = NEW_airport_data_df['ICAO_Code'].str.encode('utf-8')
NEW_airport_data_df['Lat'] = pd.to_numeric(NEW_airport_data_df['Lat'], errors="coerce")
NEW_airport_data_df['Lng'] = pd.to_numeric(NEW_airport_data_df['Lng'], errors="coerce")
NEW_airport_data_df['Alt(Feet)'] = pd.to_numeric(NEW_airport_data_df['Alt(Feet)'], errors="coerce")
NEW_airport_data_df['Hours__from UTC'] = pd.to_numeric(NEW_airport_data_df['Hours__from UTC'], errors="coerce")
NEW_airport_data_df = NEW_airport_data_df.replace(r'\\N','', regex=True) 

#Export new clean data to CSV
NEW_airport_data_df.to_csv('airport_details_clean.csv', index=False)


# Load (Part 1 - Airport details data)

In [None]:
engine = create_engine(f'mysql+mysqldb://root:{pwd}@127.0.0.1/flights_db')

In [None]:
# Confirm tables
#engine.table_names()
# Inspect
inspector = inspect(engine)
inspector.get_table_names()

In [None]:
NEW_airport_data_df.to_sql(name='airport_details', con=engine, if_exists='append', index=False)

# Extraction (Part 2 - Scraping)

In [2]:
from bs4 import BeautifulSoup
import requests
import time
from splinter import Browser
from datetime import datetime
import os.path


In [3]:
browser = Browser("chrome", executable_path="chromedriver", headless=False)

url = "https://www.kayak.com/flights/SFO-SGN/2019-12-20/2020-01-04?sort=bestflight_a"
browser.visit(url)
time.sleep(35)

html = browser.html
soup = BeautifulSoup(html, "html.parser")

In [4]:
# Create container object of information to scrape
resultInner = soup.find_all("div",class_="resultInner")

In [53]:
if os.path.exists('trips.csv'):
    trips_file_df = pd.read_csv('trips.csv')
    
    
    #
    dep_air = []
    ret_air =[]
    dep_time = []
    cost = []
    durat = []
    Date = []
    
    for container in resultInner:
        departure = container.findAll("div",{"class":"bottom"})
        departure_airline = departure[0].text
        return_airline = departure[3].text
        dep_air.append(departure_airline)
        ret_air.append(return_airline)

        departure_time_container = container.findAll("span",{"class":"time-pair"})
        departure_time = departure_time_container[0].text.strip()
        departure_time = str.replace(departure_time,'\n','')
        dep_time.append(departure_time)

        price_container = container.findAll("span",{"class":"price option-text"})
        price = price_container[0].text
        price = str.replace(price, '\n','')
        price = str.replace(price, '$','')
        cost.append(price)

        duration_container = container.findAll("div",{"class":"top"})
        duration_time = duration_container[2].text
        durat.append(duration_time)

        date = datetime.today().strftime('%Y-%m-%d %I:%M %p')
        Date.append(date)
    
    trips_dict = {'Departure_Airline':dep_air,
              'Return_Airline':ret_air,
              'Departure_time':dep_time,
              'Price':cost,
              'Duration':durat,
              'Date_Checked':Date
             }
    trips_df = pd.DataFrame.from_dict(trips_dict)
    trips_merged_df = pd.concat([trips_file_df,trips_df], axis=0, sort = True)
    col =['Departure_Airline','Return_Airline','Departure_time','Duration','Price','Date_Checked']
    
    trips_merged_df[col].to_csv('trips.csv',index=False)
else:
    # Create file to write to
    filename = "trips.csv"
    f = open(filename, "w")

    headers = "Departure_Airline,Return_Airline,Departure_time,Duration,Price,Date_Checked\n"
    f.write(headers)
    
    #Create dataframe to load results

    dep_air = []
    ret_air =[]
    dep_time = []
    cost = []
    durat = []
    Date = []
    
    for container in resultInner:
        departure = container.findAll("div",{"class":"bottom"})
        departure_airline = departure[0].text
        return_airline = departure[3].text
        dep_air.append(departure_airline)
        ret_air.append(return_airline)

        departure_time_container = container.findAll("span",{"class":"time-pair"})
        departure_time = departure_time_container[0].text.strip()
        departure_time = str.replace(departure_time,'\n','')
        dep_time.append(departure_time)

        price_container = container.findAll("span",{"class":"price option-text"})
        price = price_container[0].text
        price = str.replace(price, '\n','')
        price = str.replace(price, '$','')
        cost.append(price)

        duration_container = container.findAll("div",{"class":"top"})
        duration_time = duration_container[2].text
        durat.append(duration_time)

        date = datetime.today().strftime('%Y-%m-%d %I:%M %p')
        Date.append(date)

        f.write(departure_airline +  "," + return_airline + "," + departure_time + "," + duration_time + "," + price + "," + date + "\n" )

    f.close()
    trips_dict = {'Departure_Airline':dep_air,
              'Return_Airline':ret_air,
              'Departure_time':dep_time,
              'Price':cost,
              'Duration':durat,
              'Date_Checked':Date
             }
    trips_df = pd.DataFrame.from_dict(trips_dict)

Departure_Airline    object
Return_Airline       object
Departure_time       object
Price                object
Duration             object
Date_Checked         object
dtype: object

In [48]:
trips_merged_df

Unnamed: 0,Date_Checked,Departure_Airline,Departure_time,Depature_time,Duration,Price,Return_Airline
0,2019-05-09 03:28 PM,China Airlines,,12:05 am,18h 55m,1523,China Airlines
1,2019-05-09 03:28 PM,Japan Airlines,,8:30 am,30h 20m,1353,EVA Air
2,2019-05-09 03:28 PM,China Airlines,,12:05 am,18h 55m,1520,China Airlines
3,2019-05-09 03:28 PM,China Airlines,,12:05 am,18h 55m,1520,China Airlines
4,2019-05-09 03:28 PM,China Airlines,,12:05 am,19h 20m,1562,China Airlines
5,2019-05-09 03:28 PM,Korean Air,,11:30 pm,22h 20m,1406,Korean Air
6,2019-05-09 03:28 PM,Vietnam Airlines,,12:05 am,19h 20m,1514,Vietnam Airlines
7,2019-05-09 03:28 PM,Korean Air,,11:30 am,20h 15m,1535,Korean Air
8,2019-05-09 03:28 PM,EVA Air,,12:20 pm,18h 40m,1583,EVA Air
9,2019-05-09 03:28 PM,EVA Air,,12:20 pm,18h 40m,1583,EVA Air


In [None]:
trips_merged_df = pd.concat([trips_file_df,trips_df], axis=1, sort = True)
cols = ['Departure_Airline','Return_Airline','Departure_time','Duration','Price','Date_Checked']
trips_merged_df=trips_merged_df.reindex(columns=cols)
trips_merged_df

In [25]:
print(len(trips_file_df))
print(len(trips_df))
print(len(trips_merged_df))




17
17
17


# Transform (part 2 - Scraping)


In [None]:
#loop through html container
for container in resultInner:
    departure = container.findAll("div",{"class":"bottom"})
    departure_airline = departure[0].text
    return_airline = departure[3].text
    dep_air.append(departure_airline)
    ret_air.append(return_airline)
    
    departure_time_container = container.findAll("span",{"class":"time-pair"})
    departure_time = departure_time_container[0].text.strip()
    departure_time = str.replace(departure_time,'\n','')
    dep_time.append(departure_time)
    
    price_container = container.findAll("span",{"class":"price option-text"})
    price = price_container[0].text
    price = str.replace(price, '\n','')
    price = str.replace(price, '$','')
    cost.append(price)
    
    duration_container = container.findAll("div",{"class":"top"})
    duration_time = duration_container[2].text
    durat.append(duration_time)
    
    date = datetime.today().strftime('%Y-%m-%d %H:%M %p')
    Date.append(date)
    
    f.write(departure_airline +  "," + return_airline + "," + departure_time + "," + duration_time + "," + price + "," + date + "\n" )
    
f.close()
    


In [None]:
    trips_dict = {'Departure_Airline':dep_air,
              'Return_Airline':ret_air,
              'Departure_time':dep_time,
              'Price':cost,
              'Duration':durat,
              'Date_Checked':Date
             }

In [None]:
len("trips.csv")

In [None]:
trips_df = pd.DataFrame.from_dict(trips_dict)

In [None]:
trips_df['Price'] = trips_df['Price'].astype(float)
trips_df

# Load (part 2 - Scraping)

In [None]:
engine = create_engine(f'mysql+mysqldb://root:{pwd}@127.0.0.1/flights_db')

In [None]:
#Load scaped trips df into flights_db
trips_df.to_sql(name='trips_scraped', con=engine, if_exists='append', index=False)

# Extraction (Part 3 - Airport details data)


In [None]:
#Uploads flight delay data
#upload airline Data
