# Airfare Scraping
This notebook scrapes Google Flights search results and cleans them into a Kaggle-ready dataset for advanced analysis of airline flight information.

Resources:
1. Headless Selenium testing with Python and PhantomJS: <br>
https://realpython.com/blog/python/headless-selenium-testing-with-python-and-phantomjs/
2. Setting PhantomJS user agent string: <br>
https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
3. Another helpful reference for airfare scraping: <br>
https://github.com/hakanmhmd/air-fare-scraper/blob/master/flight_price_scrape.ipynb

### Dependencies

In [1]:
!pip install selenium



In [2]:
!brew install PhantomJS



In [3]:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import requests
import bs4 as bs
import pandas as pd

### Core Scraping Function

In [4]:
def _search_params(google_flights_url):
    '''
    Splits the ";"-separated "key=value" pairs that follow "#" in a
    Google Flights url into a dict (first occurrence of a key wins).

    Input:
        google_flights_url (string): url such as
            https://www.google.com/flights/#search;f=SFO;t=EWR;d=2018-04-01
    Output:
        params (dict): e.g. {'f': 'SFO', 't': 'EWR', 'd': '2018-04-01'};
            empty when the url has no "#" fragment
    '''
    fragment = google_flights_url.partition('#')[2]
    params = {}
    for piece in fragment.split(';'):
        key, separator, value = piece.partition('=')
        # pieces without "=" (like the leading "search") carry no value
        if separator:
            params.setdefault(key, value)
    return params


def google_flights_to_pandas(google_flights_url):
    '''
    Creates a table with columns for airline, departure/arrival city,
    departure/arrival time, price, duration, number of stops (or nonstop)

    Input:
        google_flights_url (string): Google Flights url to scrape (see example below)
    Example Input:
        https://www.google.com/flights/#search;f=SFO;t=EWR;d=2018-04-01;tt=o;a=UA;s=0
        f = from this airport (ex. SFO)
        t = to this airport (ex. EWR)
        d = date of flight (ex. yyyy-mm-dd)
        tt = travel type (ex. o for one-way, m for multi-city)
        a = airline (ex. UA for United)
        s = # of stops (ex. 0 for nonstop)
    Output:
        google_flights_pd (pandas DataFrame): one row per scraped flight with
            columns Airline, Departure City, Arrival City, Date,
            Departure Time, Arrival Time, Price, Duration, Stops.
            All values are strings. Rows are built with zip(), so the table
            is truncated to the shortest scraped column.
    Raises:
        AssertionError: if the url does not answer with HTTP status 200
        IndexError: if a scraped time cell does not contain ' – '
    '''
    # Check the url that was passed in (not a global), and raise explicitly
    # so the check is not stripped when Python runs with -O.
    status_code = requests.get(google_flights_url).status_code
    if status_code != 200:
        raise AssertionError(
            'Request to {} returned HTTP status {}'.format(google_flights_url, status_code))

    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94")

    # NOTE(review): PhantomJS support is deprecated in newer Selenium
    # releases -- confirm the installed version still ships webdriver.PhantomJS.
    driver = webdriver.PhantomJS(desired_capabilities=dcap)
    try:
        driver.get(google_flights_url)
        page_source = driver.page_source
    finally:
        # always shut the headless browser down, otherwise every call
        # leaves a PhantomJS process running
        driver.quit()

    # parse the rendered page; 'lxml' is the parser backend
    soup = bs.BeautifulSoup(page_source, 'lxml')

    def div_texts(css_class):
        # text of every <div class="css_class"> in document order
        return [div.text for div in soup.find_all('div', class_=css_class)]

    # the airline is divided by <div class="LJV2HGB-d-j">
    airlines = div_texts('LJV2HGB-d-j')
    size = len(airlines)

    # departure/arrival city and date come from the url, not the page, so
    # they are identical for every row. Slices keep the original widths:
    # 3 characters for an airport code, 10 for yyyy-mm-dd.
    params = _search_params(google_flights_url)
    departure_city = [params.get('f', '')[:3]] * size
    arrival_city = [params.get('t', '')[:3]] * size
    date = [params.get('d', '')[:10]] * size

    # time of departure/arrival is divided by <div class="LJV2HGB-d-Zb">
    # and reads "<departure> – <arrival>"
    departure_times = []
    arrival_times = []
    for time_range in div_texts('LJV2HGB-d-Zb'):
        endpoints = time_range.split(' – ')
        departure_times.append(endpoints[0])
        arrival_times.append(endpoints[1])

    # the price is between <div class="LJV2HGB-d-Ab">; drop "$" and the
    # thousands separator so only the digits remain
    prices = [price.replace('$', '').replace(',', '')
              for price in div_texts('LJV2HGB-d-Ab')]

    # length of the flight is divided by <div class="LJV2HGB-d-E">
    duration = div_texts('LJV2HGB-d-E')

    # the number of stops is divided by <div class="LJV2HGB-d-Qb">
    stops = div_texts('LJV2HGB-d-Qb')

    rows = list(zip(airlines, departure_city, arrival_city, date,
                    departure_times, arrival_times, prices, duration, stops))
    return pd.DataFrame(rows,
                        columns=['Airline', 'Departure City', 'Arrival City', 'Date',
                                 'Departure Time', 'Arrival Time', 'Price', 'Duration', 'Stops'])

### Choose the Google Flights Links to Scrape

In [5]:
# One Google Flights search url per route to scrape
url_list = [
    "https://www.google.com/flights/#search;f=SFO;t=EWR;d=2018-04-01;tt=o;a=UA;s=0",
    "https://www.google.com/flights/#search;f=SFO;t=HNL;d=2018-04-01;tt=o;a=UA;s=0",
]

# Scrape each url into its own DataFrame, keeping the order of url_list
df_list = []
for url in url_list:
    scraped = google_flights_to_pandas(url)
    df_list.append(scraped)

### Saving as CSV

In [6]:
# Write every scraped table to "<n>.csv" in the working directory,
# numbering the files from 1 in the order of df_list
for i, df in enumerate(df_list):
    file_path = str(i + 1) + ".csv"
    df.to_csv(file_path)