In [1]:
# Reference: https://github.com/hakanmhmd/air-fare-scraper/blob/master/flight_price_scrape.ipynb

In [2]:
!pip install selenium

Collecting selenium
  Using cached selenium-3.7.0-py2.py3-none-any.whl
Installing collected packages: selenium
Successfully installed selenium-3.7.0


In [1]:
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import requests
import bs4 as bs
import pandas as pd

In [7]:
# Search parameters encoded in the Google Flights URL fragment:
#   f  = origin airport (ex. SFO)
#   t  = destination airport (ex. JFK)
#   d  = date of flight (yyyy-mm-dd)
#   tt = travel type (o for one-way, m for multi-city)
#   a  = airline (ex. UA for United)
#   s  = number of stops (0 for nonstop)

# Built from adjacent string literals instead of a triple-quoted block so the
# URL does not carry a leading and trailing newline character.
google_flights_url = (
    'https://www.google.com/flights/#search;'
    'f=SFO;t=EWR;d=2018-04-01;tt=o;a=UA;s=0'
)

# Quick reachability check; this should return <Response [200]>.
# The timeout keeps the cell from hanging forever if the host never answers.
requests.get(google_flights_url, timeout=30)

<Response [200]>

In [8]:
# Headless browsing through PhantomJS with a desktop Chrome user-agent string.
# References:
#   https://realpython.com/blog/python/headless-selenium-testing-with-python-and-phantomjs/
#   https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
# If you have trouble getting phantomjs set up (adding it to PATH), see:
#   https://apple.stackexchange.com/questions/41542/adding-a-new-executable-to-the-path-environment-variable
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap.update({
    "phantomjs.page.settings.userAgent":
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/62.0.3202.94",
})

# Start the browser with those capabilities and load the search page
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.get(google_flights_url)

In [9]:
# Hand the page source held by the browser to BeautifulSoup, which parses
# HTML so that specific pieces of information can be extracted from it.
# `features` chooses the parser; 'lxml' selects an HTML parser.
soup = bs.BeautifulSoup(markup=driver.page_source, features='lxml')

In [10]:
# If you click into the printed text and then CTRL+F "$2",
# you can see that it has the prices so the webscraping works
#print(soup.prettify())

In [11]:
# Each price is the text of a <div class="LJV2HGB-d-Ab">;
# the '$' and ',' characters are removed from it.
prices = [
    tag.text.replace('$', '').replace(',', '')
    for tag in soup.findAll('div', 'LJV2HGB-d-Ab')
]

prices

[]

In [12]:
# Departure times are held in <div class="LJV2HGB-d-Zb">;
# print the text of every match, one per line.
times = soup.findAll(name='div', attrs='LJV2HGB-d-Zb')
for time_tag in times:
    print(time_tag.text)

In [13]:
# Flight durations are the <div class="LJV2HGB-d-E"> elements
length = soup.findAll(name='div', attrs='LJV2HGB-d-E')
length

[<div class="LJV2HGB-d-E">5h 22m</div>,
 <div class="LJV2HGB-d-E">5h 15m</div>,
 <div class="LJV2HGB-d-E">5h 20m</div>,
 <div class="LJV2HGB-d-E">5h 22m</div>,
 <div class="LJV2HGB-d-E">5h 18m</div>,
 <div class="LJV2HGB-d-E">5h 16m</div>,
 <div class="LJV2HGB-d-E">5h 27m</div>,
 <div class="LJV2HGB-d-E">5h 21m</div>,
 <div class="LJV2HGB-d-E">5h 30m</div>,
 <div class="LJV2HGB-d-E">5h 17m</div>]

In [14]:
# The number of stops is given by the <div class="LJV2HGB-d-Qb"> elements
stops = soup.findAll(name='div', attrs='LJV2HGB-d-Qb')
stops

[<div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>,
 <div class="LJV2HGB-d-Qb">Nonstop</div>]

In [31]:
# The airline name is given by the <div class="LJV2HGB-d-j"> elements
airline = soup.findAll(name='div', attrs='LJV2HGB-d-j')
airline

[<div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>,
 <div class="LJV2HGB-d-j"><span>United</span></div>]

In [38]:
# With the relevant elements located, the remaining work is to clean the
# scraped output automatically and collect it into a DataFrame.
# Start with the airline names: keep only the text of each matching <div>.
airline = [tag.text for tag in soup.findAll('div', 'LJV2HGB-d-j')]

airline

['United',
 'United',
 'United',
 'United',
 'United',
 'United',
 'United',
 'United',
 'United',
 'United']

In [52]:
# TODO: split the departure and arrival times into separate lists.
# TODO: add columns for the departure and arrival airports; this will be
# handled once the list of airports we want to use has been made.
def table(soup,
          airline_class='LJV2HGB-d-j',
          price_class='LJV2HGB-d-Ab',
          duration_class='LJV2HGB-d-E',
          stops_class='LJV2HGB-d-Qb'):
    """Build a DataFrame of flights from a parsed search-results page.

    Parameters
    ----------
    soup : object
        Parsed page exposing ``findAll(tag, css_class)`` (for example a
        BeautifulSoup instance); each match must have a ``.text`` attribute.
    airline_class, price_class, duration_class, stops_class : str, optional
        CSS class of the ``<div>`` that holds each field. The defaults are
        the class names this notebook was written against; pass new values
        if the page markup uses different ones.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Airline'``, ``'Price'``, ``'Duration'`` and ``'Stops'``,
        one row per flight, all values as strings. ``'$'`` and ``','`` are
        removed from the price text. Departure and arrival times are not
        included.

    Warns
    -----
    UserWarning
        If the four fields were not found the same number of times. Rows are
        still paired positionally and cut to the shortest list, so the
        result may be incomplete or misaligned.
    """
    import warnings

    def _texts(css_class):
        # Text of every <div> carrying the given CSS class, in page order.
        return [div.text for div in soup.findAll('div', css_class)]

    airlines = _texts(airline_class)
    prices = [raw.replace('$', '').replace(',', '')
              for raw in _texts(price_class)]
    duration = _texts(duration_class)
    stops = _texts(stops_class)

    columns = ['Airline', 'Price', 'Duration', 'Stops']
    scraped = [airlines, prices, duration, stops]
    counts = [len(values) for values in scraped]

    # zip() stops at the shortest list; surface that instead of silently
    # returning a shorter (possibly empty) table.
    if len(set(counts)) > 1:
        warnings.warn(
            'Scraped fields have different number of entries ({}); '
            'the table is truncated to the shortest one.'.format(
                ', '.join('{}={}'.format(name, count)
                          for name, count in zip(columns, counts))))

    tbl = pd.DataFrame(list(zip(*scraped)), columns=columns)

    return tbl
   
    

In [51]:
# Build the flights table from the parsed page and display it
table(soup)

Unnamed: 0,Airline,Price,Duration,Stops
0,United,240,5h 22m,Nonstop
1,United,240,5h 15m,Nonstop
2,United,240,5h 20m,Nonstop
3,United,265,5h 22m,Nonstop
4,United,307,5h 18m,Nonstop
5,United,342,5h 16m,Nonstop
6,United,427,5h 27m,Nonstop
7,United,449,5h 21m,Nonstop
8,United,554,5h 30m,Nonstop
9,United,554,5h 17m,Nonstop
