# Setup

In [None]:
# Launch from a terminal with: jupyter notebook

In [33]:
import time
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlparse
import datetime
import json
import pandas as pd
import numpy as np
import os

# Configure driver

In [34]:
## Headless Chrome configuration shared by every scrape
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

options = webdriver.ChromeOptions()
options.headless = True
# Command-line switches are registered in the order listed here.
for chrome_flag in (
    f'user-agent={user_agent}',
    "--window-size=1920,1080",
    '--ignore-certificate-errors',
    '--allow-running-insecure-content',
    "--disable-extensions",
    "--proxy-server='direct://'",
    "--proxy-bypass-list=*",
    "--start-maximized",
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
):
    options.add_argument(chrome_flag)

## Search dates as zero-padded strings: d* = departure, r* = return
dyear, dmonth, ddate = '2023', '03', '01'
ryear, rmonth, rdate = '2023', '03', '15'

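## Raw search URLs captured from the browser. They still carry the dates of the
## original search (Expedia: 2023/02/05 and 2023/02/19; Skyscanner: 230205 only,
## with rtn=0); update_url() substitutes the configured dates before fetching.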
expedia_url = 'https://www.expedia.com.hk/Flights-Search?journeysContinuationId=AQr6AgrkAnY1LXNvcy02NWRhOWZkMzQ4MjU4YTA1NmZhY2NjZjEwZDUyZTU2Zi0wLTAtMX4yLlN-QVFvQ0NCSVNCd2pITWhBSkdEOG9BbGdDY0FBfkFRcFlDaW9JekxBQkVnUTVOVEUxR01jeUlMazhLUGFEcEFJd3FvcWtBamhVUUFCWUFXb0lSVU5QVTBGV1JWSUtLZ2pNc0FFU0F6TXhOaGk1UENEcmlRRW8xNHFrQWpERmk2UUNPRlJBQVZnQmFnaEZRMDlUUVZaRlVncFhDaW9JekxBQkVnTXpNemtZNjRrQklMazhLTFdqcFFJd21hU2xBamhUUUFCWUFXb0lSVU5QVTBGV1JWSUtLUWpNc0FFU0F6RXpPQmk1UENESE1paTByNlVDTU91MHBRSTRVMEFCV0FGcUNFVkRUMU5CVmtWU0Vnb0lBUkFCR0FFcUFreFlHQUVpQkFnQkVBRW9BaWdES0FRd0FREQAAAAAAi7lAIgEBKgUSAwoBMQ%3D%3D&leg1=from%3AHong%20Kong%20%28HKG-Hong%20Kong%20Intl.%29%2Cto%3ALondon%20%28LHR-Heathrow%29%2Cdeparture%3A2023%2F02%2F05TANYT&leg2=from%3ALondon%20%28LHR-Heathrow%29%2Cto%3AHong%20Kong%20%28HKG-Hong%20Kong%20Intl.%29%2Cdeparture%3A2023%2F02%2F19TANYT&mode=search&options=carrier%3A%2A%2Ccabinclass%3A%2Cmaxhops%3A1%2Cnopenalty%3AN&pageId=1&passengers=adults%3A1%2Cchildren%3A0%2Cinfantinlap%3AN&trip=roundtrip'
skyscanner_url = 'https://www.skyscanner.com.hk/transport/flights/hkg/lhr/230205/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27544008&inboundaltsenabled=false&infants=0&originentityid=27542065&outboundaltsenabled=false&preferdirects=false&ref=home&rtn=0'


# Main Functions

In [184]:

def update_url(url, depart=None, ret=None):
    """Return ``url`` with its embedded travel dates replaced.

    Parameters
    ----------
    url : str
        An Expedia (``www.expedia.com.hk``) or Skyscanner
        (``www.skyscanner.com.hk``) flight-search URL. URLs on any other
        domain are returned unchanged.
    depart, ret : tuple of (year, month, day) strings, optional
        Departure / return date, e.g. ``('2023', '03', '01')``. When omitted
        the notebook-level config values ``dyear/dmonth/ddate`` and
        ``ryear/rmonth/rdate`` are used.

    Returns
    -------
    str
        The rewritten URL.

    Notes
    -----
    Dates are located by pattern rather than by the literal dates of the
    originally captured search, so the function can be re-applied to an
    already-updated URL after the config changes.
    """
    # Globals are only consulted when the caller did not pass explicit dates.
    dep_y, dep_m, dep_d = depart if depart is not None else (dyear, dmonth, ddate)
    ret_y, ret_m, ret_d = ret if ret is not None else (ryear, rmonth, rdate)

    domain = urlparse(url).netloc
    if domain == 'www.expedia.com.hk':
        # Dates appear URL-encoded as YYYY%2FMM%2FDD. The look-aheads keep the
        # same anchoring the original literal replacement relied on: the
        # outbound date is followed by "TANYT&leg2", the inbound one by
        # "TANYT&mode=search&options".
        encoded_date = r'\d{4}%2F\d{2}%2F\d{2}'
        url = re.sub(encoded_date + r'(?=TANYT&leg2)',
                     lambda _m: f"{dep_y}%2F{dep_m}%2F{dep_d}", url)
        url = re.sub(encoded_date + r'(?=TANYT&mode=search&options)',
                     lambda _m: f"{ret_y}%2F{ret_m}%2F{ret_d}", url)
    elif domain == 'www.skyscanner.com.hk':
        # Dates are YYMMDD path segments: ".../<out>/<back>/?..." or, when only
        # one date segment is present (presumably a one-way search), ".../<out>/?...".
        out_segment = f"{dep_y[2:4]}{dep_m}{dep_d}"
        back_segment = f"{ret_y[2:4]}{ret_m}{ret_d}"
        url, replaced = re.subn(r'/\d{6}/\d{6}(?=[/?]|$)',
                                lambda _m: f"/{out_segment}/{back_segment}",
                                url, count=1)
        if not replaced:
            # Single date segment: only the departure date can be applied.
            url = re.sub(r'/\d{6}(?=[/?]|$)',
                         lambda _m: f"/{out_segment}", url, count=1)
    return url

def fetch_data(site):
    """Load a site's search-results page in headless Chrome and return its HTML.

    Parameters
    ----------
    site : str
        ``'expedia'`` or ``'skyscanner'``; selects the notebook-level
        ``expedia_url`` / ``skyscanner_url``.

    Returns
    -------
    str
        ``driver.page_source`` read immediately after ``driver.get`` returns.

    Raises
    ------
    ValueError
        If ``site`` is not one of the supported names. This is checked before
        a browser is started so that no Chrome process is left running.
    """
    if site == 'expedia':
        url = expedia_url
    elif site == 'skyscanner':
        url = skyscanner_url
    else:
        raise ValueError(f"Unknown site {site!r}; expected 'expedia' or 'skyscanner'")

    # NOTE(review): the recorded cell output shows a warning pointing at this
    # line - presumably Selenium's deprecation of `executable_path`; confirm
    # against the installed Selenium version before changing it.
    driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')
    try:
        driver.get(url)
        # driver.get_screenshot_as_file('/temp/fetch.png')  # handy when debugging a blank page
        return driver.page_source
    finally:
        # Always shut the browser down, even if navigation fails.
        driver.quit()

def convert_to_json(site, page_source):

    ## expedia
    if site == 'expedia':
        print(f'..........start {site}:')
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('li', attrs={'data-test-id': 'offer-listing'})
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            search_start = f'{dyear}{dmonth}{ddate}'
            search_end = f'{ryear}{rmonth}{rdate}'
            departure_time = item.find('span', attrs={'data-test-id': 'departure-time'}).text            
            arrival_departure_texts = item.find('div', attrs={'data-test-id': 'arrival-departure'}).text.split('(')
            from_port = arrival_departure_texts[1].split(')')[0]
            to_port = arrival_departure_texts[2].split(')')[0]
            arrival_departure = f'{from_port} - {to_port}'
            journey_duration_texts = item.find('div', attrs={'data-test-id': 'journey-duration'}).text.split('(')
            journey_duration = journey_duration_texts[0]
            stop = journey_duration_texts[1][0:len(journey_duration_texts[1])-1]
            flight_operated = item.find('div', attrs={'data-test-id': 'flight-operated'}).text
            lookup_price_texts = item.find('span', class_='uitk-price-a11y is-visually-hidden').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'search_start:' search_start,
                                    'search_end:' search_end,
                                    'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}

            count += 1
        
        with open(f'./results/flight_{site}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

    if site == 'skyscanner':
        print(f'..........start {site}:')
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('div', class_='BpkTicket_bpk-ticket__NTM0M')
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            from_time = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span').find('div').find('span').text
            to_time = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span').find('div').find('span').text
            departure_time= f"{from_time} - {to_time}"
            from_port = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            to_port = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            arrival_departure = f"{from_port} - {to_port}"
            journey_duration= item.find('div',class_='LegInfo_stopsContainer__NWIyN').find('span', class_ ='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--xs__MTAxY Duration_duration__NmUyM').text
            stop = item.find('div',class_='LegInfo_stopsLabelContainer__MmM0Z').find('span').text
            flight_operated= item.find('div', class_='LegLogo_legImage__MmY0Z').find('div').find('img')['alt']
            lookup_price_texts=item.find('div', class_='Price_mainPriceContainer__MDM3O').find('span').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}

            count += 1
        with open(f'./results/flight_{site}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

def run_pipline():
    """Combine the saved per-site search results into a single DataFrame.

    Reads ``results/flight_expedia.json`` and ``results/flight_skyscanner.json``
    relative to the current working directory (each parsed with
    ``orient='index'``, so the top-level JSON keys become the row index) and
    returns the Expedia rows followed by the Skyscanner rows.
    """
    print('--------------Pieline Start--------------')
    base_dir = os.getcwd()
    per_site_frames = [
        pd.read_json(os.path.join(base_dir, relative_path), orient='index')
        for relative_path in ("results/flight_expedia.json", "results/flight_skyscanner.json")
    ]
    combined = pd.concat(per_site_frames)
    print('--------------Pieline completed--------------')
    return combined

# Operations

In [111]:
# Re-target both saved search URLs at the dates from the config cell.
expedia_url = update_url(url=expedia_url)
skyscanner_url = update_url(url=skyscanner_url)

# Download both result pages before parsing either of them.
expedia_page_source = fetch_data(site='expedia')
skyscanner_page_source = fetch_data(site='skyscanner')

# Parse each page and write ./results/flight_<site>.json.
convert_to_json(site='expedia', page_source=expedia_page_source)
convert_to_json(site='skyscanner', page_source=skyscanner_page_source)


  driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')
  driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')


..........start expedia:
--------------Search in expedia completed, 5 record returned.--------------
..........start skyscanner:
--------------Search in skyscanner completed, 11 record returned.--------------


In [176]:
# Read the two per-site JSON files back and stack them into one DataFrame.
df = run_pipline()

working.......


# Explore data

In [177]:
# Summarise the combined frame: index range, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, expedia_1 to skyscanner_11
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   departure_time     16 non-null     object        
 1   arrival_departure  16 non-null     object        
 2   journey_duration   16 non-null     object        
 3   stop               16 non-null     object        
 4   flight_operated    16 non-null     object        
 5   lookup_price       16 non-null     int64         
 6   created_at         16 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 1.0+ KB


In [178]:
# Preview at most the first 20 rows of the combined results.
df.head(20)

Unnamed: 0,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at
expedia_1,23:45 - 9:45,HKG - LHR,18h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681538
expedia_2,23:45 - 10:45,HKG - LHR,19h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681691
expedia_3,23:45 - 12:45,HKG - LHR,21h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681833
expedia_4,8:30 - 15:00,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.681962
expedia_5,13:50 - 20:20,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.682093
skyscanner_1,21:00 - 11:40,HKG - LHR,22h 40,2 stops,Emirates,11302,2023-01-22 18:23:25.707281
skyscanner_2,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.707935
skyscanner_3,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.708574
skyscanner_4,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709211
skyscanner_5,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709851


# Apply functions

In [179]:
# Add the journey duration as a number of seconds so rows can be compared.
# text_to_seconds is defined in the "Transform data (utility)" section further
# down, so that cell has to be executed before this one.
df['m_journey_duration_sec'] = df['journey_duration'].map(text_to_seconds)

In [180]:
# Take the first 20 rows, then order them by price ascending. Note that head()
# runs before the sort, so rows beyond the first 20 are never considered.
df.head(20).sort_values(by = 'lookup_price')

Unnamed: 0,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
skyscanner_8,23:45 - 09:45,HKG - LHR,18h,1 stop,Lufthansa,5565,2023-01-22 18:23:25.711767,64800
expedia_1,23:45 - 9:45,HKG - LHR,18h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681538,64800
expedia_2,23:45 - 10:45,HKG - LHR,19h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681691,68400
expedia_3,23:45 - 12:45,HKG - LHR,21h 0m,1 stop,Lufthansa,6687,2023-01-22 18:23:25.681833,75600
skyscanner_2,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.707935,52200
skyscanner_3,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.708574,52200
skyscanner_4,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709211,52200
skyscanner_5,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709851,52200
expedia_4,8:30 - 15:00,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.681962,52200
expedia_5,13:50 - 20:20,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.682093,52200


# Get Insights

In [181]:
# Best price: every offer whose price equals the lowest price found.
# Series.min() is vectorised and skips missing values, unlike the builtin
# min(), whose result depends on element order when NaN is present.
df[df['lookup_price'] == df['lookup_price'].min()]

Unnamed: 0,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
skyscanner_8,23:45 - 09:45,HKG - LHR,18h,1 stop,Lufthansa,5565,2023-01-22 18:23:25.711767,64800


In [182]:
# Shortest journey: every offer tied for the minimum duration, cheapest first.
# Series.min() is vectorised and skips missing values, unlike the builtin
# min(), whose result depends on element order when NaN is present.
shortest_duration = df['m_journey_duration_sec'].min()
df[df['m_journey_duration_sec'] == shortest_duration].sort_values(by = 'lookup_price')

Unnamed: 0,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
skyscanner_2,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.707935,52200
skyscanner_3,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.708574,52200
skyscanner_4,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709211,52200
skyscanner_5,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7036,2023-01-22 18:23:25.709851,52200
expedia_4,8:30 - 15:00,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.681962,52200
expedia_5,13:50 - 20:20,HKG - LHR,14h 30m,Direct,Cathay Pacific,7162,2023-01-22 18:23:25.682093,52200
skyscanner_7,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7487,2023-01-22 18:23:25.711149,52200
skyscanner_9,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,7487,2023-01-22 18:23:25.712405,52200
skyscanner_10,08:30 - 15:00,HKG - LHR,14h 30,Direct,Cathay Pacific,7487,2023-01-22 18:23:25.713042,52200


# Transform data (utility)

In [53]:
# Expedia scratch: split a journey-duration label such as "14h 30m (Direct)"
# into the duration text and the stop description (closing ")" removed).
s2 = '18h 0m (1 stop)'
s = '14h 30m (Direct)'
duration_text, _, stop_text = s.partition('(')
s = [duration_text, stop_text[:-1]]

print(s[0])
print(s[1])

14h 30m 
Direct


In [55]:
# Expedia scratch: keep the amount that follows the 'HK$' currency prefix.
raw_price = 'HK$6,687'
s = raw_price.split('HK$')

print(s[-1])



6,687


In [109]:
# Expedia scratch: pull the two airport codes out of the route label.
route_label = 'Hong Kong (HKG) - London (LHR)'
s = route_label.split('(')

print(s)
# Each chunk after the first starts with "<CODE>)"; keep the part before ")".
from_port, to_port = (chunk.split(')')[0] for chunk in s[1:3])
print(f'{from_port} - {to_port}')

['Hong Kong ', 'HKG) - London ', 'LHR)']
HKG - LHR


In [169]:
# Duration labels differ per site: Expedia writes "14h 30m", Skyscanner drops
# the minutes unit ("22h 40") or the minutes altogether ("18h").

def text_to_seconds(text):
    """Convert a duration label into a number of seconds.

    Recognised units are ``d`` (days), ``h`` (hours) and ``m`` (minutes),
    each optionally separated from its number by one whitespace character.
    A label that ends in a bare number is treated as ending in minutes.

    Examples
    --------
    ``'22h 40'`` -> 81600, ``'18h'`` -> 64800, ``'14h 30m'`` -> 52200,
    ``'1d 14h 30m'`` -> 138600.

    Parameters
    ----------
    text : str
        Duration label; surrounding whitespace is ignored.

    Returns
    -------
    int
        Total seconds; 0 when no ``<number><unit>`` pair is found.
    """
    # Strip first so a trailing space cannot hide a unit-less minutes value.
    text = text.strip()
    if re.search(r'\d+$', text) is not None:
        text = text + 'm'

    # A day is 24 hours (the previous weight of 60*60*60 overstated it).
    in_seconds = {'d': 24 * 60 * 60, 'h': 60 * 60, 'm': 60}
    return sum(int(num) * in_seconds[unit]
               for num, unit in re.findall(r'(\d+)\s?(m|d|h)', text))