# Setup

In [2]:
# run cmd 'Jupter notebook'

In [3]:
import time
from bs4 import BeautifulSoup
import requests
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlparse
import datetime
import json
import pandas as pd
import numpy as np
import os

# Configure driver

In [4]:
## initalize headless driver config
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

options = webdriver.ChromeOptions()
options.headless = False
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--window-size=1920,1080")
options.add_argument('--ignore-certificate-errors')
options.add_argument('--allow-running-insecure-content')
options.add_argument("--disable-extensions")
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
options.add_argument("--start-maximized")
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')

## config
dyear = '2023'
dmonth = '02'
ddate = '01'

# dyear = '2023'
# dmonth = '03'
# ddate = '11'
# ryear = '2023'
# rmonth = '03'
# rdate = '21'
from_port = 'hkg'
to_port = 'lhr'

# from_port = 'lhr'
# to_port = 'hkg'

# 11/3 Hong Kong to Paris
# 21/3 London to Hong Kong

# Main Functions

In [8]:
############utility functions

def text_to_seconds(text):
    import re
    if re.search(r'\d+$', text) is not None:
        text = text+'m'

    in_seconds = {'d': 60 *60* 60, 'h': 60 * 60, 'm': 60}
    seconds = sum(int(num) * in_seconds[weight] for num, weight in re.findall(r'(\d+)\s?(m|d|h)', text))
    return seconds

def get_count_in_directory(directory_name):
    import os
    path = os.path.join(os.getcwd(),directory_name) # /Users/doge/python-web-scraping/path
    return len(os.listdir(path))

###############

def map_port_name(site, port_code):
        
    if site == 'expedia':
        match port_code:
            case 'hkg':
                port_name = '%3AHong%20Kong%20%28HKG-Hong%20Kong%20Intl.%29%2C'
            case 'cdg':
                port_name = '%3AParis%20%28CDG%20-%20Roissy-Charles%20de%20Gaulle%29%2C'
            case 'lhr':
                port_name = '%3ALondon%20%28LHR-Heathrow%29%2C'
            case _:
                ''
    return port_name

def generate_url(site, from_port, to_port,dyear,dmonth,ddate):
    ## @dev: setup url per config date
    if site == 'expedia':
        from_port = map_port_name(site,from_port)
        to_port = map_port_name(site,to_port)
        departure = f'%3A{dyear}%2F{dmonth}%2F{ddate}'
        url = f'https://www.expedia.com.hk/Flights-Search?leg1=from{from_port}to{to_port}departure{departure}TANYT&mode=search&options=carrier%3A%2A%2Ccabinclass%3A%2Cmaxhops%3A1%2Cnopenalty%3AN&pageId=0&passengers=adults%3A1%2Cchildren%3A0%2Cinfantinlap%3AN&trip=oneway'

    if site == 'skyscanner':
        departure = f'{dyear[2:4]}{dmonth}{ddate}'
        url = f'https://www.skyscanner.com.hk/transport/flights/{from_port}/{to_port}/{departure}/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27544008&inboundaltsenabled=false&infants=0&originentityid=27542065&outboundaltsenabled=false&preferdirects=false&ref=home&rtn=0'

    return url

def fetch_data(site):
    driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')
    if site == 'expedia':
        url = expedia_url
    if site == 'skyscanner':
        url = skyscanner_url

    driver.get(url)
    # driver.get_screenshot_as_file('/temp/fetch.png') 
    time.sleep((1 * 60))
    page_source = driver.page_source
    driver.quit()
    return page_source

def convert_to_json(site, page_source):

    ## expedia
    if site == 'expedia':
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('li', attrs={'data-test-id': 'offer-listing'})
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            search_start = f'{dyear}{dmonth}{ddate}'
            departure_time = item.find('span', attrs={'data-test-id': 'departure-time'}).text            
            arrival_departure_texts = item.find('div', attrs={'data-test-id': 'arrival-departure'}).text.split('(')
            from_port = arrival_departure_texts[1].split(')')[0]
            to_port = arrival_departure_texts[2].split(')')[0]
            arrival_departure = f'{from_port} - {to_port}'
            journey_duration_texts = item.find('div', attrs={'data-test-id': 'journey-duration'}).text.split('(')
            journey_duration = journey_duration_texts[0]
            stop = journey_duration_texts[1][0:len(journey_duration_texts[1])-1]
            flight_operated = item.find('div', attrs={'data-test-id': 'flight-operated'}).text
            lookup_price_texts = item.find('span', class_='uitk-price-a11y is-visually-hidden').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'search_start': search_start,
                                    'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}

            count += 1
        file_sequence = get_count_in_directory('results') + 1
        with open(f'./results/flight_{site}_{file_sequence}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

    if site == 'skyscanner':
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('div', class_='BpkTicket_bpk-ticket__NTM0M')
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            search_start = f'{dyear}{dmonth}{ddate}'
            from_time = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span').find('div').find('span').text
            to_time = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span').find('div').find('span').text
            departure_time= f"{from_time} - {to_time}"
            from_port = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            to_port = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            arrival_departure = f"{from_port} - {to_port}"
            journey_duration= item.find('div',class_='LegInfo_stopsContainer__NWIyN').find('span', class_ ='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--xs__MTAxY Duration_duration__NmUyM').text
            stop = item.find('div',class_='LegInfo_stopsLabelContainer__MmM0Z').find('span').text
            flight_operated= item.find('div', class_='LegLogo_legImage__MmY0Z').find('div').find('img')['alt']
            lookup_price_texts=item.find('div', class_='Price_mainPriceContainer__MDM3O').find('span').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'search_start': search_start,
                                    'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}
            count += 1
        file_sequence = get_count_in_directory('results') + 1
        with open(f'./results/flight_{site}_{file_sequence}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

def run_pipline():
    print('--------------Pieline Start--------------')
    # Import libraries
    import glob
    import os
    import pandas as pd

    # Get CSV files list from a folder
    pwd = os.getcwd()
    path = os.path.join(pwd,"results")
    json_files = glob.glob(path + "/*.json")

    # Read each CSV file into DataFrame
    # This creates a list of dataframes
    df_list = (pd.read_json(file, orient = 'index') for file in json_files)

    # Concatenate all DataFrames
    big_df   = pd.concat(df_list, ignore_index=True)

    print('--------------Pieline completed--------------')
    return big_df

# Opeations

In [6]:
expedia_url = generate_url('expedia', from_port, to_port,dyear,dmonth,ddate)
skyscanner_url = generate_url('skyscanner', from_port, to_port,dyear,dmonth,ddate)

expedia_page_source = fetch_data('expedia')
skyscanner_page_source = fetch_data('skyscanner')

convert_to_json('expedia',expedia_page_source)
convert_to_json('skyscanner',skyscanner_page_source)

In [9]:
df = run_pipline()

--------------Pieline Start--------------
--------------Pieline completed--------------


# Explore data

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   search_start       36 non-null     int64         
 1   departure_time     36 non-null     object        
 2   arrival_departure  36 non-null     object        
 3   journey_duration   36 non-null     object        
 4   stop               36 non-null     object        
 5   flight_operated    36 non-null     object        
 6   lookup_price       36 non-null     int64         
 7   created_at         36 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 2.4+ KB


In [11]:
df.head(20)

Unnamed: 0,search_start,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at
0,20230201,19:10 - 06:20,HKG - LHR,19h 10,1 stop,Qatar Airways,5427,2023-01-22 21:29:24.575107
1,20230201,23:50 - 06:20,HKG - LHR,14h 30,Direct,Cathay Pacific,4541,2023-01-22 21:29:24.575689
2,20230201,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,4541,2023-01-22 21:29:24.576256
3,20230201,18:50 - 05:55,HKG - LHR,19h 05,1 stop,Singapore Airlines,4946,2023-01-22 21:29:24.576841
4,20230201,23:00 - 05:50,HKG - LHR,14h 50,Direct,British Airways,5850,2023-01-22 21:29:24.577417
5,20230201,21:00 - 11:40,HKG - LHR,22h 40,2 stops,Emirates,6574,2023-01-22 21:29:24.578019
6,20230201,22:20 - 05:00,HKG - LHR,14h 40,Direct,Cathay Pacific,5991,2023-01-22 21:29:24.578609
7,20230201,19:45 - 06:20,HKG - LHR,18h 35,1 stop,Thai Airways,4707,2023-01-22 21:29:24.579207
8,20230201,21:00 - 11:40,HKG - LHR,22h 40,2 stops,Emirates,4206,2023-01-22 21:29:24.579811
9,20230201,21:00 - 13:50,HKG - LHR,24h 50,2 stops,Emirates,3917,2023-01-22 21:29:24.580412


# Apply functions

In [12]:
# df['journey_duration_sec'] = df['journey_duration'].map(lambda a: text_to_seconds(a))
df['m_journey_duration_sec'] = df['journey_duration'].map(lambda a: text_to_seconds(a))

In [13]:
df.head(20).sort_values(by = 'lookup_price')

Unnamed: 0,search_start,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
9,20230201,21:00 - 13:50,HKG - LHR,24h 50,2 stops,Emirates,3917,2023-01-22 21:29:24.580412,89400
8,20230201,21:00 - 11:40,HKG - LHR,22h 40,2 stops,Emirates,4206,2023-01-22 21:29:24.579811,81600
11,20230201,19:10 - 22:00,HKG - LHR,34 小時 50 分鐘,1 個停站,卡塔爾航空,4345,2023-01-22 21:29:24.553906,0
1,20230201,23:50 - 06:20,HKG - LHR,14h 30,Direct,Cathay Pacific,4541,2023-01-22 21:29:24.575689,52200
2,20230201,13:50 - 20:20,HKG - LHR,14h 30,Direct,Cathay Pacific,4541,2023-01-22 21:29:24.576256,52200
7,20230201,19:45 - 06:20,HKG - LHR,18h 35,1 stop,Thai Airways,4707,2023-01-22 21:29:24.579207,66900
12,20230201,13:50 - 20:20,HKG - LHR,14 小時 30 分鐘,直航,國泰航空,4712,2023-01-22 21:29:24.554062,0
13,20230201,23:50 - 06:20,HKG - LHR,14 小時 30 分鐘,直航,國泰航空,4712,2023-01-22 21:29:24.554209,0
15,20230201,12:45 - 06:20,HKG - LHR,25 小時 35 分鐘,1 個停站,泰國國際航空,4863,2023-01-22 21:29:24.554509,0
14,20230201,19:45 - 06:20,HKG - LHR,18 小時 35 分鐘,1 個停站,泰國國際航空,4863,2023-01-22 21:29:24.554358,0


# Get Inslight

In [14]:
# best price
df[df['lookup_price']==min(df['lookup_price'])]

Unnamed: 0,search_start,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
9,20230201,21:00 - 13:50,HKG - LHR,24h 50,2 stops,Emirates,3917,2023-01-22 21:29:24.580412,89400


In [15]:
# shortest
df[df['m_journey_duration_sec']==min(df['m_journey_duration_sec'])].sort_values(by = 'lookup_price')

Unnamed: 0,search_start,departure_time,arrival_departure,journey_duration,stop,flight_operated,lookup_price,created_at,m_journey_duration_sec
11,20230201,19:10 - 22:00,HKG - LHR,34 小時 50 分鐘,1 個停站,卡塔爾航空,4345,2023-01-22 21:29:24.553906,0
12,20230201,13:50 - 20:20,HKG - LHR,14 小時 30 分鐘,直航,國泰航空,4712,2023-01-22 21:29:24.554062,0
13,20230201,23:50 - 06:20,HKG - LHR,14 小時 30 分鐘,直航,國泰航空,4712,2023-01-22 21:29:24.554209,0
14,20230201,19:45 - 06:20,HKG - LHR,18 小時 35 分鐘,1 個停站,泰國國際航空,4863,2023-01-22 21:29:24.554358,0
15,20230201,12:45 - 06:20,HKG - LHR,25 小時 35 分鐘,1 個停站,泰國國際航空,4863,2023-01-22 21:29:24.554509,0
16,20230201,19:40 - 19:20,HKG - LHR,31 小時 40 分鐘,2 個停站,長榮航空,4981,2023-01-22 21:29:24.554657,0
17,20230201,13:35 - 19:20,HKG - LHR,37 小時 45 分鐘,2 個停站,長榮航空,4981,2023-01-22 21:29:24.554805,0
18,20230201,11:15 - 19:20,HKG - LHR,40 小時 5 分鐘,2 個停站,長榮航空,4981,2023-01-22 21:29:24.554956,0
19,20230201,19:45 - 19:35,HKG - LHR,31 小時 50 分鐘,1 個停站,泰國國際航空,5039,2023-01-22 21:29:24.555109,0
20,20230201,12:45 - 19:35,HKG - LHR,38 小時 50 分鐘,1 個停站,泰國國際航空,5039,2023-01-22 21:29:24.555248,0


# Transform data (utility)

In [None]:
# experdia_1

s2 = '18h 0m (1 stop)'
s = '14h 30m (Direct)'
s = s.split('(')
s[1] = s[1][0:len(s[1])-1]

print(s[0])
print(s[1])

14h 30m 
Direct


In [None]:
# experdia_1
s = 'HK$6,687'
s = s.split('HK$')

print(s[1])



6,687


In [None]:
# experdia_1
s = 'Hong Kong (HKG) - London (LHR)'
s = s.split('(')

print(s)
from_port=s[1].split(')')[0]
to_port=s[2].split(')')[0]
print(f'{from_port} - {to_port}')

['Hong Kong ', 'HKG) - London ', 'LHR)']
HKG - LHR


In [None]:
# skyscanner
# @dev: to convert the text into seconds and missing unit 'm' if the last char is a digit
# s = '22h 40'
# s1 = '19h 15'
# s2 = '18h'
# s3 = '14h 30m'
# s4 = '1d 14h 30m'

def text_to_seconds(text):
    import re
    if re.search(r'\d+$', text) is not None:
        text = text+'m'

    in_seconds = {'d': 60 *60* 60, 'h': 60 * 60, 'm': 60}
    seconds = sum(int(num) * in_seconds[weight] for num, weight in re.findall(r'(\d+)\s?(m|d|h)', text))
    return seconds

#@debug:
#print(re.findall(r'(\d+)\s?(m|d|h)', text))
#print(0*60*60*60+22*60*60+40*60)
#print(seconds)

Test

In [None]:

def text_to_seconds(text):
    import re
    if re.search(r'\d+$', text) is not None:
        text = text+'m'

    in_seconds = {'d': 60 *60* 60, 'h': 60 * 60, 'm': 60}
    seconds = sum(int(num) * in_seconds[weight] for num, weight in re.findall(r'(\d+)\s?(m|d|h)', text))
    return seconds

def get_count_in_directory(directory_name):
    import os
    path = os.path.join(os.getcwd(),directory_name) # /Users/doge/python-web-scraping/path
    return len(os.listdir(path))

###############

def map_port_name(site, port_code):
        
    if site == 'expedia':
        match port_code:
            case 'hkg':
                port_name = '%3AHong%20Kong%20%28HKG-Hong%20Kong%20Intl.%29%2C'
            case 'cdg':
                port_name = '%3AParis%20%28CDG%20-%20Roissy-Charles%20de%20Gaulle%29%2C'
            case 'lhr':
                port_name = '%3ALondon%20%28LHR-Heathrow%29%2C'
            case _:
                ''
    return port_name

def generate_url(site, from_port, to_port,dyear,dmonth,ddate):
    ## @dev: setup url per config date
    if site == 'expedia':
        from_port = map_port_name(site,from_port)
        to_port = map_port_name(site,to_port)
        departure = f'%3A{dyear}%2F{dmonth}%2F{ddate}'
        url = f'https://www.expedia.com.hk/Flights-Search?langid=2057&leg1=from{from_port}to{to_port}departure{departure}TANYT&mode=search&options=carrier%3A%2A%2Ccabinclass%3A%2Cmaxhops%3A1%2Cnopenalty%3AN&pageId=0&passengers=adults%3A1%2Cchildren%3A0%2Cinfantinlap%3AN&trip=oneway'

    if site == 'skyscanner':
        departure = f'{dyear[2:4]}{dmonth}{ddate}'
        url = f'https://www.skyscanner.com.hk/transport/flights/{from_port}/{to_port}/{departure}/?adults=1&adultsv2=1&cabinclass=economy&children=0&childrenv2=&destinationentityid=27544008&inboundaltsenabled=false&infants=0&originentityid=27542065&outboundaltsenabled=false&preferdirects=false&ref=home&rtn=0'

    return url

def fetch_data(site):
    driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')
    if site == 'expedia':
        url = expedia_url
    if site == 'skyscanner':
        url = skyscanner_url

    driver.get(url)
    # driver.get_screenshot_as_file('/temp/fetch.png') 
    time.sleep((1 * 60))
    page_source = driver.page_source
    driver.quit()
    return page_source

def convert_to_json(site, page_source):

    ## expedia
    if site == 'expedia':
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('li', attrs={'data-test-id': 'offer-listing'})
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            search_start = f'{dyear}{dmonth}{ddate}'
            departure_time = item.find('span', attrs={'data-test-id': 'departure-time'}).text            
            arrival_departure_texts = item.find('div', attrs={'data-test-id': 'arrival-departure'}).text.split('(')
            from_port = arrival_departure_texts[1].split(')')[0]
            to_port = arrival_departure_texts[2].split(')')[0]
            arrival_departure = f'{from_port} - {to_port}'
            journey_duration_texts = item.find('div', attrs={'data-test-id': 'journey-duration'}).text.split('(')
            journey_duration = journey_duration_texts[0]
            stop = journey_duration_texts[1][0:len(journey_duration_texts[1])-1]
            flight_operated = item.find('div', attrs={'data-test-id': 'flight-operated'}).text
            lookup_price_texts = item.find('span', class_='uitk-price-a11y is-visually-hidden').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'search_start': search_start,
                                    'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}

            count += 1
        file_sequence = get_count_in_directory('results') + 1
        with open(f'./results/flight_{site}_{file_sequence}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

    if site == 'skyscanner':
        soup = BeautifulSoup(page_source, 'lxml')
        items = soup.find_all('div', class_='BpkTicket_bpk-ticket__NTM0M')
        count = 0
        options_dict = {}

        for index, item in enumerate(items):

            option = f'{site}_{index + 1}'
            search_start = f'{dyear}{dmonth}{ddate}'
            from_time = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span').find('div').find('span').text
            to_time = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span').find('div').find('span').text
            departure_time= f"{from_time} - {to_time}"
            from_port = item.find('div', class_ ='LegInfo_routePartialDepart__NzEwY').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            to_port = item.find('div', class_ ='LegInfo_routePartialArrive__Y2U1N').find('span',class_='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--body-default__MzkyN').find('div').find('span').text
            arrival_departure = f"{from_port} - {to_port}"
            journey_duration= item.find('div',class_='LegInfo_stopsContainer__NWIyN').find('span', class_ ='BpkText_bpk-text__ZWIzZ BpkText_bpk-text--xs__MTAxY Duration_duration__NmUyM').text
            stop = item.find('div',class_='LegInfo_stopsLabelContainer__MmM0Z').find('span').text
            flight_operated= item.find('div', class_='LegLogo_legImage__MmY0Z').find('div').find('img')['alt']
            lookup_price_texts=item.find('div', class_='Price_mainPriceContainer__MDM3O').find('span').text.split('HK$')
            lookup_price = lookup_price_texts[1].replace(",", "")
            created_at= str(datetime.datetime.now())

            options_dict[option] = {'search_start': search_start,
                                    'departure_time': departure_time,
                                    'arrival_departure':  arrival_departure,
                                    'journey_duration': journey_duration,
                                    'stop': stop,
                                    'flight_operated': flight_operated,
                                    'lookup_price': lookup_price,
                                    'created_at': created_at}
            count += 1
        file_sequence = get_count_in_directory('results') + 1
        with open(f'./results/flight_{site}_{file_sequence}.json', 'w+') as f:
            f.write(json.dumps(options_dict))
        print(f'--------------Search in {site} completed, {count} record returned.--------------')

def run_pipline():
    print('--------------Pieline Start--------------')
    # Import libraries
    import glob
    import os
    import pandas as pd

    # Get CSV files list from a folder
    pwd = os.getcwd()
    path = os.path.join(pwd,"results")
    json_files = glob.glob(path + "/*.json")

    # Read each CSV file into DataFrame
    # This creates a list of dataframes
    df_list = (pd.read_json(file, orient = 'index') for file in json_files)

    # Concatenate all DataFrames
    big_df   = pd.concat(df_list, ignore_index=True)

    print('--------------Pieline completed--------------')
    return big_df



  driver = webdriver.Chrome(options=options, executable_path=r'.\chromedriver_mac64\chromedriver')


--------------Search in expedia completed, 0 record returned.--------------
