## Import packages

In [1]:
import itertools
import re
import pandas as pd
import numpy as np
import datetime
import time
import json

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from datetime import date, timedelta, datetime
from time import sleep, strftime
from random import randint
from tqdm import tqdm
from math import factorial

## User inputs

In [28]:
executable_path = '/Users/junerodriguez/Downloads/chromedriver_mac_arm64/chromedriver'

# List the cities you want to travel to and from, how long you'd like to stay in each, and the appropriate start/end dates
start_city = 'Amsterdam'
end_city = 'Amsterdam'
start_date = '2023-02-14'

cities = ['Warsaw', 'Sofia', 'Milan', 'Belgrade']
days = [3,3,2,3]

# depart_time_interval = ['1000','2000']
# arrive_time_interval = ['1000','2000']

takeoff_constraint = 'takeoff=0900,2000__0900,2000__0900,2000__0900,2000__0900,2000'
landing_constraint = 'landing=1000,2000__1000,2000__1000,2000__1000,2000__1000,1700'

## Functions

In [4]:
def generate_permutations(cities, days, start_city, end_city, start_date):
    """
    Description:
    Returns a df showing all possible journeys using the user-input arguments 
    
    Arguments:
    • cities: list of desired cities to travel to e.g., cities = ['Warsaw', 'Sofia', 'Belgrade', 'Milan'] 
    • days: list of days in each of the cities e.g., days = [3,2,3,2], meaning 3 days in Warsaw, 2 days in Sofia etc.
    • start_city: string of the city you're starting your journey from e.g., 'Amsterdam'
    • end_city: string of the city you're ending your journey in, probably the same as start_city e.g., 'Amsterdam'
    • start_date: string of the date the journey is starting on in 'YYYY-MM-DD' format e.g., '2023-02-15'
    """
    with open("iata_codes.json") as f:
        iata = json.load(f)

    permutations = [
        (start_city,) + p + (end_city,) for p in itertools.permutations(cities)
    ]
    flight_dates = pd.to_datetime(start_date) + pd.to_timedelta(
        np.array([0] + days).cumsum(),
        unit="D",
    )

    # Generate the URLs
    urls = []
    for p in permutations:
        # The pattern for each segment is
        #     START-END,nearby/yyyy-dd-dd
        mid_url = "/".join(
            [
                f"{iata[s]}-{iata[e]},nearby/{fd:%Y-%m-%d}"
                for s, e, fd in zip(p[:-1], p[1:], flight_dates)
            ]
        )
        urls.append(f"https://www.kayak.com/flights/{mid_url}/?sort=bestflight_a&fs={landing_constraint};{takeoff_constraint}")

    # Generate the resulting dataframe
    return (
        pd.DataFrame(
            permutations,
            columns=["origin", *[f"city{i+1}" for i in range(len(cities))], "end"],
        )
        .merge(
            pd.DataFrame(
                flight_dates,
                index=[f"flight_dt_{i+1}" for i in range(len(flight_dates))],
            ).T,
            how="cross",
        )
        .assign(kayak_search_url=urls)
    )

In [109]:
def scrape_permutations(executable_path, urls):
    """
    Description: 
    Scrapes prices and URLs for the quickest i.e., "best" and cheapest journey options for all permutations and returns a df 
    
    Arguments:
    • urls: this is a list made from the 'kayak_search_url' column in the df returned from the generate_permutations function 
    """
    # Grabbing best & cheapeast flight info (price and link) for one iteration
    xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
    xp_prices_2 = """//div[contains(@class, 'price-text')]"""
    xp_urls = """//div[@class='col col-best']//a[@href]"""
    xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
    
    total_time = len(urls)*45
    minutes, seconds = divmod(total_time, 60)
    now = datetime.now()
    
    print(f"Function was run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
    
    if minutes > 0:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {minutes} minutes and {seconds} seconds.")
    else:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {seconds} seconds.")
    
    dfs = []

    for url in urls:
        try:
            requests = 0

            agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
            chrome_options.add_experimental_option('useAutomationExtension', False)

            driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
            driver.implicitly_wait(10)
            driver.get(url)
            sleep(randint(8,10))

            ##################
            ##################
            # Get prices:
            prices = driver.find_elements_by_xpath(xp_prices)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))

            if not prices_list:
                prices = driver.find_elements_by_xpath(xp_prices_2)
                prices_list = [price.text.replace('$','') for price in prices if price.text != '']
                prices_list = [price.replace(',','') for price in prices_list]
                prices_list = list(map(float, prices_list))
            else:
                prices_list

            ##################
            ##################
            # Get links:        
            data = []
            elems = driver.find_elements_by_xpath(xp_urls)

            for elem in elems:
                data.append(elem.get_attribute("href"))

            if not data:
                elems = driver.find_elements_by_xpath(xp_urls_2)

                for elem in elems:
                    data.append(elem.get_attribute("href"))
            else:
                data

            df_elem = pd.DataFrame(data, columns=['Links'])

            ##################
            ##################
            # Make df and append to list: 
            new_df = pd.DataFrame({'quickest_price': [prices_list[0]],
                                   'cheapest_price': [prices_list[1]],
                                   'quickest_link': [df_elem['Links'][0]],
                                   'cheapest_link': [df_elem['Links'][1]]})

            driver.close()

            dfs.append(new_df)

        except IndexError:
            pass

    total_df = pd.concat(dfs).reset_index(drop=True)

    # convert URL list to pandas series
    series = pd.Series(urls, name='kayak_search_url')

    # concatenate the series and dataframe
    df_scrape = pd.concat([series, total_df], axis=1)

    return df_scrape

In [5]:
def scrape_permutations(executable_path, urls):
    """
    Description: 
    Scrapes prices and URLs for the quickest i.e., "best" and cheapest journey options for all permutations and returns a df 
    
    Arguments:
    • urls: this is a list made from the 'kayak_search_url' column in the df returned from the generate_permutations function 
    """
    # Grabbing best & cheapeast flight info (price and link) for one iteration
    xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
    xp_prices_2 = """//div[contains(@class, 'price-text')]"""
    xp_urls = """//div[@class='col col-best']//a[@href]"""
    xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
    
    total_time = len(urls)*45
    minutes, seconds = divmod(total_time, 60)
    now = datetime.now()
    
    print(f"Function was run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
    
    if minutes > 0:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {minutes} minutes and {seconds} seconds.")
    else:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {seconds} seconds.")
    
    dfs = []

    for url in urls:
        try:
            requests = 0

            agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
            chrome_options.add_experimental_option('useAutomationExtension', False)

            driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
            driver.implicitly_wait(10)
            driver.get(url)
            sleep(randint(8,10))

            ##################
            ##################
            # Get prices:
            prices = driver.find_elements_by_xpath(xp_prices)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))

            if not prices_list:
                prices = driver.find_elements_by_xpath(xp_prices_2)
                prices_list = [price.text.replace('$','') for price in prices if price.text != '']
                prices_list = [price.replace(',','') for price in prices_list]
                prices_list = list(map(float, prices_list))
            else:
                prices_list

            ##################
            ##################
            # Get links:        
            data = []
            elems = driver.find_elements_by_xpath(xp_urls)

            for elem in elems:
                data.append(elem.get_attribute("href"))

            if not data:
                elems = driver.find_elements_by_xpath(xp_urls_2)

                for elem in elems:
                    data.append(elem.get_attribute("href"))
            else:
                data

            df_elem = pd.DataFrame(data, columns=['Links'])

            ##################
            ##################
            # Make df and append to list: 
            quickest_price = prices_list[0] if prices_list else 'Not Available'
            cheapest_price = prices_list[1] if prices_list else 'Not Available'
            quickest_link = df_elem['Links'][0] if df_elem['Links'][0] else 'Not Available'
            cheapest_link = df_elem['Links'][1] if df_elem['Links'][1] else 'Not Available'

            new_df = pd.DataFrame({'kayak_search_url': [url],
                                   'quickest_price': [quickest_price],
                                   'cheapest_price': [cheapest_price],
                                   'quickest_link': [quickest_link],
                                   'cheapest_link': [cheapest_link]})
            
            driver.close()

            dfs.append(new_df)

        except IndexError:
            pass

    total_df = pd.concat(dfs).reset_index(drop=True)

    # # convert URL list to pandas series
    # series = pd.Series(urls, name='kayak_search_url')

    # # concatenate the series and dataframe
    # df_scrape = pd.concat([series, total_df], axis=1)

    return df_scrape

In [12]:
def scrape_permutations(executable_path, urls):
    """
    Description: 
    Scrapes prices and URLs for the quickest i.e., "best" and cheapest journey options for all permutations and returns a df 
    
    Arguments:
    • urls: this is a list made from the 'kayak_search_url' column in the df returned from the generate_permutations function 
    """
    # Grabbing best & cheapeast flight info (price and link) for one iteration
    xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
    xp_prices_2 = """//div[contains(@class, 'price-text')]"""
    xp_urls = """//div[@class='col col-best']//a[@href]"""
    xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
    
    total_time = len(urls)*45
    minutes, seconds = divmod(total_time, 60)
    now = datetime.now()
    
    print(f"Function was run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")
    
    if minutes > 0:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {minutes} minutes and {seconds} seconds.")
    else:
        print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {seconds} seconds.")
    
    dfs = []

    for url in urls:
        try:
            requests = 0

            agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
            chrome_options.add_experimental_option('useAutomationExtension', False)

            driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
            driver.implicitly_wait(10)
            driver.get(url)
            sleep(randint(8,10))

            ##################
            ##################
            # Get prices:
            prices = driver.find_elements_by_xpath(xp_prices)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))

            if not prices_list:
                prices = driver.find_elements_by_xpath(xp_prices_2)
                prices_list = [price.text.replace('$','') for price in prices if price.text != '']
                prices_list = [price.replace(',','') for price in prices_list]
                prices_list = list(map(float, prices_list))
            else:
                prices_list

            ##################
            ##################
            # Get links:        
            data = []
            elems = driver.find_elements_by_xpath(xp_urls)

            for elem in elems:
                data.append(elem.get_attribute("href"))

            if not data:
                elems = driver.find_elements_by_xpath(xp_urls_2)

                for elem in elems:
                    data.append(elem.get_attribute("href"))
            else:
                data

            df_elem = pd.DataFrame(data, columns=['Links'])

            ##################
            ##################
            # Make df and append to list: 
            try:
                quickest_price = prices_list[0] if prices_list[0] else 'Not Available'
            except (IndexError, KeyError, ValueError):
                quickest_price = 'Not Available'

            try:
                cheapest_price = prices_list[1] if prices_list[1] else 'Not Available'
            except (IndexError, KeyError, ValueError):
                cheapest_price = 'Not Available'

            try: 
                quickest_link = df_elem['Links'][0] if df_elem['Links'][0] else 'Not Available'
            except (IndexError, KeyError, ValueError):
                quickest_link = 'Not Available'

            try:
                cheapest_link = df_elem['Links'][1] if df_elem['Links'][1] else 'Not Available'
            except (IndexError, KeyError, ValueError):
                cheapest_link = 'Not Available'

            new_df = pd.DataFrame({'kayak_search_url': [url],
                                   'quickest_price': [quickest_price],
                                   'cheapest_price': [cheapest_price],
                                   'quickest_link': [quickest_link],
                                   'cheapest_link': [cheapest_link]})
            
            driver.close()

            dfs.append(new_df)

        except IndexError:
            pass

    total_df = pd.concat(dfs).reset_index(drop=True)

    # # convert URL list to pandas series
    # series = pd.Series(urls, name='kayak_search_url')

    # # concatenate the series and dataframe
    # df_scrape = pd.concat([series, total_df], axis=1)

    return total_df

In [7]:
def merge_dfs(df_perm, df_scrape):
    """
    Description: Merge scraped df and permutations df
    
    Arguments:
    • df_perm: df with all permutations
    • df_scrape: df with all scraped details of permutations
    """
    merged_df = pd.merge(df_perm, df_scrape, on='kayak_search_url', how='left')

    return merged_df

In [217]:
# url = 'https://raw.githubusercontent.com/mborsetti/airportsdata/main/airportsdata/airports.csv'
# df_IATA = pd.read_csv(url, sep = ',')
# df_IATA.to_csv('iata_codes.csv', index=False)

## Test the functions

In [29]:
%%time

df_perms = generate_permutations(cities, days, start_city, end_city, start_date)
df_scrape = scrape_permutations(executable_path=executable_path, urls=df_perms['kayak_search_url'].tolist())
df_merged = merge_dfs(df_perms, df_scrape)

Function was run at: 2023-02-05 15:51:09
This scraper was exectuted on 2023-02-05 15:51:09 and is estimated to finalize scraping all data in 18 minutes and 0 seconds.
CPU times: user 298 ms, sys: 669 ms, total: 967 ms
Wall time: 19min 7s


In [30]:
df_merged.to_csv('flights_warsaw.csv', index=False)

In [157]:
# Grabbing best & cheapeast flight info (price and link) for one iteration
xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
xp_prices_2 = """//div[contains(@class, 'price-text')]"""
xp_urls = """//div[@class='col col-best']//a[@href]"""
xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""

urls = ['https://www.kayak.com/flights/AMS-KRK,nearby/2023-02-14/KRK-SOF,nearby/2023-02-17/SOF-AMS,nearby/2023-02-20/?sort=bestflight_a&fs=landing=1000,2000__1000,2000__1000,2000__1000,2000__1000,1700;takeoff=0900,2000__0900,2000__0900,2000__0900,2000__0900,2000']
total_time = len(urls)*45
minutes, seconds = divmod(total_time, 60)
now = datetime.now()

print(f"Function was run at: {now.strftime('%Y-%m-%d %H:%M:%S')}")

if minutes > 0:
    print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {minutes} minutes and {seconds} seconds.")
else:
    print(f"This scraper was exectuted on {now.strftime('%Y-%m-%d %H:%M:%S')} and is estimated to finalize scraping all data in {seconds} seconds.")

dfs = []

for url in urls:
    try:
        requests = 0

        agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
        driver.implicitly_wait(10)
        driver.get(url)
        sleep(randint(8,10))

        ##################
        ##################
        # Get prices:
        prices = driver.find_elements_by_xpath(xp_prices)
        prices_list = [price.text.replace('$','') for price in prices if price.text != '']
        prices_list = [price.replace(',','') for price in prices_list]
        prices_list = list(map(float, prices_list))

        if not prices_list:
            prices = driver.find_elements_by_xpath(xp_prices_2)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))
        else:
            prices_list

        ##################
        ##################
        # Get links:        
        data = []
        elems = driver.find_elements_by_xpath(xp_urls)

        for elem in elems:
            data.append(elem.get_attribute("href"))

        if not data:
            elems = driver.find_elements_by_xpath(xp_urls_2)

            for elem in elems:
                data.append(elem.get_attribute("href"))
        else:
            data

        df_elem = pd.DataFrame(data, columns=['Links'])

        ##################
        ##################
        # Make df and append to list: 
        try:
            quickest_price = prices_list[0] if prices_list[0] else 'Not Available'
        except (IndexError, KeyError, ValueError):
            quickest_price = 'Not Available'
        
        try:
            cheapest_price = prices_list[1] if prices_list[1] else 'Not Available'
        except (IndexError, KeyError, ValueError):
            cheapest_price = 'Not Available'
            
        try: 
            quickest_link = df_elem['Links'][0] if df_elem['Links'][0] else 'Not Available'
        except (IndexError, KeyError, ValueError):
            quickest_link = 'Not Available'
        
        try:
            cheapest_link = df_elem['Links'][1] if df_elem['Links'][1] else 'Not Available'
        except (IndexError, KeyError, ValueError):
            cheapest_link = 'Not Available'

        new_df = pd.DataFrame({'kayak_search_url': [url],
                               'quickest_price': [quickest_price],
                               'cheapest_price': [cheapest_price],
                               'quickest_link': [quickest_link],
                               'cheapest_link': [cheapest_link]})

        driver.close()

        dfs.append(new_df)

    except IndexError:
        pass

total_df = pd.concat(dfs).reset_index(drop=True)

# # convert URL list to pandas series
# series = pd.Series(urls, name='kayak_search_url')

# # concatenate the series and dataframe
# df_scrape = pd.concat([series, total_df], axis=1)

# df_scrape

Function was run at: 2023-02-05 11:01:27
This scraper was exectuted on 2023-02-05 11:01:27 and is estimated to finalize scraping all data in 45 seconds.


In [158]:
total_df

Unnamed: 0,kayak_search_url,quickest_price,cheapest_price,quickest_link,cheapest_link
0,"https://www.kayak.com/flights/AMS-KRK,nearby/2...",673.0,Not Available,javascript:void(0),Not Available


In [125]:
df_scrape

Unnamed: 0,kayak_search_url,quickest_price,cheapest_price,quickest_link,cheapest_link
0,"https://www.kayak.com/flights/AMS-KRK,nearby/2...",1515.0,1515.0,https://www.kayak.com/book/flight?code=OIEiTUi...,https://www.kayak.com/book/flight?code=OIEiTUi...
1,"https://www.kayak.com/flights/AMS-KRK,nearby/2...",4794.0,4794.0,https://www.kayak.com/book/flight?code=OIHiByw...,https://www.kayak.com/book/flight?code=OIHiByw...
2,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
3,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
4,"https://www.kayak.com/flights/AMS-MIL,nearby/2...",,,,
5,"https://www.kayak.com/flights/AMS-MIL,nearby/2...",,,,


In [126]:
df_scrape['quickest_link'][0]

'https://www.kayak.com/book/flight?code=OIEiTUi4du.ZpbE3-UY8hoNUvH8T-y68A.151490.44e661bda91df310e316d009765e95ac&h=3a17573622b8&sub=M-13b32782db7&pageOrigin=F..RP.FE.M0'

## Scraping

### Comparing impact on runtime when using tqdm()
* There's a 45% savings in time when not using tqdm()
* Each iteration takes ~25 seconds to run without tqdm(), meaning a journey with 4 cities to visit (i.e., 24 total permutations) would take ~8 minutes to run

In [206]:
%%time

# Grabbing best & cheapeast flight info (price and link) for one iteration
xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
xp_prices_2 = """//div[contains(@class, 'price-text')]"""
xp_urls = """//div[@class='col col-best']//a[@href]"""
xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
executable_path = '/Users/junerodriguez/Downloads/chromedriver_mac_arm64/chromedriver'

dfs = []

for url in tqdm(urls[0:5]):
    try:
        requests = 0
        
        agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
        driver.implicitly_wait(10)
        driver.get(url)
        sleep(randint(8,10))

        ##################
        ##################
        # Get prices:
        prices = driver.find_elements_by_xpath(xp_prices)
        prices_list = [price.text.replace('$','') for price in prices if price.text != '']
        prices_list = [price.replace(',','') for price in prices_list]
        prices_list = list(map(float, prices_list))

        if not prices_list:
            prices = driver.find_elements_by_xpath(xp_prices_2)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))
        else:
            prices_list
                
        ##################
        ##################
        # Get links:        
        data = []
        elems = driver.find_elements_by_xpath(xp_urls)

        for elem in elems:
            data.append(elem.get_attribute("href"))

        if not data:
            elems = driver.find_elements_by_xpath(xp_urls_2)

            for elem in elems:
                data.append(elem.get_attribute("href"))
        else:
            data
            
        df_elem = pd.DataFrame(data, columns=['Links'])

        ##################
        ##################
        # Make df and append to list: 
        new_df = pd.DataFrame({'quickest_price': [prices_list[0]],
                               'cheapest_price': [prices_list[1]],
                               'quickest_link': [df_elem['Links'][0]],
                               'cheapest_link': [df_elem['Links'][1]]})

        driver.close()

        dfs.append(new_df)
    
    except IndexError:
        pass
        
total_df = pd.concat(dfs).reset_index(drop=True)

# convert URL list to pandas series
series = pd.Series(urls, name='kayak_search_url')

# concatenate the series and dataframe
result = pd.concat([series, total_df], axis=1)
result

100%|█████████████████████████████████████████████████████████████████████████| 5/5 [02:34<00:00, 30.95s/it]

CPU times: user 78.2 ms, sys: 93.5 ms, total: 172 ms
Wall time: 2min 34s





Unnamed: 0,kayak_search_url,quickest_price,cheapest_price,quickest_link,cheapest_link
0,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",817.0,410.0,https://www.kayak.com/book/flight?code=OMFiHCq...,https://www.kayak.com/book/flight?code=OMFiHCq...
1,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",515.0,395.0,https://www.kayak.com/book/flight?code=OMECYi5...,https://www.kayak.com/book/flight?code=OMECYi5...
2,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",731.0,545.0,https://www.kayak.com/book/flight?code=OMHCpE6...,https://www.kayak.com/book/flight?code=OMHCpE6...
3,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",835.0,585.0,https://www.kayak.com/book/flight?code=OMGibrL...,https://www.kayak.com/book/flight?code=OMGibrL...
4,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",710.0,535.0,https://www.kayak.com/book/flight?code=OMFCtiG...,https://www.kayak.com/book/flight?code=OMFCtiG...
5,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",,,,
6,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
7,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
8,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
9,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,


In [207]:
%%time

# Grabbing best & cheapeast flight info (price and link) for one iteration
xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
xp_prices_2 = """//div[contains(@class, 'price-text')]"""
xp_urls = """//div[@class='col col-best']//a[@href]"""
xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
executable_path = '/Users/junerodriguez/Downloads/chromedriver_mac_arm64/chromedriver'

dfs = []

for url in urls[0:5]:
    try:
        requests = 0
        
        agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=executable_path)
        driver.implicitly_wait(10)
        driver.get(url)
        sleep(randint(8,10))

        ##################
        ##################
        # Get prices:
        prices = driver.find_elements_by_xpath(xp_prices)
        prices_list = [price.text.replace('$','') for price in prices if price.text != '']
        prices_list = [price.replace(',','') for price in prices_list]
        prices_list = list(map(float, prices_list))

        if not prices_list:
            prices = driver.find_elements_by_xpath(xp_prices_2)
            prices_list = [price.text.replace('$','') for price in prices if price.text != '']
            prices_list = [price.replace(',','') for price in prices_list]
            prices_list = list(map(float, prices_list))
        else:
            prices_list
                
        ##################
        ##################
        # Get links:        
        data = []
        elems = driver.find_elements_by_xpath(xp_urls)

        for elem in elems:
            data.append(elem.get_attribute("href"))

        if not data:
            elems = driver.find_elements_by_xpath(xp_urls_2)

            for elem in elems:
                data.append(elem.get_attribute("href"))
        else:
            data
            
        df_elem = pd.DataFrame(data, columns=['Links'])

        ##################
        ##################
        # Make df and append to list: 
        new_df = pd.DataFrame({'quickest_price': [prices_list[0]],
                               'cheapest_price': [prices_list[1]],
                               'quickest_link': [df_elem['Links'][0]],
                               'cheapest_link': [df_elem['Links'][1]]})

        driver.close()

        dfs.append(new_df)
    
    except IndexError:
        pass
        
total_df = pd.concat(dfs).reset_index(drop=True)

# convert URL list to pandas series
series = pd.Series(urls, name='kayak_search_url')

# concatenate the series and dataframe
result = pd.concat([series, total_df], axis=1)
result

CPU times: user 64 ms, sys: 61.2 ms, total: 125 ms
Wall time: 1min 25s


Unnamed: 0,kayak_search_url,quickest_price,cheapest_price,quickest_link,cheapest_link
0,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",817.0,410.0,https://www.kayak.com/book/flight?code=OMEiNi9...,https://www.kayak.com/book/flight?code=OMEiNi9...
1,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",515.0,395.0,https://www.kayak.com/book/flight?code=OMGCTv-...,https://www.kayak.com/book/flight?code=OMGCTv-...
2,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",731.0,545.0,https://www.kayak.com/book/flight?code=OMEiTYe...,https://www.kayak.com/book/flight?code=OMEiTYe...
3,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",835.0,585.0,https://www.kayak.com/book/flight?code=OMFiUu-...,https://www.kayak.com/book/flight?code=OMFiUu-...
4,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",710.0,535.0,https://www.kayak.com/book/flight?code=OMFii6n...,https://www.kayak.com/book/flight?code=OMFii6n...
5,"https://www.kayak.com/flights/AMS-WMI,nearby/2...",,,,
6,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
7,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
8,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,
9,"https://www.kayak.com/flights/AMS-SOF,nearby/2...",,,,


In [178]:
# Grabbing best & cheapeast flight info (price and link) for one iteration
xp_prices = """//div[@class='above-button']//a[contains(@class,'booking-link')]/span[@class='price option-text']"""
xp_prices_2 = """//div[contains(@class, 'price-text')]"""
xp_urls = """//div[@class='col col-best']//a[@href]"""
xp_urls_2 = """//div[contains(@class, 'main-btn-wrap')]//a[@href]"""
executable_path = '/Users/junerodriguez/Downloads/chromedriver_mac_arm64/chromedriver'

dfs = []

for url in urls[10:15]:
    try:
        requests = 0
        
        chrome_options = webdriver.ChromeOptions()
        agents = ["Firefox/66.0.3","Chrome/73.0.3683.68","Edge/16.16299"]
        print("User agent: " + agents[(requests%len(agents))])
        chrome_options.add_argument('--user-agent=' + agents[(requests%len(agents))] + '"')    
        chrome_options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(executable_path)
        driver.implicitly_wait(10)
        driver.get(url)
        sleep(randint(8,10))

        # Get links:
        data = []
        elems = driver.find_elements_by_xpath(xp_urls)

        for elem in elems:
            data.append(elem.get_attribute("href"))

        if not data:
            elems = driver.find_elements_by_xpath(xp_urls_2)

            for elem in elems:
                data.append(elem.get_attribute("href"))
        else:
            data
            
        df_elem = pd.DataFrame(data, columns=['Links'])

        new_df = pd.DataFrame({'quickest_link': [df_elem['Links'][0]],
                               'cheapest_link': [df_elem['Links'][1]]})

        driver.close()

        dfs.append(new_df)
    
    except IndexError:
        pass
        
total_df = pd.concat(dfs)
total_df

User agent: Firefox/66.0.3
User agent: Firefox/66.0.3
User agent: Firefox/66.0.3
User agent: Firefox/66.0.3
User agent: Firefox/66.0.3


Unnamed: 0,quickest_link,cheapest_link
0,https://www.kayak.com/book/flight?code=OKEiZSP...,https://www.kayak.com/book/flight?code=OKEiZSP...
0,https://www.kayak.com/book/flight?code=OKGCr_B...,https://www.kayak.com/book/flight?code=OKGCr_B...
0,https://www.kayak.com/book/flight?code=OLHCK6S...,https://www.kayak.com/book/flight?code=OLHCK6S...
0,https://www.kayak.com/book/flight?code=OLECGkX...,https://www.kayak.com/book/flight?code=OLECGkX...
0,https://www.kayak.com/book/flight?code=OLGCB5U...,https://www.kayak.com/book/flight?code=OLGCB5U...


## Scrape PDF data

In [54]:
!pip install tabula-py

Collecting tabula-py
  Downloading tabula_py-2.6.0-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting distro
  Downloading distro-1.8.0-py3-none-any.whl (20 kB)
Collecting pytz>=2020.1
  Downloading pytz-2022.7.1-py2.py3-none-any.whl (499 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.4/499.4 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytz, distro, tabula-py
  Attempting uninstall: pytz
    Found existing installation: pytz 2016.10
    Uninstalling pytz-2016.10:
      Successfully uninstalled pytz-2016.10
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
conda-repo-cli 1.0.20 requires clyent==1.2.1, but you have clyent 1.2.2 which is incompatible.
conda-repo-cli 1.0.20 requires

In [61]:
import pandas as pd
import tabula

from tabula.io import read_pdf


# Read the PDF into a DataFrame using tabula
df_iata = tabula.io.read_pdf('IATA_codes.pdf', multiple_tables=True, pages='all')

# If the PDF has multiple tables, you can extract each table into a separate DataFrame
df_list = [table for table in df_iata]

# You can then access each DataFrame by its index in the list, for example:
df1 = df_list[0]

# Finally, you can save the DataFrame as a CSV file for future use:
df1
# df1.to_csv("table1.csv", index=False)

Got stderr: Jan 31, 2023 7:26:53 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:53 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:53 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:54 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:54 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:55 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:26:55 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode



Unnamed: 0,C iudad / Location,C ódigo ciudad /\rCity code,A eropuerto / Airport
0,Aalborg,AAL,AAL
1,Aarhus,AAR,AAR
2,Abadan,ABD,ABD
3,Abakan,ABA,ABA
4,Aberdeen,ABR,ABR
5,Aberdeen,ABZ,ABZ
6,Abha,AHB,AHB
7,Abu Dhabi,AUH,AUH
8,Abidjan,ABJ,ABJ
9,Abilene,ABI,ABI


In [68]:
import pandas as pd
from tabula.io import read_pdf


# Read all tables in the PDF file into a list of DataFrames
df_list = tabula.io.read_pdf('IATA_codes.pdf', multiple_tables=True, pages='all')

# Concatenate all DataFrames in the list into a single DataFrame
df_iata = pd.concat(df_list, axis=0, ignore_index=True)

# You can now use the resulting DataFrame for further processing
# e.g. to inspect the first 5 rows:
df_iata

# To save the DataFrame as a CSV file:
# df.to_csv("all_tables.csv", index=False)

Got stderr: Jan 31, 2023 7:39:16 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:16 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:16 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:17 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:18 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:18 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode
Jan 31, 2023 7:39:19 PM org.apache.pdfbox.pdmodel.font.PDSimpleFont toUnicode



Unnamed: 0.1,C iudad / Location,C ódigo ciudad /\rCity code,A eropuerto / Airport,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8
0,Aalborg,AAL,AAL,,,,,,,,,
1,Aarhus,AAR,AAR,,,,,,,,,
2,Abadan,ABD,ABD,,,,,,,,,
3,Abakan,ABA,ABA,,,,,,,,,
4,Aberdeen,ABR,ABR,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
1302,Yuzhno -Sakhalinsk,UUS,UUS,,,,,,,,,
1303,Zhanjiang,ZHA,ZHA,,,,,,,,,
1304,Zhengzhou,CGO,CGO,,,,,,,,,
1305,Zihuatanejo,ZIH,ZIH,,,,,,,,,
