## Collect each country's URL using Beautifulsoup

### Request and Parse html

In [93]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = "https://www.myrealtrip.com/regions"

# Send a GET request to the webpage
response = requests.get(url)

# Check if the page is loaded successfully
if response.status_code == 200:
    print("Page loaded successfully!")
else:
    print(f"Page loading failed! Status code: {response.status_code}")

# Parse the HTML content of the page using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

Page loaded successfully!


### Create dataframe containing country and each page's link

In [96]:
import pandas as pd

# Create an empty list to store the scraped data
data = []

# Find all continents (regions) in the HTML by searching for the div with class 'region-name'
continents = soup.find_all('div', class_='region-name')

# Loop through each continent to extract its information
for continent in continents:
    # Get the continent name
    continent_name = continent.get_text(strip=True)
    
    # Find the region list associated with the current continent
    region_container = continent.find_next('ul', class_='region-list')
    
    # Find all subregions (countries) in the region container
    subregions = region_container.find_all('li', class_='region-container')

    # Loop through each subregion to extract subregion name and country information
    for subregion in subregions:
        # Extract the subregion name
        subregion_name = subregion.find('div', class_='subregion-name').get_text(strip=True)
        
        # Find all countries listed in the current subregion
        countries = subregion.find_all('a', class_='list-heading', href=True)
        
        # Loop through each country to extract country name and link
        for country in countries:
            # Extract the country name
            country_name = country.find('div', class_='country-name').get_text(strip=True)
            
            # Construct the full URL of the country page
            country_link = "https://www.myrealtrip.com" + country['href']
            
            # Add the continent name, subregion name, country name, and link to the data list
            data.append([continent_name, subregion_name, country_name, country_link])
    
# Convert the list of data into a pandas DataFrame
countries_link = pd.DataFrame(data, columns=['Continent', 'Subregion', 'Country', 'Link'])

# Display the DataFrame to verify the data
print(countries_link.head())

# Save the DataFrame as a CSV file
countries_link.to_csv('countries_link.csv', index=False)

  Continent Subregion Country                                                       Link
0        유럽       서유럽    네덜란드  https://www.myrealtrip.com/countries?key_name=Netherlands
1        유럽       서유럽      독일      https://www.myrealtrip.com/countries?key_name=Germany
2        유럽       서유럽   룩셈부르크   https://www.myrealtrip.com/countries?key_name=Luxembourg
3        유럽       서유럽     모나코       https://www.myrealtrip.com/countries?key_name=Monaco
4        유럽       서유럽     벨기에      https://www.myrealtrip.com/countries?key_name=Belgium


## Collect each city's URL using Selenium

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [137]:
# Set web driver
options = Options()
options.add_argument("--headless")  # 창 없이 실행
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 5) # Wait until the action happens

data = []

for i in range(countries_link.shape[0]):
    # Open up the page
    url = countries_link['Link'][i]
    country = countries_link['Country'][i]
    driver.get(url)

    # Find the 'select cities' button and click it
    city_select_button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[contains(@class, "Button-module__primary--loIc3")]')))
    driver.execute_script("arguments[0].click();", city_select_button)

    # Find the city list container (without clicking)
    city_list = city_select_button.find_element(By.XPATH, "following-sibling::div//ul")

    # Extract all <a> elements (cities) inside the <ul>
    cities = city_list.find_elements(By.CSS_SELECTOR, "a.LocationCityList-module__link--MEsOD")

    # Extract city names and links
    for city in cities:
        city_name = driver.execute_script("return arguments[0].textContent.trim();", city)
        city_link = city.get_attribute("href")
        data.append([country, city_name, city_link])

driver.quit()

# Convert to DataFrame
cities_link = pd.DataFrame(data, columns=['Country', 'City', 'Link'])

# Display the DataFrame to verify the data
print(cities_link)

# Save the DataFrame as a CSV file
cities_link.to_csv('cities_link.csv', index=False)

    Country        City                                                             Link
0      네덜란드       암스테르담             https://www.myrealtrip.com/cities?key_name=Amsterdam
1      네덜란드       잔세스칸스       https://www.myrealtrip.com/cities?key_name=Zaanse%20Schans
2      네덜란드  네덜란드 기타 도시  https://www.myrealtrip.com/cities?key_name=Netherlands%20Others
3      네덜란드        로테르담             https://www.myrealtrip.com/cities?key_name=Rotterdam
4      네덜란드          리세                 https://www.myrealtrip.com/cities?key_name=Lisse
..      ...         ...                                                              ...
709     모로코         탕헤르               https://www.myrealtrip.com/cities?key_name=Tangier
710     이집트         카이로                 https://www.myrealtrip.com/cities?key_name=Cairo
711     이집트        후르가다              https://www.myrealtrip.com/cities?key_name=Hurghada
712     이집트         룩소르                 https://www.myrealtrip.com/cities?key_name=Luxor
713     이집트         아

## Collect products from each city

In [182]:
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 15)

data = []

for i in range(cities_link.shape[0]):
    if i % 40 == 0:
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        wait = WebDriverWait(driver, 15)

    url = cities_link['Link'][i]
    country = cities_link['Country'][i]
    city = cities_link['City'][i]
    print(f"------------ {country}/{city} ------------")

    driver.get(url)

    button = wait.until(EC.element_to_be_clickable(
        (By.XPATH, '//*[contains(@class, "jM_8U")]//span[text()="전체"]')))
    button.click()

    categories = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[contains(@class, "4a2yve")]')))

    for category in categories:
        category_name = category.text.strip()
        print("Checking:", category_name)

        if category_name in ['록키마운틴', '그랜드캐년', '시내투어', '근교투어', '액티비티', '미술관·박물관투어', '클래스', '체험', '체험·클래스', '투어']:
            print("Clicking:", category_name)
            driver.execute_script("arguments[0].scrollIntoView();", category)
            driver.execute_script("arguments[0].click();", category)
            all_button = wait.until(EC.element_to_be_clickable((By.XPATH, f'//button[text()="전체보기" and ancestor::ul[preceding-sibling::li/button/div[text()="{category_name}"]]]')))
            driver.execute_script("arguments[0].click();", all_button)

            link = driver.current_url
            data.append([country, city, category_name, link])
    
    if i % 40 == 0:
        driver.quit()
    

tour_link = pd.DataFrame(data, columns=['Country', 'City', 'Tour Category', 'Link'])

# Save the DataFrame as a CSV file
tour_link.to_csv('tour_link.csv', index=False)

------------ 네덜란드/암스테르담 ------------
Checking: 전체
Checking: 티켓·입장권
Checking: 시내투어
Clicking: 시내투어
Checking: 근교투어
Clicking: 근교투어
Checking: 이동·교통
Checking: 미술관·박물관투어
Clicking: 미술관·박물관투어
Checking: 체험·클래스
Clicking: 체험·클래스
Checking: 여행편의
Checking: 유심·와이파이
Checking: 스냅촬영
Checking: 미식
Checking: 투어
Clicking: 투어
------------ 네덜란드/잔세스칸스 ------------


MaxRetryError: HTTPConnectionPool(host='localhost', port=56031): Max retries exceeded with url: /session/d5f79a1bdb0b846c95a871d2f9c82f08/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000228213DDE80>: Failed to establish a new connection: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다'))

In [179]:
# Set web driver
options = Options()
options.add_argument("--headless")  # 창 없이 실행
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 15) # Wait until the action happens

data = []

for i in range(cities_link.shape[0]):
    if i == 47:
        url = cities_link['Link'][i]
        country = cities_link['Country'][i]
        city = cities_link['City'][i]
        print(f"------------ {country}/{city} ------------")
        driver.get(url)

        button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//*[contains(@class, "jM_8U")]//span[text()="전체"]')))
        button.click()

        categories = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[contains(@class, "4a2yve")]')))

        for category in categories:
            category_name = category.text.strip()
            print("Checking:", category_name)

            if category_name in ['록키마운틴', '그랜드캐년', '시내투어', '근교투어', '액티비티', '미술관·박물관투어', '클래스', '체험', '체험·클래스', '투어']:
                print("Clicking:", category_name)
                driver.execute_script("arguments[0].scrollIntoView();", category)
                driver.execute_script("arguments[0].click();", category)
                all_button = wait.until(EC.element_to_be_clickable((By.XPATH, f'//button[text()="전체보기" and ancestor::ul[preceding-sibling::li/button/div[text()="{category_name}"]]]')))
                driver.execute_script("arguments[0].click();", all_button)

                link = driver.current_url
                data.append([country, city, category_name, link])

tour_link = pd.DataFrame(data, columns=['Country', 'City', 'Tour Category', 'Link'])

# Save the DataFrame as a CSV file
tour_link.to_csv('tour_link.csv', index=False)

tour_link

------------ 영국/런던 ------------
Checking: 전체
Checking: 시내투어
Clicking: 시내투어
Checking: 근교투어
Clicking: 근교투어
Checking: 티켓·입장권
Checking: 미술관·박물관투어
Clicking: 미술관·박물관투어
Checking: 여행용품
Checking: 이동·교통
Checking: 스냅촬영
Checking: 체험·클래스
Clicking: 체험·클래스
Checking: 여행편의
Checking: 유심·와이파이
Checking: 미식
Checking: 여행편의/대여
Checking: 키즈
Checking: 투어
Clicking: 투어
Checking: 스파·마사지
Checking: 입장권


Unnamed: 0,Country,City,Tour Category,Link
0,영국,런던,시내투어,https://www.myrealtrip.com/offers?t=llp&qct=London&qcr=United%20Kingdom&ext_categories=sinae_tour
1,영국,런던,근교투어,https://www.myrealtrip.com/offers?t=llp&qct=London&qcr=United%20Kingdom&ext_categories=suburb_tour
2,영국,런던,미술관·박물관투어,https://www.myrealtrip.com/offers?t=llp&qct=London&qcr=United%20Kingdom&ext_categories=artmuseum_tour
3,영국,런던,체험·클래스,https://www.myrealtrip.com/offers?t=llp&qct=London&qcr=United%20Kingdom&ext_categories=activity_class
4,영국,런던,투어,https://www.myrealtrip.com/offers?t=llp&qct=London&qcr=United%20Kingdom&ext_categories=tour
