In [10]:
import pandas as pd

# Set the options to display all the columns
pd.set_option('display.max_colwidth', None)

## Collect each country's URL using Beautifulsoup

### Request and Parse html

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage to scrape
url = "https://www.myrealtrip.com/regions"

# Send a GET request to the webpage
response = requests.get(url)

# Check if the page is loaded successfully
if response.status_code == 200:
    print("Page loaded successfully!")
else:
    print(f"Page loading failed! Status code: {response.status_code}")

# Parse the HTML content of the page using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

Page loaded successfully!


### Create dataframe containing country and each page's link

In [None]:
# Create an empty list to store the scraped data
data = []

# Find all continents (regions) in the HTML by searching for the div with class 'region-name'
continents = soup.find_all('div', class_='region-name')

# Loop through each continent to extract its information
for continent in continents:
    # Get the continent name
    continent_name = continent.get_text(strip=True)
    
    # Find the region list associated with the current continent
    region_container = continent.find_next('ul', class_='region-list')
    
    # Find all subregions (countries) in the region container
    subregions = region_container.find_all('li', class_='region-container')

    # Loop through each subregion to extract subregion name and country information
    for subregion in subregions:
        # Extract the subregion name
        subregion_name = subregion.find('div', class_='subregion-name').get_text(strip=True)
        
        # Find all countries listed in the current subregion
        countries = subregion.find_all('a', class_='list-heading', href=True)
        
        # Loop through each country to extract country name and link
        for country in countries:
            # Extract the country name
            country_name = country.find('div', class_='country-name').get_text(strip=True)
            
            # Construct the full URL of the country page
            country_link = "https://www.myrealtrip.com" + country['href']
            
            # Add the continent name, subregion name, country name, and link to the data list
            data.append([continent_name, subregion_name, country_name, country_link])
    
# Convert the list of data into a pandas DataFrame
countries_link = pd.DataFrame(data, columns=['Continent', 'Subregion', 'Country', 'Link'])

# Display the DataFrame to verify the data
print(countries_link.head())

# Save the DataFrame as a CSV file
# countries_link.to_csv('countries_link.csv', index=False)

  Continent Subregion Country  \
0        유럽       서유럽    네덜란드   
1        유럽       서유럽      독일   
2        유럽       서유럽   룩셈부르크   
3        유럽       서유럽     모나코   
4        유럽       서유럽     벨기에   

                                                Link  
0  https://www.myrealtrip.com/countries?key_name=...  
1  https://www.myrealtrip.com/countries?key_name=...  
2  https://www.myrealtrip.com/countries?key_name=...  
3  https://www.myrealtrip.com/countries?key_name=...  
4  https://www.myrealtrip.com/countries?key_name=...  


## Collect each city's URL using Selenium

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# CSV 파일 로드
csv_file = "countries_link.csv"
countries_link = pd.read_csv(csv_file)

# Set web driver
options = Options()
options.add_argument("--headless")  # 창 없이 실행
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 5) # Wait until the action happens

data = []

for i in range(countries_link.shape[0]):
    # Open up the page
    url = countries_link['Link'][i]
    country = countries_link['Country'][i]
    driver.get(url)

    # Find the 'select cities' button and click it
    city_select_button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[contains(@class, "Button-module__primary--loIc3")]')))
    driver.execute_script("arguments[0].click();", city_select_button)

    # Find the city list container (without clicking)
    city_list = city_select_button.find_element(By.XPATH, "following-sibling::div//ul")

    # Extract all <a> elements (cities) inside the <ul>
    cities = city_list.find_elements(By.CSS_SELECTOR, "a.LocationCityList-module__link--MEsOD")

    # Extract city names and links
    for city in cities:
        city_name = driver.execute_script("return arguments[0].textContent.trim();", city)
        city_link = city.get_attribute("href")
        data.append([country, city_name, city_link])

driver.quit()

# Convert to DataFrame
cities_link = pd.DataFrame(data, columns=['Country', 'City', 'Link'])

# Display the DataFrame to verify the data
print(cities_link)

# Save the DataFrame as a CSV file
# cities_link.to_csv('cities_link.csv', index=False)

    Country        City                                                             Link
0      네덜란드       암스테르담             https://www.myrealtrip.com/cities?key_name=Amsterdam
1      네덜란드       잔세스칸스       https://www.myrealtrip.com/cities?key_name=Zaanse%20Schans
2      네덜란드  네덜란드 기타 도시  https://www.myrealtrip.com/cities?key_name=Netherlands%20Others
3      네덜란드        로테르담             https://www.myrealtrip.com/cities?key_name=Rotterdam
4      네덜란드          리세                 https://www.myrealtrip.com/cities?key_name=Lisse
..      ...         ...                                                              ...
709     모로코         탕헤르               https://www.myrealtrip.com/cities?key_name=Tangier
710     이집트         카이로                 https://www.myrealtrip.com/cities?key_name=Cairo
711     이집트        후르가다              https://www.myrealtrip.com/cities?key_name=Hurghada
712     이집트         룩소르                 https://www.myrealtrip.com/cities?key_name=Luxor
713     이집트         아

## Collect categories from each city

In [48]:
# CSV 파일 로드
csv_file = "cities_link.csv"
cities_link = pd.read_csv(csv_file)

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20)

# data = []

for i in range(cities_link.shape[0]):
    if i >= 705:
        url = cities_link['Link'][i]
        country = cities_link['Country'][i]
        city = cities_link['City'][i]
        print(f"------------ {country}/{city} ------------")

        driver.get(url)

        try:
            button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[contains(@class, "jM_8U")]//span[text()="전체"]')))
            button.click()

        except Exception as e:
            print(f"⚠️ {category_name} 클릭 실패:", e)
            driver.save_screenshot(f"{country}_{city}.png")

        try:
            categories = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[contains(@class, "4a2yve")]')))

        except Exception as e:
            print(f"⚠️ {category_name} 클릭 실패:", e)
            driver.save_screenshot(f"{country}_{city}.png")

        for category in categories:
            category_name = category.text.strip()
            print("❓Checking:", category_name)

            if category_name in ['록키마운틴', '그랜드캐년', '시내투어', '근교투어', '액티비티', '미술관·박물관투어', '클래스', '체험', '체험·클래스', '투어']:
                print("✅Clicking:", category_name)

                try:
                    driver.execute_script("arguments[0].scrollIntoView();", category)
                    driver.execute_script("arguments[0].click();", category)
                    all_button = wait.until(EC.element_to_be_clickable((By.XPATH, f'//button[text()="전체보기" and ancestor::ul[preceding-sibling::li/button/div[text()="{category_name}"]]]')))
                    driver.execute_script("arguments[0].click();", all_button)

                    link = driver.current_url
                    data.append([country, city, category_name, link])

                except Exception as e:
                    print(f"⚠️ {category_name} 클릭 실패:", e)
                    driver.save_screenshot(f"{country}_{city}_{category_name}.png")
    

tour_link = pd.DataFrame(data, columns=['Country', 'City', 'Tour Category', 'Link'])

# Save the DataFrame as a CSV file
tour_link.to_csv('tour_link.csv', index=False)

------------ 남아프리카 공화국/케이프타운 ------------
❓Checking: 전체
❓Checking: 투어
✅Clicking: 투어
❓Checking: 여행편의/대여
❓Checking: 유심·와이파이
❓Checking: 액티비티
✅Clicking: 액티비티
❓Checking: 이동/교통편의
❓Checking: 입장권
------------ 남아프리카 공화국/남아프리카 공화국 기타 도시 ------------
❓Checking: 전체
❓Checking: 액티비티
✅Clicking: 액티비티
------------ 남아프리카 공화국/요하네스버그 ------------
❓Checking: 전체
❓Checking: 투어
✅Clicking: 투어
❓Checking: 액티비티
✅Clicking: 액티비티
❓Checking: 여행편의/대여
❓Checking: 이동/교통편의
------------ 모로코/마라케시 ------------
❓Checking: 전체
❓Checking: 투어
✅Clicking: 투어
❓Checking: 입장권
------------ 모로코/탕헤르 ------------
❓Checking: 전체
❓Checking: 근교투어
✅Clicking: 근교투어
------------ 이집트/카이로 ------------
❓Checking: 전체
❓Checking: 여행용품
❓Checking: 근교투어
✅Clicking: 근교투어
❓Checking: 시내투어
✅Clicking: 시내투어
❓Checking: 액티비티
✅Clicking: 액티비티
❓Checking: 티켓·입장권
❓Checking: 체험·클래스
✅Clicking: 체험·클래스
❓Checking: 이동·교통
❓Checking: 미술관·박물관투어
✅Clicking: 미술관·박물관투어
❓Checking: 투어
✅Clicking: 투어
❓Checking: 유심·와이파이
❓Checking: 여행편의
❓Checking: 여행편의/대여
❓Checking: 미식
------------ 이집트/후

* 다시 해야할 것

베트남/다낭

인도네시아/발리

태국/치앙마이

스크린샷에 찍힌 곳들


In [None]:
# CSV 파일 로드
csv_file = "cities_link.csv"
cities_link = pd.read_csv(csv_file)

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
wait = WebDriverWait(driver, 20)

data = []

for i in range(cities_link.shape[0]):
    if i >= 705:
        url = cities_link['Link'][i]
        country = cities_link['Country'][i]
        city = cities_link['City'][i]
        print(f"------------ {country}/{city} ------------")

        driver.get(url)

        try:
            button = wait.until(EC.element_to_be_clickable(
                (By.XPATH, '//*[contains(@class, "jM_8U")]//span[text()="전체"]')))
            button.click()

        except Exception as e:
            print(f"⚠️ {category_name} 클릭 실패:", e)
            driver.save_screenshot(f"{country}_{city}.png")

        try:
            categories = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[contains(@class, "4a2yve")]')))

        except Exception as e:
            print(f"⚠️ {category_name} 클릭 실패:", e)
            driver.save_screenshot(f"{country}_{city}.png")

        for category in categories:
            category_name = category.text.strip()
            print("❓Checking:", category_name)

            if category_name in ['록키마운틴', '그랜드캐년', '시내투어', '근교투어', '액티비티', '미술관·박물관투어', '클래스', '체험', '체험·클래스', '투어']:
                print("✅Clicking:", category_name)

                try:
                    driver.execute_script("arguments[0].scrollIntoView();", category)
                    driver.execute_script("arguments[0].click();", category)
                    all_button = wait.until(EC.element_to_be_clickable((By.XPATH, f'//button[text()="전체보기" and ancestor::ul[preceding-sibling::li/button/div[text()="{category_name}"]]]')))
                    driver.execute_script("arguments[0].click();", all_button)

                    link = driver.current_url
                    data.append([country, city, category_name, link])

                except Exception as e:
                    print(f"⚠️ {category_name} 클릭 실패:", e)
                    driver.save_screenshot(f"{country}_{city}_{category_name}.png")

In [47]:
cities_link[cities_link['City']=='가봉 기타 도시']

Unnamed: 0,Country,City,Link
704,가봉,가봉 기타 도시,https://www.myrealtrip.com/cities?key_name=Gabon%20Others


## Collect the tour information in each city

In [49]:
import time

# CSV 파일 로드
csv_file = "tour_link.csv"
df = pd.read_csv(csv_file)

# 결과 저장을 위한 리스트
tours = []

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
}

# 한 페이지에 포함될 최대 상품 개수
PER_PAGE = 24

# 각 행을 순회하며 데이터 수집
for _, row in df.iterrows():
    country = row["Country"]
    city = row["City"]
    category = row["Tour Category"]
    base_url = row["Link"]

    # API 요청 링크 생성
    api_url = base_url.replace("https://www.myrealtrip.com/offers", 
                               "https://api3.myrealtrip.com/search/ticket/v2/web") + f"&per={PER_PAGE}"

    page = 1
    while True:
        url = f"{api_url}&page={page}"
        print(f"Fetching: {country} / {city} / {category} ] ({url})")  # 진행상황 확인

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            print(f"Failed to fetch data from {url}")
            break

        data = response.json()

        # 데이터가 없으면 종료
        if "offers" not in data['data'] or not data['data']["offers"]:
            break

        # 필요한 정보 추출
        for offers in data['data']["offers"]:
            tours.append({
                "Country": country,
                "City": city,
                "Category": category,
                "Product ID": offers.get("productId"),
                "Product Type": offers.get("productType"),
                "Title": offers.get("title"),
                "Review Count": offers.get("reviewCount", 0),
                "Review Score": offers.get("reviewScore", 0.0),
                "Original Price": offers.get("originalPrice"),
                "Sale Price": offers.get("salePrice"),
                "Currency": offers.get("currency"),
                "Status": offers.get("status"),
                "Link": offers.get("linkUrl")
            })

        page += 1  # 다음 페이지로 이동
        time.sleep(0.5)  # 서버 부하 방지를 위해 1초 대기

# 결과를 DataFrame으로 변환 후 CSV 저장
myrealtrip_products = pd.DataFrame(tours)
myrealtrip_products.to_csv("myrealtrip_products.csv", index=False)

print("Data collection complete. Saved to myrealtrip_products.csv.")

Fetching: 네덜란드 / 암스테르담 / 시내투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=sinae_tour&per=24&page=1)
Fetching: 네덜란드 / 암스테르담 / 시내투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=sinae_tour&per=24&page=2)
Fetching: 네덜란드 / 암스테르담 / 시내투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=sinae_tour&per=24&page=3)
Fetching: 네덜란드 / 암스테르담 / 시내투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=sinae_tour&per=24&page=4)
Fetching: 네덜란드 / 암스테르담 / 근교투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=suburb_tour&per=24&page=1)
Fetching: 네덜란드 / 암스테르담 / 근교투어 ] (https://api3.myrealtrip.com/search/ticket/v2/web?t=llp&qct=Amsterdam&qcr=Netherlands&ext_categories=suburb_tour&per=24&page=2)
Fetching: 네덜란드 / 암스테르담 / 근교투어 ] (https://api