In [None]:
!apt-get update
!pip install pandas
!pip install beautifulsoup4
!pip install selenium
!pip install selenium-stealth
!pip install webdriver-manager

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected to cloud.r-                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 15.6 kB/128 kB 12%] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected                                                                                                     Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
0% [2 InRelease 79.3 kB/128 kB 62%] [Connecting to security.ubuntu.com (185.125.190.82)] [3 InReleas0% [2 InRelease 79.3 kB/128 kB 62%] [Connecting to security.ubuntu.com (185.125.190.82)] [Connected 0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                   

In [28]:
import time

import pandas as pd

from bs4 import BeautifulSoup

from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium_stealth import stealth

### Notes

- Keep the Chrome browser in full-screen mode; otherwise the page-number buttons will not appear.
- You have to always start from page 1 and move to next pages incrementally.
- Many fields are missing, so always check whether a specific tag exists before extracting text from it.
- Sometimes there are buttons with page numbers, and sometimes there is only a next-page button, so we have to check both. Here, I first tried the page-number button and then fell back to the next-page button if the page number was not available.
- Some Hotel Cards are advertisements and they have no photo of the hotel. So, we have to ignore those cards.

In [83]:
def _parse_review_summary(text):
    """Split a review summary such as 'Very good(4249)' into (category, count).

    Args:
      text (str): Text of the rating description element.
    Returns:
      (tuple[str, str]): The category and the review count; 'NA' is used for
      whichever part is absent.
    """
    category, paren, remainder = text.partition("(")
    category = category.strip() or 'NA'
    if not paren:
        # No "(count)" part at all: keep the whole text as the category instead
        # of slicing with a -1 index, which would chop off the last character.
        return category, 'NA'
    count = remainder.strip().rstrip(")").strip() or 'NA'
    return category, count


def _parse_hotel_card(hotel):
    """Extract the fields of one hotel card.

    Args:
      hotel (bs4.element.Tag): The result card element.
    Returns:
      (dict | None): The hotel record, or None when the card has no
      'e9fk-photo' image with a src (treated as an advertisement).
    """
    # Cards without a hotel photo are advertisements and are skipped.
    image_src = next(
        (img.get('src') for img in hotel.find_all('img', class_='e9fk-photo') if img.get('src')),
        None,
    )
    if image_src is None:
        return None

    # Hotel Name
    name_tag = hotel.find('a', class_='FLpo-big-name')
    hotel_name = name_tag.get_text(strip=True) if name_tag else 'NA'

    # Hotel Rating
    # NOTE(review): this matches only the exact class string containing
    # 'wdjx-positive'; ratings rendered with another modifier class would be
    # reported as 'NA' - confirm against the live page markup.
    rating_tag = hotel.find('div', class_='wdjx wdjx-positive wdjx-mod-rating-condensed')
    rating = rating_tag.get_text(strip=True) if rating_tag else 'NA'

    # Hotel Review Counts and Category
    review_tag = hotel.find('div', class_='xdhG-rating-description-and-count')
    if review_tag:
        review_category, review_count = _parse_review_summary(review_tag.get_text(strip=True))
    else:
        review_category, review_count = 'NA', 'NA'

    # Price: the first character is dropped (presumably a one-character currency symbol).
    price_tag = hotel.find('div', {'data-target': 'price'})
    price = price_tag.get_text().strip()[1:] if price_tag else 'NA'

    # Distance: keep only the first space-separated token of the landmark text.
    distance_tag = hotel.find('div', class_='upS4-landmark-text')
    distance = distance_tag.get_text().strip().split(" ")[0] if distance_tag else 'NA'

    # Stars: first character of the star label; slicing avoids an IndexError on empty text.
    star_tag = hotel.find('span', class_='Ius0')
    stars = (star_tag.get_text(strip=True)[:1] or 'NA') if star_tag else 'NA'

    return {
        'Hotel Name': hotel_name,
        'Hotel Rating': rating,
        'Review Count': review_count,
        'Review Category': review_category,
        'Price': price,
        'Stars': stars,
        'Distance': distance,
        'Images': image_src,
    }


def scrape_hotels(base_url, pages=100, output_csv='hotels.csv', page_load_wait=10):
    """Function to scrape hotel data from multiple pages

    Starts at ``base_url`` and advances with the "Next page" button. Scraping
    stops early when that button cannot be clicked, so the last page is not
    scraped again for every remaining iteration.

    Args:
      base_url (str): URL to the first page of the website to be scraped.
      pages (int): Maximum number of pages to be scraped.
      output_csv (str): Path of the CSV file that is rewritten after every
        page as a backup of the data collected so far.
      page_load_wait (float): Seconds to sleep before parsing each page.
    Returns:
      (pandas.DataFrame): A DataFrame containing all the data (empty when
      nothing was scraped).
    """
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service)

    # List to store hotel data across all pages
    all_hotels_data = []
    next_label = "Next page"

    try:
        stealth(
            driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )
        driver.get(base_url)

        for page in range(1, pages + 1):
            # After multiple runs, 10 seconds were found to be the suitable wait time to let the page load
            time.sleep(page_load_wait)
            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Each 'S0Ps-resultInner' div is one hotel card
            for card in soup.find_all('div', class_='S0Ps-resultInner'):
                info = _parse_hotel_card(card)
                if info is not None:
                    all_hotels_data.append(info)

            # Saves the current data after each page to keep backups
            pd.DataFrame(all_hotels_data).to_csv(output_csv, index=False)
            print(f"Saved Page No. {page}")

            if page == pages:
                # Requested number of pages reached; no need to navigate further.
                break

            # Clicks on the next page button
            try:
                next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, f"//button[@aria-label='{next_label}']"))
                )
                next_button.click()
            except Exception as e:
                # Without a usable next-page button the same page would be
                # scraped again on every remaining iteration, so stop here.
                print(f"Could not click on {next_label}: {e}")
                break
    finally:
        # Close the Selenium WebDriver even if scraping failed part-way
        driver.quit()

    # Return the DataFrame
    return pd.DataFrame(all_hotels_data)

# Kayak search for Dubai hotels (1 adult, 2024-11-01 to 2024-11-02), sorted by
# rank and restricted by the location filter encoded in the `fs` query parameter.
base_url = (
    'https://www.kayak.com/hotels/Dubai,Dubai,United-Arab-Emirates-c6080'
    '/2024-11-01/2024-11-02/1adults;map'
    '?sort=rank_a'
    '&fs=location=-1mi__Dubai%20(city%20centre)__25.19649297861462,55.27565177734368__restrict'
)
# Scrape at most 70 result pages, starting from the first one.
hotel_data_df = scrape_hotels(base_url, pages=70)

Saved Page No. 1
Saved Page No. 2
Saved Page No. 3
Saved Page No. 4
Saved Page No. 5
Saved Page No. 6
Saved Page No. 7
Saved Page No. 8
Saved Page No. 9
Saved Page No. 10
Saved Page No. 11
Saved Page No. 12
Saved Page No. 13
Saved Page No. 14
Saved Page No. 15
Saved Page No. 16
Saved Page No. 17
Saved Page No. 18
Saved Page No. 19
Saved Page No. 20
Saved Page No. 21
Saved Page No. 22
Saved Page No. 23
Saved Page No. 24
Saved Page No. 25
Saved Page No. 26
Saved Page No. 27
Saved Page No. 28
Saved Page No. 29
Saved Page No. 30
Saved Page No. 31
Saved Page No. 32
Saved Page No. 33
Saved Page No. 34
Saved Page No. 35
Saved Page No. 36
Saved Page No. 37
Saved Page No. 38
Saved Page No. 39
Saved Page No. 40
Saved Page No. 41
Saved Page No. 42
Saved Page No. 43
Saved Page No. 44
Saved Page No. 45
Saved Page No. 46
Saved Page No. 47
Saved Page No. 48
Saved Page No. 49
Saved Page No. 50
Saved Page No. 51
Saved Page No. 52
Saved Page No. 53
Saved Page No. 54
Saved Page No. 55
Saved Page No. 56
S

In [84]:
# Widen the pandas display so every column of the preview is shown on one row.
pd.options.display.max_columns = 500
pd.options.display.width = 1000
# Preview the first ten scraped records.
hotel_data_df.head(10)

Unnamed: 0,Hotel Name,Hotel Rating,Review Count,Review Category,Price,Stars,Distance,Images
0,SLS Dubai Hotel & Residences,8.5,4249,Very good,180,5.0,1.3,https://content.r9cdn.net/rimg/himg/67/58/c0/l...
1,"Sheraton Mall of the Emirates Hotel, Dubai",8.4,3680,Very good,160,5.0,7.0,https://content.r9cdn.net/rimg/himg/5d/c1/58/l...
2,"La Ville Hotel & Suites City Walk, Dubai, Auto...",8.6,965,Very good,328,5.0,1.3,https://content.r9cdn.net/rimg/himg/db/db/9d/l...
3,Royal Continental Suites,8.9,1366,Very good,159,,1.0,https://content.r9cdn.net/rimg/himg/5a/34/48/l...
4,Voco Bonnington Dubai,8.4,667,Very good,196,5.0,12.2,https://content.r9cdn.net/rimg/kimg/82/bc/a6d8...
5,Address Montgomerie,8.5,314,Very good,301,5.0,11.3,https://content.r9cdn.net/rimg/himg/f5/cf/bf/i...
6,Four Points by Sheraton Downtown Dubai,8.5,1918,Very good,164,4.0,4.0,https://content.r9cdn.net/rimg/himg/ed/b0/f3/l...
7,"Radisson Blu Hotel, Dubai Media City",,2488,Good,116,4.0,10.3,https://content.r9cdn.net/rimg/himg/fd/ea/38/i...
8,Al Jaddaf Rotana Suite Hotel,9.1,3010,Wonderful,192,5.0,3.7,https://content.r9cdn.net/rimg/himg/69/e6/51/l...
9,DoubleTree by Hilton Dubai Al Jadaf,8.1,4813,Very good,131,4.0,3.9,https://content.r9cdn.net/rimg/himg/97/45/42/i...


In [85]:
# Keep only the first occurrence of rows that are identical in every column.
hotel_data_df = hotel_data_df.drop_duplicates()
# (rows, columns) remaining after de-duplication.
hotel_data_df.shape

(718, 8)