In [1]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import requests
import time
import re

In [2]:
import traceback

In [3]:
# Scheme and host of the site; prepended to the site-relative hrefs scraped from its pages.
base_url = 'https://me.cleartrip.com'

In [43]:
# Single Chrome session shared by the whole notebook, driven by the chromedriver
# binary at ./chromedriver (relative to the working directory).
browser = webdriver.Chrome('./chromedriver')
# Explicit-wait helper bound to that session; each wait gives up after 20 seconds.
wait_obj = WebDriverWait(browser, 20)

In [7]:
# First listing page for Miami. The URL is built from base_url so the host is
# defined in one place, the timeout keeps a stalled connection from hanging the
# kernel, and an HTTP error is raised here instead of being parsed as a listing.
res = requests.get(base_url + '/hotels/united-states/miami/', timeout=30)
res.raise_for_status()

In [8]:
# NOTE(review): not referenced by any other cell shown here; 38 matches the highest
# ?page= value found in the pagination links below.
num_pages = 38

In [9]:
# Parse the HTML of the first listing page using the lxml parser.
soup = BeautifulSoup(res.text, 'lxml')

In [10]:
# Every anchor inside the element(s) carrying the 'pagination' class.
page_num_buttons = soup.select('.pagination a')

In [11]:
page_num_buttons[-1].get('href')

'/hotels/united-states/miami?page=2'

In [12]:
# Drop the final pagination anchor: it links to page 2 again (presumably the "next"
# button), so afterwards the list ends at the last numbered page.
# NOTE: this cell is not idempotent - every re-run trims one more entry.
page_num_buttons = page_num_buttons[:-1]

In [13]:
page_num_buttons[-1].get('href')

'/hotels/united-states/miami?page=38'

In [14]:
base_url + page_num_buttons[0].get('href')

'https://me.cleartrip.com/hotels/united-states/miami?page=2'

In [15]:
# Absolute URL for each remaining pagination link.
page_num_hrefs = [base_url + anchor.get('href') for anchor in page_num_buttons]

In [16]:
# page_num_hrefs

In [17]:
# Hotels on the first page: the anchor tags found inside '.hotels-name' elements.

hotels_a_tag_list = soup.select('.hotels-name a')

In [18]:
hotels_a_tag_list[:3]

[<a href="/hotels/details/192017?c=200419|210419&amp;r=2,0" rel="nofollow" title="Hilton Bentley South Beach, Miami, United States"><span>Hilton Bentley South Beach</span></a>,
 <a href="/hotels/details/292714?c=200419|210419&amp;r=2,0" rel="nofollow" title="Kimpton EPIC Hotel, Miami, United States"><span>Kimpton EPIC Hotel</span></a>,
 <a href="/hotels/details/345087?c=200419|210419&amp;r=2,0" rel="nofollow" title="JW Marriott Marquis Miami, Miami, United States"><span>JW Marriott Marquis Miami</span></a>]

In [19]:
len(hotels_a_tag_list)

20

In [20]:
# Absolute detail-page URL for every hotel anchor found on the first page.
hotels_urls = [base_url + anchor.get('href') for anchor in hotels_a_tag_list]

In [21]:
len(hotels_urls)

20

In [22]:
hotels_urls[:3]

['https://me.cleartrip.com/hotels/details/192017?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/292714?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/345087?c=200419|210419&r=2,0']

In [23]:
# Visit every remaining listing page and append its hotel detail URLs.
# Page-local names are used so the first page's `res`, `soup` and
# `hotels_a_tag_list` are not overwritten by this loop.
# NOTE: the cell appends to hotels_urls, so re-running it adds the same URLs again.
for page_num_href in page_num_hrefs:
    # The timeout keeps a single stalled connection from blocking the crawl forever.
    page_res = requests.get(page_num_href, timeout=30)
    page_res.raise_for_status()
    page_soup = BeautifulSoup(page_res.text, 'lxml')
    hotels_urls.extend(
        [base_url + hotel_a_tag.get('href') for hotel_a_tag in page_soup.select('.hotels-name a')]
    )

In [24]:
len(hotels_urls)

755

In [25]:
hotels_urls[:6]

['https://me.cleartrip.com/hotels/details/192017?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/292714?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/345087?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/367313?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/367317?c=200419|210419&r=2,0',
 'https://me.cleartrip.com/hotels/details/367353?c=200419|210419&r=2,0']

In [46]:
# Keys of the structured description, in the order its paragraphs are read.
_DESCRIPTION_SECTIONS = (
    'property_location',
    'rooms',
    'amenities',
    'dining',
    'business_other_amenities',
)


def _wait_for_css(css_selector):
    """Wait (through the shared ``wait_obj``) for ``css_selector`` and return the matched element."""
    locator = (By.CSS_SELECTOR, css_selector)
    return wait_obj.until(EC.presence_of_element_located(locator))


def _scrape_rating():
    """Return the rating parsed from the review tooltip as a float, or 'NA' on any failure."""
    try:
        rating_elem = _wait_for_css('li.review span[rel="tTooltip"]')
        # Hover over the element before reading its tooltip attribute.
        ActionChains(browser).move_to_element(rating_elem).perform()
        tooltip = rating_elem.get_attribute('original-title')
        return float(re.findall(r'TripAdvisor traveller rating, (.+)/5', tooltip)[0])
    except Exception:
        # Best effort: a missing or unparsable rating must not abort the scrape.
        return 'NA'


def _scrape_num_reviews():
    """Return the review count shown in the review link as a float, or 'NA' on any failure."""
    try:
        link_text = _wait_for_css('.reviewLink').text
        # Accept thousands separators ("1,103 reviews") and the singular "1 review";
        # a digits-only pattern would silently read "1,103" as 103.
        count = re.findall(r'([0-9][0-9,]*) reviews?', link_text)[0]
        return float(count.replace(',', ''))
    except Exception:
        return 'NA'


def _scrape_price():
    """Return the text of the minimum-price element, or 'NA' on any failure."""
    try:
        return _wait_for_css('#b-min-price').text
    except Exception:
        return 'NA'


def _scrape_description():
    """Return the hotel description as a dict of sections, as plain text, or 'NA'.

    A dict keyed by ``_DESCRIPTION_SECTIONS`` is returned when the description
    holds at least five paragraphs (after dropping the first and last) that
    each contain a bold heading; otherwise the whole description text is
    returned. 'NA' is returned when no description area exists on the page.
    """
    try:
        more_link = _wait_for_css('a.shwMoreDesc')
        ActionChains(browser).move_to_element(more_link).click(more_link).perform()
    except Exception:
        # Best effort: without the "show more" link the description is read as
        # it stands instead of aborting the scrape of this hotel.
        pass

    areas = browser.find_elements(By.CSS_SELECTOR, '.amenitiesCategory div')
    if not areas:
        return 'NA'
    description_area = areas[0]

    try:
        paragraphs = description_area.find_elements(By.CSS_SELECTOR, 'p')[1:-1]
        if len(paragraphs) < len(_DESCRIPTION_SECTIONS):
            # Page without the headed sections: fall back to the raw text.
            return description_area.text.strip()
        sections = {}
        for key, paragraph in zip(_DESCRIPTION_SECTIONS, paragraphs):
            # find_element raises when the paragraph has no bold heading,
            # which routes the page to the plain-text fallback below.
            heading = paragraph.find_element(By.CSS_SELECTOR, 'b').text
            sections[key] = paragraph.text.replace(heading, '').strip()
        return sections
    except Exception:
        return description_area.text.strip()


def get_dynamic_info(hotel_url):
    """Scrape the JavaScript-rendered details of one hotel page.

    Loads ``hotel_url`` in the module-level ``browser`` session, scrolls to the
    bottom of the page, and reads each field using the module-level
    ``wait_obj`` for explicit waits.

    Args:
        hotel_url: absolute URL of the hotel detail page.

    Returns:
        dict with the keys:
            'rating'      - float, or 'NA' when it cannot be read
            'num_reviews' - float, or 'NA' when it cannot be read
            'price'       - str as displayed on the page, or 'NA'
            'hotel_info'  - dict of description sections, the plain
                            description text, or 'NA' when the page has no
                            description area

    Raises:
        Whatever ``browser.get`` / ``maximize_window`` / ``execute_script``
        raise; failures while reading the individual fields are absorbed and
        reported as 'NA'.
    """
    browser.get(hotel_url)
    browser.maximize_window()
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')

    return {
        'rating': _scrape_rating(),
        'num_reviews': _scrape_num_reviews(),
        'price': _scrape_price(),
        'hotel_info': _scrape_description(),
    }

In [47]:
get_dynamic_info(hotels_urls[0])

{'rating': 'NA',
 'num_reviews': 1103.0,
 'price': 'NA',
 'hotel_info': 'Deluxe Class Modern Hotel. Beachfront All Suite property on Ocean Drive, in the world-class environment of South Beach in the exclusive neighborhood of "Sofi" (South of Fifth Street). Minutes from Coral Gables, Coconut Grove and Miami\'s international business district on Brickell Avenue. Miami International Airport is 15 minutes away.\n\nOversized junior suites (350 sq. ft.) furnished with 1 King Bed, custom designed with European furnishings, with floor-to-ceiling windows revealing views of the city or ocean, and are equipped with kitchenette, luxurious oversized marble bathroom and state of the art amenities.\n\nUpscale Italian restaurant on site and numerous dining options within the surrounding area offering a variety of tastes.\n\nEuropean-style boutique hotel in two 10-story towers.\n\nIntimate and elegantly decorated lobby.\n\nOn the southern tip of Ocean Drive, this hotel takes center stage in South Beach

In [28]:
hotels_urls[2]

'https://me.cleartrip.com/hotels/details/345087?c=200419|210419&r=2,0'

In [None]:
# One dict per hotel, combining the static HTML fields with get_dynamic_info().
all_hotels = []

# The context manager closes the log even when a hotel page raises part-way
# through the crawl.
with open('log', 'w') as logfile:
    for hotel_url in hotels_urls:
        # Written and flushed before any scraping, so the last line of the log
        # names the hotel that was being processed if the crawl stops.
        logfile.write(hotel_url + '\n')
        logfile.flush()

        res = requests.get(hotel_url, timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')

        dynamic_info = get_dynamic_info(hotel_url)

        # NOTE(review): 'h1 small' is nested inside 'h1', so hotel_name may also
        # contain the address text - confirm against the page markup.
        hotel_info = {
            'hotel_name': soup.select('h1')[0].getText().strip(),
            'hotel_address': soup.select('h1 small')[0].getText().strip(),
        }

        # Quick facts: each <li> contributes its <small> text as the key and its
        # <span> text as the value. Items missing either tag are skipped.
        quick_facts_lists = soup.select('.clearFix.hotelStats')
        if quick_facts_lists:
            for quick_fact in quick_facts_lists[0].find_all('li'):
                label_tag = quick_fact.find('small')
                value_tag = quick_fact.find('span')
                if label_tag is None or value_tag is None:
                    continue
                hotel_info[label_tag.text.strip()] = value_tag.text.strip()

        hotel_info.update(dynamic_info)

        # The first two amenity categories are skipped, as before; the first one
        # holds the description that get_dynamic_info() already reads.
        for amenities_cat in soup.select('div.amenitiesCategory')[2:]:
            heading_tag = amenities_cat.find('strong')
            items_tag = amenities_cat.find('ul')
            if heading_tag is None or items_tag is None:
                continue
            hotel_info[heading_tag.text] = items_tag.text.strip().split('\n')

        # append() stores the record itself; `+=` with a dict would add only its keys.
        all_hotels.append(hotel_info)

In [None]:
# Show the record count and a short sample instead of dumping every record
# into the notebook output.
print(len(all_hotels), 'records scraped')
all_hotels[:3]