In [241]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import random
import time
import csv

from datetime import datetime

In [25]:
def scrape_reviews(page_url, csv_writer):
    driver.get(page_url)

    sleepTime = random.randint(4, 15)
    time.sleep(sleepTime) 

    review_containers = driver.find_elements(By.XPATH, "//div[contains(@class, 'azLzJ') and contains(@class, 'MI') and contains(@class, 'Gi') and @data-test-target='HR_CC_CARD']")
    
    if not review_containers:
        print("No more reviews found or a loading issue occurred.")
        return False

    for review in review_containers:
        try:
            review_title = review.find_element(By.XPATH, ".//div[contains(@class, 'joSMp')]/a/span/span").text
            review_text = review.find_element(By.XPATH, ".//div[contains(@class, 'yJgrn')]/div/div/span/span").text
            review_date = review.find_element(By.XPATH, ".//div[contains(@class, 'ScwkD')]/span").text

            csv_writer.writerow([review_title, review_text, review_date])
        except Exception as e:
            print(f"Error extracting review: {e}")
    
    return True

# DRIVER
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

base_url = "https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews"
offset_pattern = "-or{}-Marina_Bay_Sands-Singapore.html"

# save to csv 
with open('mbs_reviews.csv', mode='w', newline='', encoding='utf-8') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(['Review Title', 'Review Text', 'Review Date'])

    # crawl n scrape
    for offset in range(10, 30, 10):
        page_url = f"{base_url}{offset_pattern.format(offset)}"
        print(f"Scraping page: {page_url}")
        success = scrape_reviews(page_url, csv_writer)
        if not success:
            break

driver.quit()


Scraping page: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or10-Marina_Bay_Sands-Singapore.html
No more reviews found or a loading issue occurred.


### HTML LOGIC FOR SOUP

using sample_page.html (this is the html page of https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or100-Marina_Bay_Sands-Singapore.html )

In [225]:
#sampling finding html keys with sample_page.html retrieved from tripadvisor.com

with open('page61.html', 'r', encoding = 'utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')


parent_div = soup.find('div', class_='uqMDf z BGJxv xOykd jFVeD yikFK')


reviews = parent_div.find_all('div', {'data-test-target': 'HR_CC_CARD'})


for review in reviews:
    review_title = review.find('div', {'data-test-target': 'review-title'}).text.strip()
    review_text = review.find('div', class_='yJgrn').div.div.span.span.text.strip()

    date_of_stay_raw = review.find('span', class_='iSNGb').text.strip()
    date_of_stay = date_of_stay_raw.replace('Date of stay:', '').strip()

    auth_contribution_raw = review.find('span', class_='b Ch')
    
    if auth_contribution_raw:
        auth_contribution = auth_contribution_raw.text.strip()
    else:
        auth_contribution = "0"
    
    rating_raw = review.find('svg', class_='UctUV').find('title').text.strip()
    rating = rating_raw.split(' ')[0] 
    


    print("Review Title:", review_title)
    print("Review Text:", review_text)
    print("Date of Stay:", date_of_stay)
    print("No.of Author's Contributions:", auth_contribution)
    print("Rating: ", rating)
    print("------")

Review Title: Beautiful Hotel
Review Text: Singapore is such a beautiful place to go and the hotel was great. It has very nice view and the facilities was just nice. Its been so long since my wife and I travelled but there was a nice lady name Sha helped us at the reception answering all out enquiries. It was a lovely stay and friendly people around.
Date of Stay: March 2023
No.of Author's Contributions: 1
Rating:  4.0
------
Review Title: Amazing hotel!!
Review Text: Fantastic hotel with a huge variety of food outlets and activities. Great rooms with great views across the bay and bay gardens. Go for a high floor room to get the views. Also recommend KOMA Japanese restaurant.
Date of Stay: March 2023
No.of Author's Contributions: 122
Rating:  5.0
------
Review Title: Huge hotel complex with a great view
Review Text: We stayed here for 4 nights. We had no idea that the restaurants would be so busy and it would be so hard to book once we were here, so book ahead! We ate in both Lavo and

In [16]:
### headers and agents

USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
        'Mozilla/5.0 (Linux; Android 11; SM-G960U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Mobile Safari/537.36'
]

def get_headers():
    """
    Dynamically selects a user agent and constructs the headers.
    """
    user_agent = random.choice(USER_AGENTS)
    headers = {
            'authority': 'httpbin.org',
            'cache-control': 'max-age=0',
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'sec-ch-ua-mobile': '?0',
            'upgrade-insecure-requests': '1',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
        }
    return headers



### Proxy pooling


In [23]:
from itertools import cycle

def get_proxies():
    # Fetching proxies from a website
    url = "https://free-proxy-list.net/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Extracting proxy information from the HTML
    proxies = set()
    rows = soup.select("tbody tr")

    for row in rows:
        td7 = row.select_one("td:nth-child(7)")

        # Checking if the proxy supports HTTPS
        if td7 and td7.text.strip().lower() == "yes":
            td1 = row.select_one("td:nth-child(1)").text.strip()
            td2 = row.select_one("td:nth-child(2)").text.strip()

            # Combining IP and Port to form a proxy entry
            combined_result = f"{td1}:{td2}"
            proxies.add(combined_result)
    return proxies

# Obtain the set of proxies and create a cycle
proxies = cycle(get_proxies())
url = 'https://httpbin.org/ip'

for i in range(1, 6):
    # Selecting the next proxy from the cycle for each request
    proxy = next(proxies)

    try:
        # Making a request using the selected proxy
        response = requests.get(url, proxies={"http": proxy, "https": proxy})

        # Checking for HTTP errors in the response
        response.raise_for_status()

        print(
            f"Request #{i} successful. IP Address: {response.text.strip()}", end="\n")
    except Exception as e:
        # Skip free proxies with connection errors; retry the request with the next proxy in the cycle..
        print(f"Request #{i} failed! Exception Name: {type(e).__name__}")

Request #1 failed! Exception Name: ConnectTimeout
Request #2 failed! Exception Name: ProxyError
Request #3 failed! Exception Name: SSLError
Request #4 failed! Exception Name: ConnectTimeout
Request #5 failed! Exception Name: ProxyError


### selenium with soup logic


In [27]:
chrome_options = Options()


user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
chrome_options.add_argument(f'user-agent={user_agent}')
# chrome_options.add_argument('--headless')

driver = webdriver.Chrome(options = chrome_options)



def human_like_scroll(driver, scroll_pause_time=1.5):
    
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        
        time.sleep(scroll_pause_time)

        
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


driver.get('https://www.tripadvisor.com/Hotel_Review-g294265-d1770798-Reviews-Marina_Bay_Sands-Singapore.html')


time.sleep(random.uniform(2.5, 12.5))
human_like_scroll(driver)


soup = BeautifulSoup(driver.page_source, 'html.parser')
parent_div = soup.find('div', class_='uqMDf z BGJxv xOykd jFVeD yikFK')

csv_file = open('reviews.csv', 'w', newline='', encoding='utf-8')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Review Title', 'Review Text', 'Date of Stay', 'Trip Type', 'Rating'])

if parent_div:
    
    reviews = parent_div.find_all('div', {'data-test-target': 'HR_CC_CARD'})

    for review in reviews:
        review_title = review.find('div', {'data-test-target': 'review-title'}).text.strip()
        review_text = review.find('div', class_='yJgrn').div.div.span.span.text.strip()
        date_of_stay = review.find('span', class_='iSNGb').text.strip()
        trip_type = review.find('span', class_='b Ch').text.strip()
        rating = review.find('svg', class_='UctUV').find('title').text.strip()

        
        csv_writer.writerow([review_title, review_text, date_of_stay, trip_type, rating])

csv_file.close()



In [198]:
import webbrowser
import pyautogui
import os
import pyperclip
import numpy as np
import math
import pandas as pd

### Using pyautogui to scrape
to bypass bot detection and anti scraping -> humanized mouse movements


In [231]:
sqrt3 = np.sqrt(3)
sqrt5 = np.sqrt(5)


def wind_mouse(start_x, start_y, dest_x, dest_y, G_0=9, W_0=5, M_0=15, D_0=12):
    '''
    WindMouse algorithm adapted for pyautogui.
    G_0 - magnitude of the gravitational force
    W_0 - magnitude of the wind force fluctuations
    M_0 - maximum step size (velocity clip threshold)
    D_0 - distance where wind behavior changes from random to damped
    '''
    current_x, current_y = start_x, start_y
    v_x = v_y = W_x = W_y = 0
    while (dist := np.hypot(dest_x - current_x, dest_y - current_y)) >= 1:
        W_mag = min(W_0, dist)
        if dist >= D_0:
            W_x = W_x / sqrt3 + (2 * np.random.random() - 1) * W_mag / sqrt5
            W_y = W_y / sqrt3 + (2 * np.random.random() - 1) * W_mag / sqrt5
        else:
            W_x /= sqrt3
            W_y /= sqrt3
            if M_0 < 3:
                M_0 = np.random.random() * 3 + 3
            else:
                M_0 /= sqrt5
        v_x += W_x + G_0 * (dest_x - current_x) / dist
        v_y += W_y + G_0 * (dest_y - current_y) / dist
        v_mag = np.hypot(v_x, v_y)
        if v_mag > M_0:
            v_clip = M_0 / 2 + np.random.random() * M_0 / 2
            v_x = (v_x / v_mag) * v_clip
            v_y = (v_y / v_mag) * v_clip
        current_x += v_x
        current_y += v_y
        move_x = int(np.round(current_x))
        move_y = int(np.round(current_y))
        pyautogui.moveTo(move_x, move_y) ## added 
    return current_x, current_y


# do not run unless mouse pos is optimised

def save_html(url, index):
    webbrowser.open_new(url)
    time.sleep(random.uniform(3, 10))
    pyautogui.hotkey('ctrl', 'shift', 'i')
    time.sleep(random.uniform(0.5, 2))


    start_x, start_y = pyautogui.position()


    x, y = random.uniform(2820, 3400), random.uniform(205, 215)
    wind_mouse(start_x, start_y, x, y)
    pyautogui.rightClick()
    time.sleep(random.uniform(1, 2))


    x2, y2 = x + random.uniform(80, 120), y + random.uniform(170, 185)
    wind_mouse(x, y, x2, y2)
    time.sleep(random.uniform(0.5, 2))


    x3, y3 = x2 + random.uniform(170, 190), y2 + random.uniform(0, 5)
    wind_mouse(x2, y2, x3, y3)


    time.sleep(random.uniform(1, 3))
    pyautogui.click()


    html_content = pyperclip.paste()

    
    filename = f"page{index}.html"
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(html_content)

    return filename


def process_html(filename, data_list):
    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    
    parent_div = soup.find('div', class_='uqMDf z BGJxv xOykd jFVeD yikFK')
    reviews = parent_div.find_all('div', {'data-test-target': 'HR_CC_CARD'})

    for review in reviews:
        review_title = review.find('div', {'data-test-target': 'review-title'}).text.strip()
        review_text = review.find('div', class_='yJgrn').div.div.span.span.text.strip()
        
        ###

        date_of_stay_raw = review.find('span', class_='iSNGb').text.strip()
        date_of_stay = date_of_stay_raw.replace('Date of stay:', '').strip()
        
        ###

        auth_contribution_raw = review.find('span', class_='b Ch')
        auth_contribution = "0"

        if auth_contribution_raw:
            auth_contribution = auth_contribution_raw.text.strip()
        else:
            auth_contribution = "0"

        ###
            
        rating_raw = review.find('svg', class_='UctUV').find('title').text.strip()
        rating = rating_raw.split(' ')[0]

        
        data_list.append((review_title, review_text, date_of_stay, auth_contribution, rating))

    
    os.remove(filename)


def close_tab():
    time.sleep(random.uniform(2, 10.5)) 
    pyautogui.hotkey('ctrl', 'w')  
    time.sleep(random.uniform(1.5, 2.5))  

def save_and_process_pages(base_url, start, stop, step):
    data_list = []

    steps = list(range(start, stop + 1, step)) #scrambling seq
    random.shuffle(steps)

    for i in steps:
        url = base_url.format(i)
        print(f"Processing: {url}")
        filename = save_html(url, i)
        process_html(filename, data_list)
        close_tab()  
        time.sleep(random.uniform(3, 10)) 
    
    columns = ['Review Title', 'Review Text', 'Date of Stay', 'Author Contribution', 'Rating']
    df = pd.DataFrame(data_list, columns=columns)
    df.to_csv('tripadvisor_reviews_61_80.csv', index=False)


base_url = "https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or{}-Marina_Bay_Sands-Singapore.html"

# main
save_and_process_pages(base_url, 610, 800, 10)


Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or800-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or610-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or630-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or670-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or730-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or660-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or690-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1770798-Reviews-or760-Marina_Bay_Sands-Singapore.html
Processing: https://www.tripadvisor.com.sg/Hotel

In [187]:
### optimising pos 

x, y = random.uniform(2820, 3500), random.uniform(205, 215) # Example coordinates y = 205 to 215 x = 2820 to 3300
pyautogui.moveTo(x, y, duration=1) 
pyautogui.rightClick()


x2, y2 = x + random.uniform(80, 120), y + random.uniform(170, 190) # 190
pyautogui.moveTo(x2, y2, duration=1)

x3, y3 = x2 + random.uniform(170, 190), y2 + random.uniform(0, 10) # 190
pyautogui.moveTo(x3, y3, duration=1)

time.sleep(random.uniform(1,5))
pyautogui.click()

In [273]:
df = pd.read_csv('data_csv_files.csv')

df

Unnamed: 0,Review Title,Review Text,Date of Stay,Author Contribution,Rating
0,THANK YOU,Our greatest gratitude to ALL staff members at...,December 2023,1,5.0
1,Great experience .... indeed!!,Marina Bay SandsSingaporeThankful to Yve931361...,December 2023,2,5.0
2,Excellent!,We spent two nights here on vacation and I mus...,December 2023,1,5.0
3,Great Time,"It was my first time and I loved it, the swimm...",December 2023,16,5.0
4,Best and beautiful hotel,We stayed in rooms 506 and 523 in Tower 3. Roo...,December 2023,4,5.0
...,...,...,...,...,...
11227,An awesome last night in Singapore...however s...,We decided to stay at Marina Bay Sands for our...,2015/1,153,4.0
11228,Very Nice,I paid nearly $800 AUS for a one night stay so...,2015/1,50,4.0
11229,Swimming at the top of the world,This hotel is all that I have read about it. O...,2014/12,40,5.0
11230,Great hotel,I must say the view from our room is amazing. ...,2014/12,5,5.0


In [275]:
def normalize_date(date_entry, prev_date):
    if pd.isna(date_entry) or date_entry == '':
        return prev_date  # Use the previous date if the current one is missing
    try:
        
        return datetime.strptime(date_entry, '%b-%y').strftime('%Y/%m')
    except ValueError:
        try:
            
            return datetime.strptime(date_entry, '%B %Y').strftime('%Y/%m')
        except ValueError:
            
            return date_entry

prev_date = None
for index, row in df.iterrows():
    normalized_date = normalize_date(row['Date of Stay'], prev_date)
    df.at[index, 'Date of Stay'] = normalized_date if normalized_date is not None else prev_date
    prev_date = df.at[index, 'Date of Stay']


df['Date of Stay'] = pd.to_datetime(df['Date of Stay'], errors='coerce', format='%Y/%m')

df.sort_values(by='Date of Stay', ascending=False, inplace=True)
df.to_csv('trip_advisor.csv', index=False, encoding='utf-8-sig')
