In [1]:
from bs4 import BeautifulSoup
import json
import requests
import random

import pandas as pd
import math, time
import re
import numpy as np
import logging
from requests.exceptions import Timeout

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# pd.set_option("display.max_colwidth",None)

# TripAdvisor


In [2]:
# with open("user-agents_1.txt", "r+") as file:
#     user_agents = file.readlines()
#     user_agents = [user_agent.replace("\n","").strip() for user_agent in user_agents]

In [3]:
# JSON dosyasını oku
with open("data/user-agents.json", "r") as f:
    user_agents_data = json.load(f)

# User-Agent listesini oluştur
user_agents = [entry["ua"] for entry in user_agents_data]

In [4]:
# with open("valid-proxy-list.txt","r") as f:
#     prxs = f.read().split("\n")

# proxies =[]

# for prx in prxs:
#     if prx not in proxies:
#         proxies.append(prx)

In [5]:
def fetch_html(url):

    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
    }

    try:
        # proxy = random.choice(proxies)
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  
        return response.content
    except requests.exceptions.HTTPError as err:
        print(f"HTTP hatası: {err}")
        return None


def parse_html(html_content):
    return BeautifulSoup(html_content, 'lxml')


def get_namesandcount(soup):
    hotel_names_with_numbers = [hotel.get_text(strip=True) for hotel in soup.find_all(name="div", class_="nBrpc o W")]

    cleaned_hotel_names = [re.sub(r'[^\w\s]', '', re.sub(r'^\d+\.', '', hotel)) for hotel in hotel_names_with_numbers]

    review_counts = []
    for count_review in soup.find_all("span", class_="biGQs _P pZUbB hmDzD"):
        span_tag = count_review.find("span", class_="S4")
        if span_tag:
            review_counts.append(span_tag.text)

    review_counts_int = [int(re.sub(r'\.', '', re.search(r'\d+(\.\d+)*', review).group())) for review in review_counts]
    

    return pd.DataFrame({"name":cleaned_hotel_names, "count":review_counts_int })


def get_button_hrefs(soup):
    hrefs = []
    buttons = soup.find_all('button', class_='ypcsE _S wSSLS')
    
    for button in buttons:
        a_tag = button.find('a', href=True)
        if a_tag:
            hrefs.append(a_tag['href'])
    
    return hrefs

def get_dynamic_links(hotels):
    hotel_review_links = []
    root = "https://www.tripadvisor.com.tr"
    for hotel in hotels:
        part = hotel
        full= root+part
        # print(f"\nLinkler: {full}")
        
        location = full.find('Reviews')
        
        # print(f"Linkteki Reviewsin Konumu: {location}")
        full = full[:(location+7)] +"-or{}"+ full[(location+7):]
        # print(f"Dinamik Link: {full}")
        
        hotel_review_links.append(full)
    
    return hotel_review_links


def get_review_by_url(url):
    
    try:
        review_page = parse_html(fetch_html(url))
        reviews = review_page.find_all("div", class_="_T FKffI bmUTE")
        
        texts=[]
        for review in reviews:
            span_element = review.find("span", class_="orRIx Ci _a C")
            if span_element is not None:
                texts.append(span_element.text)
        
        return texts
    except :
        return " "

In [29]:
# url = "https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html"
# url = "https://www.tripadvisor.com.tr/Hotels-g298020-Bodrum_District_Mugla_Province_Turkish_Aegean_Coast-Hotels.html"
# url = "https://www.tripadvisor.com.tr/Hotels-g293974-Istanbul-Hotels.html"
# url ="https://www.tripadvisor.com.tr/Hotels-g298020-oa30-Bodrum_District_Mugla_Province_Turkish_Aegean_Coast-Hotels.html"
url = "https://www.tripadvisor.com.tr/Hotels-g293974-oa30-Istanbul-Hotels.html"
response = fetch_html(url=url)

In [30]:
soup = parse_html(response)

In [31]:
hotels = get_button_hrefs(soup)

In [32]:
hotels, len(hotels)

(['/Hotel_Review-g293974-d8335839-Reviews-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d1674691-Reviews-Hotel_Amira_Istanbul-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d2049041-Reviews-The_Story_Hotel_Pera-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d14012180-Reviews-Primero_Hotel-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d2554998-Reviews-Rixos_Pera_Istanbul-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d12992361-Reviews-Hilton_Istanbul_Bakirkoy-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d20946349-Reviews-The_Soul_Hotel-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d1897902-Reviews-Levni_Hotel_Spa-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d8309712-Reviews-Ayramin_Hotel-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d304836-Reviews-Movenpick_Hotel_Istanbul_Bosphorus-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d3484310-Reviews-MySuite_Istanbul-Istanbul.html#REVIEWS',
  '/Hotel_Review-g293974-d294607-Reviews-InterContinent

In [33]:
hotel_review_links = get_dynamic_links(hotels)

In [34]:
hotel_review_links, len(hotel_review_links)

(['https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or{}-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d1674691-Reviews-or{}-Hotel_Amira_Istanbul-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d2049041-Reviews-or{}-The_Story_Hotel_Pera-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d14012180-Reviews-or{}-Primero_Hotel-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d2554998-Reviews-or{}-Rixos_Pera_Istanbul-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d12992361-Reviews-or{}-Hilton_Istanbul_Bakirkoy-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d20946349-Reviews-or{}-The_Soul_Hotel-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d1897902-Reviews-or{}-Levni_Hotel_Spa-Istanbul.html#REVIEWS',
  'https://www.tripadvisor.com.tr/Hote

In [35]:
# def get_turkish_review_counts(hotel_review_links):
#     """Fetch Turkish review counts from a list of hotel review links."""
#     review_counts = []

#     for link in hotel_review_links:
#         html_content = fetch_html(link)
        
#         if html_content:
#             rev_page = parse_html(html_content)
#             turkish_label = rev_page.find('span', string='Türkçe')
#             if turkish_label:
#                 turkish_review_count = turkish_label.find_next('span', class_='mUseN').text
#                 turkish_review_count = turkish_review_count.strip('()').replace('.', '')
#                 review_counts.append(int(turkish_review_count))
#                 print(turkish_review_count)
#             else:
#                 logging.error('Turkish review count not found')
#                 review_counts.append(None)
#         else:
#             logging.error('Failed to fetch HTML content')
#             review_counts.append(None)
    
#     return review_counts

In [38]:
get_review_by_url(hotel_review_links[0].format(5))[0]

'otelin konumu mükemmel bütün tarihi yerlere yakın otel personelleri cok cana yakın odaları ferah cok temiz gidilebilecek en iyi otellerden biri personeli cok güler yüzlü mükemmel bir kahvaltınız var her ceştenvar  herşey için  cok  teşekkürler tekrardan görüşmek üzere hoşçakalın'

In [39]:
hotel_df = get_namesandcount(soup)

In [43]:

hotel_df["review_link"] = hotel_review_links

In [44]:
results = []
for idx, row in hotel_df.iterrows():
    links = []
    for rng in range(0, row["count"], 5):
        if rng ==0: continue
        link = row["review_link"].format(rng)
        links.append(link)
    results.append(links)

hotel_df["dynamic_links"] = results

In [45]:
hotel_df

Unnamed: 0,name,count,review_link,dynamic_links
0,Aprilis Gold Hotel,743,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
1,Hotel Amira Istanbul,4732,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
2,The Story Hotel Pera,208,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
3,Primero Hotel,746,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
4,Rixos Pera Istanbul,2348,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
5,Hilton Istanbul Bakirkoy,1564,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
6,The Soul Hotel,251,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
7,Levni Hotel Spa,3192,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
8,Ayramin Hotel,379,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...
9,Mövenpick Hotel İstanbul Bosphorus,888,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...


In [47]:
hotel_df["count"].sum()/3

12067.333333333334

In [48]:
# hotel_df.to_csv("data/hotels_with_links5.csv", index=False)

# türkçe yorumları seç

In [49]:
import pandas as pd 

hotel_df = pd.read_csv("data/hotels_with_links5.csv")

In [50]:
[link for link in hotel_df["review_link"]]

['https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or{}-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d1674691-Reviews-or{}-Hotel_Amira_Istanbul-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d2049041-Reviews-or{}-The_Story_Hotel_Pera-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d14012180-Reviews-or{}-Primero_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d2554998-Reviews-or{}-Rixos_Pera_Istanbul-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d12992361-Reviews-or{}-Hilton_Istanbul_Bakirkoy-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d20946349-Reviews-or{}-The_Soul_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d1897902-Reviews-or{}-Levni_Hotel_Spa-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-

In [51]:
turk_review_count =[38,33,7,64,206,565,37,35,17,240,26,152,154,470,225,4,199,55,41,281,9,126,30,101,22,21,15,15,133,26]

In [52]:
len(turk_review_count), sum(turk_review_count)

(30, 3347)

In [53]:
hotel_df["turk_review"] = turk_review_count

In [54]:
import ast
hotel_df["dynamic_links"] = hotel_df["dynamic_links"].apply(ast.literal_eval)

In [55]:
[link for link in hotel_df["dynamic_links"][0]]

['https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or5-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or10-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or15-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or20-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or25-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or30-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or35-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g293974-d8335839-Reviews-or40-Aprilis_Gold_Hotel-Istanbul.html#REVIEWS',
 'https://www.tripadvisor.com.tr/Hotel_Review-g29

In [56]:
hotel_df.head()

Unnamed: 0,name,count,review_link,dynamic_links,turk_review
0,Aprilis Gold Hotel,743,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...,38
1,Hotel Amira Istanbul,4732,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...,33
2,The Story Hotel Pera,208,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...,7
3,Primero Hotel,746,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...,64
4,Rixos Pera Istanbul,2348,https://www.tripadvisor.com.tr/Hotel_Review-g2...,[https://www.tripadvisor.com.tr/Hotel_Review-g...,206


In [57]:
def ceil_to_nearest_five(number):
    """Return the ceiling of number to the nearest multiple of 5."""
    remainder = number % 5
    if remainder == 0:
        return int(number)
    else:
        return int(number + (5 - remainder))

# Test cases
print(ceil_to_nearest_five(4.2))  # Output: 5
print(ceil_to_nearest_five(9.999))  # Output: 10
print(ceil_to_nearest_five(-3.7))  # Output: 0 (since -3.7 rounded up to nearest 5 is 0)
print(ceil_to_nearest_five(5.0))  # Output: 5
print(ceil_to_nearest_five(15))  # Output: 15
print(ceil_to_nearest_five(16))  # Output: 20


5
10
0
5
15
20


In [58]:
hotel_df["turk_review"] = hotel_df["turk_review"].apply(ceil_to_nearest_five)

In [59]:
import re

def extract_and_filter_urls(urls, user_number):
    """Extract the number from URLs after '-or' and return those with the number smaller than user_number."""
    filtered_urls = []
    
    for url in urls:
        match = re.search(r'-or(\d+)-', url)
        if match:
            number = int(match.group(1))
            if number <= user_number:
                filtered_urls.append(url)
            if user_number<5:
                filtered_urls.append(url)
    
    return filtered_urls


In [60]:
filtered_links=[]
for idx, row in hotel_df.iterrows():
    filtered_links.append(extract_and_filter_urls(row["dynamic_links"], row["turk_review"]))

In [61]:
hotel_df["filtered_links"] = filtered_links

In [62]:
hotel_df["filtered_links"].agg(len)

  hotel_df["filtered_links"].agg(len)


0       8
1       7
2       2
3      13
4      42
5     113
6       8
7       7
8       4
9      48
10      6
11     31
12     31
13     94
14     45
15      1
16     40
17     11
18      9
19     57
20      2
21     26
22      6
23     21
24      5
25      5
26      3
27      3
28     27
29      6
Name: filtered_links, dtype: int64

In [63]:
# hotel_df.to_csv("data/filtered_hotel_review_links5.csv", index=False)

# Scrape reviews

In [27]:
hotel_df = pd.read_csv("hotels_with_links.csv")

In [28]:
text_list = []
for idx, row in hotel_df.iterrows():
    text= []
    for i,url in enumerate(row["dynamic_links"]):
        print(f"Hotel Name: {row['name']} Count: {(i+1)*5}")
        print(url)
        print("----------------------------------------------")
        txt = get_review_by_url(url)
        time.sleep(5)
        text.extend(txt)
    
    print(f"New hotel\n\n")
    text_list.append(text)

Hotel Name: Concorde De Luxe Resort Count: 5
[
----------------------------------------------
Hotel Name: Concorde De Luxe Resort Count: 10
'
----------------------------------------------
Hotel Name: Concorde De Luxe Resort Count: 15
h
----------------------------------------------
Hotel Name: Concorde De Luxe Resort Count: 20
t
----------------------------------------------
Hotel Name: Concorde De Luxe Resort Count: 25
t
----------------------------------------------
Hotel Name: Concorde De Luxe Resort Count: 30
p
----------------------------------------------


KeyboardInterrupt: 

In [None]:
text

['Concorde Otel, mükemmel hizmet kalitesi ve konforuyla öne çıkıyor. Şık ve modern odaları, lezzetli yemekler sunan restoranları ve güler yüzlü personeli ile misafirlerine unutulmaz bir deneyim yaşatıyor. Merkezi konumu sayesinde şehirdeki önemli noktalara kolayca ulaşım imkanı sunuyor.',
 'Buraya geldiğim için çok mutluyum buradaki her şeyi genel anlamda çok beğendim yemekleri çok lezzetli sahili plajı denizi havuzu herşeyiyle mükemmel çalışanlar güler yüzlü ve ilgili yemekler çok lezzetliydi özellikle sahildeki Boyan partiye bayıldım animasyon dans yolları gece etkinlikleri canlı müziği herşeyiyle kesinlikle gelmeye değer kesinlikle ailemle tekrar geleceğim',
 'Herkes cok kibar ve iyi.. Temizlik çok güzel..Restaurant harika yemekler çok lezzetli.. Burada kötü bir ṣey hiç görmedim..Denizi havuzu aqua parkì harika.. Harika dans shovları var.. Aktiviteler harika.. Voleybol, dart,yoga, step aerobik gibi harika aktiviteler var.. Canlı müzik ve dj performans gerçekten harika.. Çalisanlar ç

# Proxyy

In [64]:
import requests
from bs4 import BeautifulSoup
import random
import logging

logging.basicConfig(level=logging.INFO)

def get_html(link, **kwargs):
    """Fetch HTML content from a URL using a proxy."""
    max_retries = 5
    retries = 0

    while retries < max_retries:
        try:
            proxy = proxy_generator()
            response = requests.get(link, proxies=proxy, timeout=7, **kwargs)
            response.raise_for_status()
            source = BeautifulSoup(response.content, "html.parser")
            return source
        except requests.exceptions.RequestException as e:
            logging.error(f"Request error: {e}")
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
        retries += 1
        logging.info(f"Retrying... ({retries}/{max_retries})")
    
    logging.error(f"Failed to fetch HTML after {max_retries} retries.")
    return None

def proxy_generator():
    """Generate a random proxy from sslproxies.org."""
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html.parser')
    proxies = soup.find_all("tbody")[0].find_all("tr")
    proxies_array = []

    for i in proxies:
        try:
            ip = i.find_all("td")[0].text
            port = i.find_all("td")[1].text
            proxies_array.append({"https": f"http://{ip}:{port}"})
        except Exception as e:
            logging.error(f"Error parsing proxy: {e}")

    if proxies_array:
        return random.choice(proxies_array)
    else:
        logging.error("No valid proxies found.")
        return None


In [65]:
get_html('https://www.tripadvisor.com.tr/Hotel_Review-g20116893-d572784-Reviews-or5-Concorde_De_Luxe_Resort-Kemeragzi_Antalya_Turkish_Mediterranean_Coast.html#REVIEWS')

ERROR:root:Proxy check error: HTTPConnectionPool(host='79.137.194.203', port=4019): Max retries exceeded with url: http://www.google.com/ (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x71be12d1dea0>: Failed to establish a new connection: [Errno 111] Connection refused')))
ERROR:root:Proxy check error: HTTPConnectionPool(host='203.189.88.156', port=80): Read timed out. (read timeout=5)
ERROR:root:Proxy check error: HTTPConnectionPool(host='51.145.176.250', port=8080): Max retries exceeded with url: http://www.google.com/ (Caused by ProxyError('Unable to connect to proxy', ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x71be12d1e080>, 'Connection to 51.145.176.250 timed out. (connect timeout=5)')))
ERROR:root:Proxy check error: HTTPConnectionPool(host='178.128.113.118', port=23128): Read timed out. (read timeout=5)
ERROR:root:Proxy check error: HTTPConnectionPool(host='203.205.9.105', port=8080): R

KeyboardInterrupt: 

In [67]:
import json
import random
import re
import time
import ast
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import logging
from requests.exceptions import Timeout
def fetch_html(url, user_agents):
    """Fetch HTML content from a URL using random user agents."""
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
    }
    max_retries = 5
    retries = 0
    while retries < max_retries:
        try:
            proxy = proxy_generator()
            response = requests.get(url=url, proxies=proxy, headers=headers, timeout=10)
            logging.info(f"Status Code: {response.status_code}")
            response.raise_for_status()
            return response.content
        except:
            logging.error(f"Error fetching HTML from URL: {url}")
        retries += 1
    logging.error(f"Failed to fetch HTML after {max_retries} retries.")
    return None

def get_user_agents(path):
    """Load user agents from a JSON file."""
    with open(path, "r", encoding="utf-8") as f:
        user_agents_data = json.load(f)
    return [entry["ua"] for entry in user_agents_data]

def parse_html(content):
    """Parse HTML content using BeautifulSoup."""
    if content:
        return bs(content, "lxml")
    logging.error(f"Error parsing HTML")
    return None


In [68]:
user_agents = get_user_agents("data/user-agents.json")

In [69]:
url = "https://google.com"


In [None]:
fetch_html(url,user_agents)

ERROR:root:Proxy check error: HTTPConnectionPool(host='79.137.194.203', port=4019): Max retries exceeded with url: http://www.google.com/ (Caused by ProxyError('Unable to connect to proxy', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x71be12e23dc0>: Failed to establish a new connection: [Errno 111] Connection refused')))
ERROR:root:Proxy check error: ('Connection broken: IncompleteRead(3130 bytes read, 6078 more expected)', IncompleteRead(3130 bytes read, 6078 more expected))
ERROR:root:Proxy check error: HTTPConnectionPool(host='203.189.88.156', port=80): Max retries exceeded with url: http://www.google.com/ (Caused by ProxyError('Unable to connect to proxy', ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x71be12e237f0>, 'Connection to 203.189.88.156 timed out. (connect timeout=5)')))
ERROR:root:Proxy check error: HTTPConnectionPool(host='64.112.184.89', port=3128): Read timed out. (read timeout=5)
ERROR:root:Proxy check error: HTTPConnectionPo

In [None]:
def is_proxy_working(proxy):
    """Check if the given proxy is working by sending a request through it."""
    try:
        response = requests.get("http://www.google.com", proxies=proxy, timeout=5)
        return response.status_code == 200
    except Exception as e:
        logging.error(f"Proxy check error: {e}")
        return False

def proxy_generator():
    """Generate a random proxy from sslproxies.org and verify if it's working."""
    response = requests.get("https://sslproxies.org/")
    soup = BeautifulSoup(response.content, 'html.parser')
    proxies = soup.find_all("tbody")[0].find_all("tr")
    proxies_array = []

    for i in proxies:
        try:
            ip = i.find_all("td")[0].text
            port = i.find_all("td")[1].text
            proxy = {"http": f"http://{ip}:{port}", "https": f"http://{ip}:{port}"}
            if is_proxy_working(proxy):
                proxies_array.append(proxy)
        except Exception as e:
            logging.error(f"Error parsing proxy: {e}")

    if proxies_array:
        return random.choice(proxies_array)
    else:
        logging.error("No valid proxies found.")
        return None

# create df

In [7]:
import json
import random
import re
import time
import ast
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import logging
from requests.exceptions import Timeout

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_user_agents(path):
    """Load user agents from a JSON file."""
    with open(path, "r", encoding="utf-8") as f:
        user_agents_data = json.load(f)
    return [entry["ua"] for entry in user_agents_data]
    
def fetch_html(url, user_agents):
    """Fetch HTML content from a URL using random user agents."""
    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
    }
    max_retries = 35
    retries = 0
    while retries < max_retries:
        try:
            proxy = proxy_generator()
            response = requests.get(url=url, proxies=proxy, headers=headers, timeout=20)
            logging.info(f"Status Code: {response.status_code}")
            response.raise_for_status()
            return response.content
        except:
            logging.error(f"Error fetching HTML from URL: {url}")
        retries += 1
    logging.error(f"Failed to fetch HTML after {max_retries} retries.")
    return None
    
def get_button_hrefs(soup):
    """Extract href links from button elements in the parsed HTML."""
    hrefs = []
    buttons = soup.find_all('button', class_='ypcsE _S wSSLS')
    for button in buttons:
        a_tag = button.find('a', href=True)
        if a_tag:
            hrefs.append(a_tag['href'])
    return hrefs
    
def get_dynamic_links(hotels):
    """Generate dynamic review links for hotels."""
    root = "https://www.tripadvisor.com.tr"
    hotel_review_links = [root + hotel for hotel in hotels]
    return [
        link[:link.find('Reviews') + 7] + "-or{}" + link[link.find('Reviews') + 7:]
        for link in hotel_review_links
    ]

def get_names_and_count(soup):
    """Extract hotel names and review counts from the parsed HTML."""
    hotel_names_with_numbers = [
        hotel.get_text(strip=True) for hotel in soup.find_all(name="div", class_="nBrpc o W")
    ]
    cleaned_hotel_names = [
        re.sub(r'[^\w\s]', '', re.sub(r'^\d+\.', '', hotel)) for hotel in hotel_names_with_numbers
    ]

    review_counts = []
    for count_review in soup.find_all("span", class_="biGQs _P pZUbB hmDzD"):
        span_tag = count_review.find("span", class_="S4")
        if span_tag:
            review_counts.append(span_tag.text)

    review_counts_int = [
        int(re.sub(r'\.', '', re.search(r'\d+(\.\d+)*', review).group())) for review in review_counts
    ]

    return pd.DataFrame({"name": cleaned_hotel_names, "count": review_counts_int})

def get_all_dynamic_links(hotel_df):
    """Generate all dynamic review links for a dataframe of hotels."""
    results = []
    for _, row in hotel_df.iterrows():
        links = [
            row["review_link"].format(rng)
            for rng in range(5, row["count"], 5)
        ]
        results.append(links)
    return results


def create_hotel_df(user_agents_path, output_path):
    """Create a dataframe of hotels with their dynamic review links."""
    user_agents = get_user_agents(user_agents_path)
    response = fetch_html(url=HOTELS_URL, user_agents=user_agents)

    if response:
        hotels = get_button_hrefs(parse_html(response))
        hotel_review_links = get_dynamic_links(hotels)
        hotel_df = get_names_and_count(parse_html(response))
        hotel_df["review_link"] = hotel_review_links
        hotel_df["dynamic_links"] = get_all_dynamic_links(hotel_df)
        return hotel_df
        # hotel_df.to_csv(output_path, index=False)
    
    else:
        logging.error("Failed to fetch hotel data.")



In [8]:
user_agents_path = "data/user-agents.json"
output_path = None
HOTELS_URL = "https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html"

In [9]:
create_hotel_df(user_agents_path, output_path)

2024-06-02 12:19:54,444 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,445 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,445 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,446 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,446 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,446 - ERROR - Error fetching HTML from URL: https://www.tripadvisor.com.tr/Hotels-g297962-Antalya_Turkish_Mediterranean_Coast-Hotels.html
2024-06-02 12:19:54,447 - ERROR - Error fetching HTM