# In [None]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import re
import logging
import requests
from contextlib import contextmanager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from functools import wraps
import pickle

def setup_logger():
    """Return this module's logger, configuring it on first use.

    On the first call the logger is set to INFO and gets two handlers that
    share the format ``asctime - levelname - message``: a file handler
    writing to ``scraper.log`` (UTF-8) and a default ``StreamHandler`` for
    console output. Propagation to ancestor loggers is switched off so
    records are not emitted a second time by handlers on the root logger.

    Later calls return the already configured logger unchanged, so handlers
    are never attached twice.

    Returns:
        logging.Logger: The logger named after ``__name__``.
    """
    logger = logging.getLogger(__name__)

    # Inspect only this logger's own handlers. ``hasHandlers()`` also reports
    # handlers attached to ancestor loggers, so a root logger that is already
    # configured would make this function return an unconfigured logger.
    if logger.handlers:
        return logger

    logger.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handlers = (
        # Explicit encoding: scraped titles may contain characters that the
        # platform's default encoding cannot represent.
        logging.FileHandler('scraper.log', encoding='utf-8'),
        logging.StreamHandler(),
    )
    for handler in handlers:
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    # Keep records from bubbling up to parent loggers.
    logger.propagate = False

    return logger

# Remove every handler currently attached to the root logger.
logging.getLogger().handlers.clear()
# Create the module-level logger used by the functions in this file.
logger = setup_logger()

# Rate limiting decorator
def rate_limited(max_per_second):
    """Build a decorator that enforces a minimum interval between calls.

    The wrapped function sleeps as needed so that each call starts at least
    ``1 / max_per_second`` seconds after the previous call finished. The
    timestamp is kept per decorated function and is updated even when the
    call raises, so failing calls count towards the limit as well.

    Args:
        max_per_second: Maximum call rate. Fractions are allowed, e.g. ``0.1``
            means at most one call every 10 seconds.

    Returns:
        A decorator that wraps a function with the rate limit.

    Raises:
        ZeroDivisionError: If ``max_per_second`` is 0.
    """
    min_interval = 1.0 / float(max_per_second)

    def decorate(func):
        # Monotonic timestamp of the previous call's completion; None until
        # the first call, which therefore never waits.
        last_finished = None

        @wraps(func)
        def rate_limited_function(*args, **kwargs):
            nonlocal last_finished
            if last_finished is not None:
                # time.monotonic() cannot go backwards, unlike the wall clock.
                left_to_wait = min_interval - (time.monotonic() - last_finished)
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
            try:
                return func(*args, **kwargs)
            finally:
                last_finished = time.monotonic()

        return rate_limited_function

    return decorate

# Cache functions
def load_cache(path='scrape_cache.pkl'):
    """Load the scrape cache from a pickle file.

    Args:
        path (str): Pickle file to read. Defaults to ``scrape_cache.pkl`` in
            the current working directory.

    Returns:
        dict: The unpickled cache. An empty dict is returned when the file
        does not exist, cannot be unpickled (for example because it is empty
        or truncated), or does not contain a dict.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load cache
    # files written by this program; never point this at untrusted data.
    try:
        with open(path, 'rb') as f:
            cache = pickle.load(f)
    except FileNotFoundError:
        return {}
    except (EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError) as e:
        # A damaged cache should cost a re-scrape, not crash the program.
        logging.getLogger(__name__).warning(
            f"Ignoring unreadable cache file {path}: {e}")
        return {}

    if not isinstance(cache, dict):
        logging.getLogger(__name__).warning(
            f"Ignoring cache file {path}: expected a dict, got {type(cache).__name__}")
        return {}
    return cache

def save_cache(cache, path='scrape_cache.pkl'):
    """Persist the scrape cache to a pickle file without clobbering it on failure.

    The cache is first pickled to ``<path>.tmp`` and then moved over ``path``
    with ``os.replace``. If pickling or writing fails, or the process is
    interrupted mid-write, the previous cache file is left untouched.

    Args:
        cache: Object to pickle (the URL -> result dict).
        path (str): Destination file. Defaults to ``scrape_cache.pkl`` in the
            current working directory.

    Raises:
        Whatever ``open``, ``pickle.dump`` or ``os.replace`` raise; the
        temporary file is removed before the exception propagates.
    """
    import os  # local import: ``os`` is not part of this file's import block

    tmp_path = f'{path}.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(cache, f)
        # Swap the finished file into place in a single rename step.
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this
        # only cleans up a partially written file after a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# In-memory cache of scraped results, loaded once when this code runs.
# scrape_page looks URLs up here, stores new results in it and writes it
# back to disk through save_cache.
scrape_cache = load_cache()

@contextmanager
def managed_chrome_driver(webdriver_path):
    """Provide a Chrome WebDriver that is always shut down on exit.

    Args:
        webdriver_path (str): Path to the ChromeDriver executable.

    Yields:
        webdriver.Chrome: A driver started with the ``--disable-gpu`` option.
    """
    chrome_options = Options()
    chrome_options.add_argument('--disable-gpu')
    chrome_service = Service(webdriver_path)
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)
    try:
        yield browser
    finally:
        # Runs both on normal exit and when the ``with`` body raises.
        browser.quit()

def _parse_post(html, url):
    """Extract ``(date, title, content)`` from a post page's HTML and its URL."""
    soup = BeautifulSoup(html, 'html.parser')

    soup_title = soup.find('h1', class_="title")
    title = soup_title.text.strip() if soup_title else 'No Title'

    soup_content = soup.find_all('span', class_="richtext-text css-1iqe90x")
    content = ' '.join(c.text.strip() for c in soup_content)

    # The date is taken from the URL path, not from the page body.
    date_match = re.search(r'/(\d{4}-\d{2}-\d{2})-', url)
    date = date_match.group(1) if date_match else 'No Date'

    return date, title, content


@rate_limited(0.1)  # at most one uncached fetch per 10 seconds
def _fetch_post(url, max_retries=5, backoff_factor=2, timeout=30):
    """Download and parse one post with retries, caching the result on success."""
    for attempt in range(max_retries):
        try:
            # Randomised wait that grows by ``backoff_factor`` per attempt;
            # it also precedes the first attempt.
            time.sleep(random.uniform(10, 20) * (backoff_factor ** attempt))
            with requests.Session() as session:
                # Without a timeout a stalled connection would block forever.
                page = session.get(url, timeout=timeout)
                page.raise_for_status()

            result = _parse_post(page.text, url)
            date, title, _ = result
            logger.info(f"Scraped URL: {url} | Date: {date} | Title: {title[:50]}...")

            scrape_cache[url] = result
            save_cache(scrape_cache)
            return result
        except requests.RequestException as e:
            # ``e.response`` is None for connection errors and timeouts, and
            # a Response with an error status is falsy, so read the status
            # with getattr instead of a truth test or direct attribute access.
            status = getattr(e.response, 'status_code', None)
            if status == 429:
                logger.warning(
                    f"Rate limit exceeded for {url} "
                    f"(attempt {attempt + 1}/{max_retries}); backing off before retrying...")
                continue
            logger.error(f"Network error while scraping {url}: {e}")
        except Exception as e:
            logger.error(f"Unexpected error while scraping {url}: {e}")
    logger.error(f"Failed to scrape {url} after {max_retries} attempts")
    return None, None, None


def scrape_page(url):
    """Return ``(date, title, content)`` for a post URL, using the cache.

    Cached URLs are answered immediately from ``scrape_cache``. Other URLs
    are downloaded with ``requests`` (rate limited to one fetch per 10
    seconds, up to 5 attempts with a growing randomised delay), parsed with
    BeautifulSoup, stored in ``scrape_cache`` and persisted via ``save_cache``.

    Args:
        url (str): Post URL. The date is read from a ``/YYYY-MM-DD-`` segment
            of the URL.

    Returns:
        tuple: ``(date, title, content)`` as strings. ``date`` is ``'No Date'``
        when the URL has no date segment and ``title`` is ``'No Title'`` when
        no ``<h1 class="title">`` is found. ``(None, None, None)`` is returned
        when every attempt fails.
    """
    if url in scrape_cache:
        logger.info(f"Using cached data for {url}")
        return scrape_cache[url]

    # Only real network fetches go through the rate limiter; cache hits
    # return without waiting.
    return _fetch_post(url)

def get_all_post_urls(base_url, driver, target_date='2020-01-01', max_refreshes=3):
    """Scroll a feed page and collect the post URLs it links to.

    The page is scrolled in small random steps with randomised pauses and an
    occasional random key press or button click. After every step
    ``find_new_links`` adds unseen post URLs to the result set.

    Scrolling stops when one of the following happens:

    * ``find_new_links`` reports a post dated on or before ``target_date``;
    * the no-new-content counter reaches 10. The counter is reset when new
      URLs are added, when a forced scroll to the bottom makes the page
      grow, or when the page is refreshed (at most ``max_refreshes`` times);
    * an exception is raised while scrolling. It is logged and the URLs
      collected so far are returned.

    Args:
        base_url (str): URL of the feed page to open.
        driver: Selenium WebDriver used to load and scroll the page.
        target_date (str): ``YYYY-MM-DD`` cut-off handed to ``find_new_links``.
        max_refreshes (int): Upper bound on page refreshes, so the loop
            always terminates once the page yields nothing new.

    Returns:
        list: The collected post URLs, in no particular order.
    """
    post_urls = set()
    found_target_date = False
    no_new_content_count = 0
    max_no_new_content = 10
    refreshes_done = 0

    try:
        driver.get(base_url)
        time.sleep(random.uniform(10, 15))  # give the initial content time to load

        # Read the height only after the page has been loaded, so the first
        # comparison is made against this page and not whatever was open before.
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_pause_time = random.uniform(15, 30)

        while not found_target_date and no_new_content_count < max_no_new_content:
            scroll_height = random.randint(100, 300)
            driver.execute_script(f"window.scrollBy(0, {scroll_height});")
            time.sleep(random.uniform(3, 7))

            if random.random() < 0.05:  # 5% chance of an extra interaction
                perform_random_action(driver)

            time.sleep(scroll_pause_time)

            if random.random() < 0.2:  # 20% chance to pause for "reading"
                reading_time = random.uniform(10, 20)
                logger.info(f"Pausing to 'read' content for {reading_time:.2f} seconds")
                time.sleep(reading_time)

            urls_before = len(post_urls)
            link_signal = find_new_links(driver, post_urls, target_date)

            if len(post_urls) > urls_before:
                no_new_content_count = 0
            elif link_signal:
                # find_new_links returns True both when it added URLs and
                # when it hit a post at or before target_date. True with
                # nothing added can only mean the target date was reached.
                found_target_date = True
                logger.info(f"Reached target date {target_date}; stopping scroll")
                break
            else:
                no_new_content_count += 1

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # The page did not grow: jump to the bottom a few times to
                # trigger loading of further content.
                for _ in range(3):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(scroll_pause_time)
                    new_height = driver.execute_script("return document.body.scrollHeight")
                    if new_height > last_height:
                        no_new_content_count = 0
                        break
                else:
                    if (no_new_content_count >= max_no_new_content
                            and refreshes_done < max_refreshes):
                        logger.info("Reached end of page or no new content. Refreshing...")
                        driver.refresh()
                        time.sleep(random.uniform(10, 15))
                        refreshes_done += 1
                        no_new_content_count = 0
                        # The refreshed page has its own height.
                        new_height = driver.execute_script("return document.body.scrollHeight")

            last_height = new_height
            scroll_pause_time = random.uniform(15, 30)

            logger.info(f"URLs found: {len(post_urls)}, Next wait time: {scroll_pause_time:.2f}s")

    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        logger.error(f"Error during scrolling: {e}")

    logger.info(f"Total post URLs collected from {base_url}: {len(post_urls)}")
    return list(post_urls)

def perform_random_action(driver):
    """Perform one randomly chosen interaction on the current page.

    One of five equally likely actions is picked: send PAGE_UP, PAGE_DOWN,
    ARROW_UP or ARROW_DOWN to the ``<body>`` element, or wait (with a
    ``WebDriverWait`` of 10) for a ``<button>`` to become clickable and click
    it. Any exception is logged as a warning and not re-raised.

    Args:
        driver: Selenium WebDriver showing the page to interact with.
    """
    try:
        # None is the placeholder for the "click a button" action.
        candidates = [Keys.PAGE_UP, Keys.PAGE_DOWN, Keys.ARROW_UP, Keys.ARROW_DOWN, None]
        picked = random.choice(candidates)
        if picked is None:
            clickable = EC.element_to_be_clickable((By.TAG_NAME, 'button'))
            WebDriverWait(driver, 10).until(clickable).click()
        else:
            page_body = driver.find_element(By.TAG_NAME, 'body')
            page_body.send_keys(picked)
        logger.info("Performed a random action")
    except Exception as e:
        logger.warning(f"Failed to perform random action: {e}")

def find_new_links(driver, existing_urls, target_date):
    """Add unseen, dated post links from the current page to ``existing_urls``.

    Links whose ``href`` contains ``/en/square/post/`` are inspected in the
    order the driver returns them. A link is added only if it is not yet in
    ``existing_urls`` and its URL contains a ``/YYYY-MM-DD-`` segment. The
    scan stops at the first unseen link whose date string compares less than
    or equal to ``target_date``.

    Args:
        driver: Selenium WebDriver showing the page.
        existing_urls (set): Collected URLs; updated in place.
        target_date (str): ``YYYY-MM-DD`` cut-off, compared as a string.

    Returns:
        bool: True if at least one URL was added, or if a link dated at or
        before ``target_date`` was encountered; False otherwise. A timeout or
        stale element is logged as a warning and the result so far is returned.
    """
    added_any = False
    try:
        locator = (By.XPATH, "//a[contains(@href, '/en/square/post/')]")
        anchors = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(locator)
        )
        for anchor in anchors:
            href = anchor.get_attribute('href')
            if href in existing_urls:
                continue
            dated = re.search(r'/(\d{4}-\d{2}-\d{2})-', href)
            if not dated:
                continue
            if dated.group(1) <= target_date:
                return True  # target date reached
            existing_urls.add(href)
            added_any = True
    except (TimeoutException, StaleElementReferenceException) as e:
        logger.warning(f"Error finding links: {e}")
    return added_any

def process_url(url, webdriver_path):
    """Collect, scrape and save all posts linked from one feed page.

    Post URLs are gathered with a Chrome WebDriver via ``get_all_post_urls``
    (cut-off date ``2020-01-01``). Each post is then fetched with
    ``scrape_page``; posts with a non-empty date, title and content are kept.
    Every 50 processed URLs the rows gathered so far are written to
    ``temp_<i>_posts.csv``. At the end the rows are sorted by date, newest
    first, and written to ``bitcoin_news.csv``, ``ethereum_news.csv`` or
    ``output.csv`` depending on whether ``url`` contains ``bitcoin`` or
    ``ethereum``.

    Errors during URL collection or scraping are logged and not re-raised.
    A failure to start the WebDriver propagates to the caller.

    Args:
        url (str): Feed page URL.
        webdriver_path (str): Path to the ChromeDriver executable.
    """
    # The browser is needed only to discover links. It is closed before the
    # long requests-based scraping phase instead of staying open throughout.
    with managed_chrome_driver(webdriver_path) as driver:
        try:
            post_urls = get_all_post_urls(url, driver, target_date='2020-01-01')
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            return

    try:
        data = []
        for i, post_url in enumerate(post_urls, 1):
            date, title, content = scrape_page(post_url)
            if date and title and content:
                data.append({'Date': date, 'Title': title, 'Content': content})

            if i % 50 == 0:  # periodic snapshot of everything gathered so far
                partial_df = pd.DataFrame(data)
                partial_filename = f'temp_{i}_posts.csv'
                partial_df.to_csv(partial_filename, index=False, encoding='utf-8-sig')
                logger.info(f"Temporary data saved to {partial_filename}")

            time.sleep(random.uniform(5, 10))  # delay between posts

        if not data:
            # An empty frame has no 'Date' column; report this plainly
            # instead of failing with a KeyError below.
            logger.warning(f"No posts scraped from {url}; nothing to save")
            return

        df = pd.DataFrame(data)
        # scrape_page may return the placeholder 'No Date'. Coerce such
        # values to NaT rather than letting one bad row discard the result.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values(by='Date', ascending=False).reset_index(drop=True)

        if 'bitcoin' in url:
            filename = 'bitcoin_news.csv'
        elif 'ethereum' in url:
            filename = 'ethereum_news.csv'
        else:
            filename = 'output.csv'
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        logger.info(f"Data saved to {filename}")

    except Exception as e:
        logger.error(f"Error processing {url}: {e}")

# Main execution
# ChromeDriver executable handed to process_url (a bare file name, so it is
# resolved relative to the working directory or the system path).
webdriver_path = 'chromedriver.exe'
# Feed pages to scrape. process_url derives the output CSV name from the
# words 'bitcoin' / 'ethereum' in these URLs.
urls = [
    'https://www.binance.com/en/square/news/bitcoin%20news',
    'https://www.binance.com/en/square/news/ethereum%20news'
]

# Process the feeds one after another; this runs as soon as the code is executed.
for url in urls:
    process_url(url, webdriver_path)