In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, parse_qs, unquote
import logging
import time

# Configure logging: emit INFO and above on the root logger, with each record
# formatted as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def _ensure_http_scheme(candidate):
    """Return *candidate* with an explicit scheme, defaulting to ``https``."""
    # Protocol-relative URL ("//host/path"): only the scheme itself is missing.
    if candidate.startswith("//"):
        return "https:" + candidate
    # Compare against the full "scheme://" prefix, case-insensitively, so that
    # hosts that merely begin with "http" (e.g. "httpbin.org") are not mistaken
    # for URLs that already carry a scheme.
    if candidate.lower().startswith(("http://", "https://")):
        return candidate
    return "https://" + candidate


def get_direct_url(url):
    """
    Resolve a redirect-style URL to the URL it points at.

    If the path of *url* contains "redirect" (case-insensitive) and the query
    string has a non-empty ``url`` parameter (e.g. ``/redirect?url=...``), the
    first value of that parameter is percent-decoded and returned. Otherwise
    *url* itself is returned. In both cases ``https://`` is prepended when the
    result has no ``http://`` / ``https://`` scheme.

    Args:
        url: Absolute or scheme-less URL string.

    Returns:
        The resolved URL string, always carrying an http(s) scheme.
    """
    parsed = urlparse(url)
    if "redirect" in parsed.path.lower():
        # parse_qs omits blank values, so a present key has at least one value.
        targets = parse_qs(parsed.query).get("url")
        if targets:
            return _ensure_http_scheme(unquote(targets[0]))
    return _ensure_http_scheme(url)

def _collect_unique_hrefs(driver):
    """Return the distinct http(s) link targets on the current page.

    Anchors with class "article-link" are listed first, followed by every
    other <a> element, preserving document order within each group.
    """
    anchors = (
        driver.find_elements(By.CSS_SELECTOR, "a.article-link")
        + driver.find_elements(By.TAG_NAME, "a")
    )
    hrefs = []
    seen = set()
    for anchor in anchors:
        href = (anchor.get_attribute("href") or "").strip()
        # Only http(s) targets can be articles; this also drops empty hrefs and
        # javascript:/mailto:/tel: links that would never load in a new tab.
        if not href.lower().startswith(("http://", "https://")):
            continue
        if href in seen:
            continue
        seen.add(href)
        hrefs.append(href)
    return hrefs


def _resolve_in_new_tab(driver, href, original_window):
    """Open *href* in a new tab and return the URL the browser ends up on.

    The tab is always closed and focus returned to *original_window*, even if
    loading fails part-way.
    """
    handles_before = set(driver.window_handles)
    driver.execute_script("window.open(arguments[0]);", href)
    # Wait for a handle that did not exist before, rather than assuming the
    # session had exactly one window.
    new_handles = WebDriverWait(driver, 10).until(
        lambda d: set(d.window_handles) - handles_before
    )
    driver.switch_to.window(next(iter(new_handles)))
    try:
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
        except TimeoutException:
            # Best effort: report the timeout but still record whatever URL
            # the tab reached.
            logging.error("Timeout loading article: %s", href)
        return get_direct_url(driver.current_url)
    finally:
        driver.close()
        driver.switch_to.window(original_window)


def get_article_direct_urls(driver, aggregator_url, max_urls=5):
    """
    Open the aggregator page and resolve its links to their final URLs.

    Links are gathered from <a> elements with class "article-link" first, then
    from every other <a> element with an http(s) href, with duplicates removed.
    Each link is opened in a new tab so that browser-side redirects are
    followed; the tab's resulting ``driver.current_url`` is then passed through
    ``get_direct_url``.

    Args:
        driver: Selenium WebDriver instance used for all navigation.
        aggregator_url: URL of the page whose links should be resolved.
        max_urls: Maximum number of URLs to resolve (default 5). Pass ``None``
            to resolve every link found.

    Returns:
        List of resolved URL strings, in the order the links were visited.

    Raises:
        TimeoutException: If the aggregator page's <body> does not appear
            within 60 seconds, or a new tab does not open within 10 seconds.
    """
    driver.get(aggregator_url)
    WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Read the href strings up front: the elements themselves are not needed
    # again, and holding strings avoids stale-element errors after tab switches.
    hrefs = _collect_unique_hrefs(driver)

    original_window = driver.current_window_handle
    direct_urls = []
    for href in hrefs:
        if max_urls is not None and len(direct_urls) >= max_urls:
            break
        direct_urls.append(_resolve_in_new_tab(driver, href, original_window))
        time.sleep(1)  # Delay between requests to avoid overloading the site
    return direct_urls

if __name__ == "__main__":
    # Page to crawl; replace with the URL of your own news-aggregator site.
    aggregator_url = "https://tldr.tech/"

    # Browser start-up switches, applied in the listed order.
    chrome_options = Options()
    for chrome_flag in (
        "--disable-gpu",
        "--start-maximized",
        "--disable-notifications",
        "--disable-popup-blocking",
    ):
        chrome_options.add_argument(chrome_flag)

    driver = webdriver.Chrome(options=chrome_options)

    # The browser is shut down even if link resolution raises.
    try:
        direct_article_urls = get_article_direct_urls(driver, aggregator_url)
        logging.info("Direct Article URLs:")
        for url in direct_article_urls:
            logging.info(url)
    finally:
        driver.quit()


2025-02-24 15:05:57,977 - INFO - Direct Article URLs:
2025-02-24 15:05:57,978 - INFO - https://tldr.tech/
2025-02-24 15:05:57,979 - INFO - https://tldr.tech/newsletters
2025-02-24 15:05:57,980 - INFO - https://advertise.tldr.tech/
2025-02-24 15:05:57,980 - INFO - https://tldr.tech/tech/2025-02-21
2025-02-24 15:05:57,981 - INFO - https://blog.pragmaticengineer.com/software-engineer-jobs-five-year-low/?utm_source=tldrnewsletter


In [6]:
direct_article_urls

['https://tldr.tech/',
 'https://tldr.tech/newsletters',
 'https://advertise.tldr.tech/',
 'https://tldr.tech/tech/2025-02-21',
 'https://blog.pragmaticengineer.com/software-engineer-jobs-five-year-low/?utm_source=tldrnewsletter']

In [7]:
data

{'title': 'TLDR - A Byte Sized Daily Tech Newsletter',
 'description': 'TLDR is the free daily newsletter with the most interesting stories in startups, tech and programming!',
 'text': 'Keep up with tech in 5 minutes Get the free daily email with summaries of the most interesting stories in startups 🚀, tech 📱, and programming 💻! Subscribe Join 1,250,000 readers for one daily email Feb 21 | Tech Software engineering job openings hit five-year low? (15 minute read) There has been a 35% decrease in the number of software engineering job vacancies in the US since January 2020. While the story is similar in Canada, things are different in the UK, France, Germany, and Australia. Australia is the only country where the number of jobs listed is not lower than in 2020. Software development jobs have been the biggest boom and bust in vacancies compared to all industries. This post discusses the reasons for the decrease and predictions for what growth in the tech industry is likely to be like th