In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from urllib.parse import urljoin

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import time

In [2]:
def get_page_content(url, max_retries=3, delay=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    retries = 0
    while retries < max_retries:
        try:
            response = requests.get(url, headers=headers)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the page: {e}")
            retries += 1
            time.sleep(delay)
    return None

def extract_article_content(soup):
    # Remove unwanted elements like scripts, styles, and comments
    for element in soup(["script", "style", "meta", "noscript", "iframe", "aside", "header", "footer", "nav"]):
        element.decompose()

    # Find the main content container
    main_content = None
    content_tags = ["article", "div", "section", "main"]
    for tag in content_tags:
        main_content = soup.find(tag, {"class": re.compile("(article|post|content|entry|body|story)", re.IGNORECASE)})
        if main_content:
            break

    if main_content is None:
        main_content = soup.find("body")

    # Extract paragraphs from the main content
    paragraphs = main_content.find_all("p")
    article_content = "\n".join([p.get_text(strip=True) for p in paragraphs])

    # If the extracted content is too short, try extracting from the whole page
    if len(article_content) < 200:
        paragraphs = soup.find_all("p")
        article_content = "\n".join([p.get_text(strip=True) for p in paragraphs])

    # Remove empty lines and extra whitespace
    article_content = re.sub(r"\n+", "\n", article_content).strip()
    article_content = re.sub(r"\s+", " ", article_content)

    return article_content




In [3]:
from bs4 import BeautifulSoup
from datetime import datetime
import re
import json

def extract_article_title(soup):
    title_element = soup.find('title')
    if title_element:
        return title_element.get_text(strip=True)

    header_element = soup.find('header')
    if header_element:
        title_element = header_element.find('h1')
        if title_element:
            return title_element.get_text(strip=True)

    title_element = soup.find('h1')
    if title_element:
        return title_element.get_text(strip=True)

    return ""

def extract_article_date(soup):
    date_element = soup.find('time')
    if date_element:
        datetime_str = date_element.get('datetime') or date_element.get('data-timestamp')
        if datetime_str:
            try:
                return datetime.fromisoformat(datetime_str).strftime("%Y-%m-%d")
            except ValueError:
                pass

    date_element = soup.find(class_=re.compile('date|timestamp|publish', re.IGNORECASE))
    if date_element:
        datetime_str = date_element.get_text(strip=True)
        try:
            return datetime.strptime(datetime_str, "%Y-%m-%d").strftime("%Y-%m-%d")
        except ValueError:
            pass

    script_element = soup.find('script', type='application/ld+json')
    if script_element:
        try:
            ld_json = json.loads(script_element.string)
            if 'datePublished' in ld_json:
                return datetime.fromisoformat(ld_json['datePublished']).strftime("%Y-%m-%d")
        except (json.JSONDecodeError, KeyError, ValueError):
            pass

    return ""

def extract_article_author(soup):
    author_element = soup.find(class_=re.compile('author|byline', re.IGNORECASE))
    if author_element:
        return author_element.get_text(strip=True)

    script_element = soup.find('script', type='application/ld+json')
    if script_element:
        try:
            ld_json = json.loads(script_element.string)
            if 'author' in ld_json:
                if isinstance(ld_json['author'], dict):
                    return ld_json['author'].get('name', '')
                elif isinstance(ld_json['author'], list):
                    return ', '.join([author.get('name', '') for author in ld_json['author']])
        except (json.JSONDecodeError, KeyError):
            pass

    return ""

def extract_article_images(soup, base_url):
    img_tags = soup.find_all('img')
    image_urls = []

    for img in img_tags:
        img_url = img.get('src') or img.get('data-src') or img.get('srcset') or img.get('data-srcset')
        if img_url:
            if img_url.startswith('//'):
                img_url = 'https:' + img_url
            elif img_url.startswith('/'):
                img_url = urljoin(base_url, img_url)
            elif not img_url.startswith('http'):
                img_url = urljoin(base_url, img_url)
            image_urls.append(img_url)

    return list(set(image_urls))

In [4]:
from gnews import GNews

google_news = GNews(language='es', 
                    country='Mexico', 
                    period='7d', 
                    start_date=None, 
                    end_date=None, 
                    max_results=5, 
                    #exclude_websites=['yahoo.com', 'cnn.com'],
                    #proxy=proxy
                    )
news = google_news.get_news('espanol Claudia Sheinbaum')

In [5]:
def extract_article_content_selenium(url,proxy=None):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36')
    if proxy is not None:
        options.add_argument(f'--proxy-server={proxy}')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)

    # Find the article content using common HTML tags and classes
    article_tags = [
        {'name': 'article'},
        {'name': 'div', 'class': 'article-body'},
        {'name': 'div', 'class': 'article-content'},
        {'name': 'div', 'class': 'entry-content'},
        {'name': 'div', 'class': 'post-content'},
        {'name': 'div', 'class': 'story-body'},
        {'name': 'div', 'itemprop': 'articleBody'},
        {'name': 'div', 'id': 'article-body'},
        {'name': 'div', 'class': 'article-text'},
        {'name': 'div', 'class': 'post-text'},
        {'name': 'div', 'class': 'post-body'},
        {'name': 'div', 'class': 'rich-text'},
        {'name': 'div', 'class': 'article-content'},
        {'name': 'section', 'class': 'article-body'},
        {'name': 'section', 'class': 'post-content'},
        {'name': 'section', 'class': 'entry-content'},
    ]

    article_content = ''
    for tag in article_tags:
        try:
            if 'class' in tag:
                article_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, tag['class']))
                )
            elif 'id' in tag:
                article_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, tag['id']))
                )
            elif 'itemprop' in tag:
                article_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, f"//*[@itemprop='{tag['itemprop']}']"))
                )
            else:
                article_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, tag['name']))
                )
            
            paragraphs = article_element.find_elements(By.TAG_NAME, 'p')
            article_content = '\n'.join([p.text for p in paragraphs])
            
            if article_content:
                break
        except (TimeoutException, StaleElementReferenceException):
            continue

    if not article_content:
        # If the article content is still not found, try to extract paragraphs from the entire page
        try:
            paragraphs = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.TAG_NAME, 'p'))
            )
            article_content = '\n'.join([p.text for p in paragraphs])
        except (TimeoutException, StaleElementReferenceException):
            pass

    driver.quit()

    # Remove empty lines and extra whitespace
    article_content = re.sub(r"\n+", "\n", article_content).strip()
    article_content = re.sub(r"\s+", " ", article_content)

    return article_content

def extract_article_content_selenium_js(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)

    # Wait for the page to load and JavaScript to render the content
    time.sleep(5)  # Adjust the delay as needed

    # Find the article content using common HTML tags and classes
    article_tags = [
        {'name': 'article'},
        {'name': 'div', 'class': 'article-body'},
        {'name': 'div', 'class': 'article-content'},
        {'name': 'div', 'class': 'entry-content'},
        {'name': 'div', 'class': 'post-content'},
        {'name': 'div', 'class': 'story-body'},
        {'name': 'div', 'itemprop': 'articleBody'},
        {'name': 'div', 'id': 'article-body'},
        {'name': 'div', 'class': 'article-text'},
        {'name': 'div', 'class': 'post-text'},
        {'name': 'div', 'class': 'post-body'},
        {'name': 'div', 'class': 'rich-text'},
        {'name': 'div', 'class': 'article-content'},
        {'name': 'section', 'class': 'article-body'},
        {'name': 'section', 'class': 'post-content'},
        {'name': 'section', 'class': 'entry-content'},
    ]

    article_content = ''
    for tag in article_tags:
        try:
            if 'class' in tag:
                article_element = driver.find_element(By.CLASS_NAME, tag['class'])
            elif 'id' in tag:
                article_element = driver.find_element(By.ID, tag['id'])
            elif 'itemprop' in tag:
                article_element = driver.find_element(By.XPATH, f"//*[@itemprop='{tag['itemprop']}']")
            else:
                article_element = driver.find_element(By.TAG_NAME, tag['name'])
            
            paragraphs = article_element.find_elements(By.TAG_NAME, 'p')
            article_content = '\n'.join([p.text for p in paragraphs])
            
            if article_content:
                break
        except (NoSuchElementException, StaleElementReferenceException):
            continue

    if not article_content:
        # If the article content is still not found, try to extract paragraphs from the entire page
        try:
            paragraphs = driver.find_elements(By.TAG_NAME, 'p')
            article_content = '\n'.join([p.text for p in paragraphs])
        except (NoSuchElementException, StaleElementReferenceException):
            pass

    driver.quit()

    # Remove empty lines and extra whitespace
    article_content = re.sub(r"\n+", "\n", article_content).strip()
    article_content = re.sub(r"\s+", " ", article_content)

    return article_content

In [6]:
def scrape_news_article(gnews_data=None, url=None):
    if gnews_data is None:
        pass
    else:
        url = gnews_data['url']
    
    page_content = url
    page_content = get_page_content(url)
    
    if page_content:
        soup = BeautifulSoup(page_content, 'html.parser')
        article_title = extract_article_title(soup)
        article_date = extract_article_date(soup)
        article_author = extract_article_author(soup)
        article_images = extract_article_images(soup, url)
        # Try extracting content using the existing function first
        article_content = extract_article_content_selenium(url)
        
        # If the content is empty, try the new function for JavaScript-rendered content
        if not article_content.strip():
            article_content = extract_article_content_selenium_js(url)
        #article_content = ''
        
        article_data = {
            'url':url,
            'title': article_title,
            'date': article_date,
            'author': article_author,
            'content': article_content,
            'images': article_images
        }
        if gnews_data is not None:
            article_data['metadata'] = gnews_data
        return article_data
    return None

In [7]:
# Example usage
article_data = scrape_news_article(news[1])
article_data

{'url': 'https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vY25uZXNwYW5vbC5jbm4uY29tLzIwMjQvMDMvMTEvY2xhdWRpYS1zaGVpbmJhdW0tcHJvcHVlc3RhLXJlbGFjaW9uLW1leGljby1lZXV1LW9yaXgv0gFgaHR0cHM6Ly9jbm5lc3Bhbm9sLmNubi5jb20vMjAyNC8wMy8xMS9jbGF1ZGlhLXNoZWluYmF1bS1wcm9wdWVzdGEtcmVsYWNpb24tbWV4aWNvLWVldXUtb3JpeC9hbXAv?oc=5&hl=en-US&gl=US&ceid=US:en',
 'title': 'Google News',
 'date': '',
 'author': '',
 'content': '',
 'images': [],
 'metadata': {'title': '¿Qué dijo y qué propone Sheinbaum sobre las relaciones entre EE.UU. y México? - CNN en Español',
  'description': '¿Qué dijo y qué propone Sheinbaum sobre las relaciones entre EE.UU. y México?  CNN en Español',
  'published date': 'Mon, 11 Mar 2024 19:27:00 GMT',
  'url': 'https://news.google.com/rss/articles/CBMiXGh0dHBzOi8vY25uZXNwYW5vbC5jbm4uY29tLzIwMjQvMDMvMTEvY2xhdWRpYS1zaGVpbmJhdW0tcHJvcHVlc3RhLXJlbGFjaW9uLW1leGljby1lZXV1LW9yaXgv0gFgaHR0cHM6Ly9jbm5lc3Bhbm9sLmNubi5jb20vMjAyNC8wMy8xMS9jbGF1ZGlhLXNoZWluYmF1bS1wcm9wdWVzdGEtcmVsYWNpb24tbWV4aWNvL