In [1]:
def crawler(google_news, news, i):
    """Return a copy of ``news[i]`` enriched with the full article under ``'naive'``.

    Args:
        google_news: Client object exposing ``get_full_article(url)``.
        news: Sequence of result dicts; each must contain a ``'url'`` key.
        i: Index of the entry in ``news`` to enrich.

    Returns:
        A shallow copy of ``news[i]`` with an extra ``'naive'`` key. The value
        is a dict with ``authors``, ``title``, ``text`` and ``images`` read
        from the fetched article, or ``None`` when the client returned no
        article. The original entry in ``news`` is not modified.
    """
    entry = news[i]
    full_article = google_news.get_full_article(entry['url'])

    # Work on a copy so the caller's result list stays untouched.
    article = entry.copy()

    if full_article is None:
        # The client signals a failed download/parse with None (this is how
        # GNews.get_full_article is understood to behave — confirm against the
        # installed version). Record the miss instead of raising
        # AttributeError on the attribute reads below.
        article['naive'] = None
    else:
        article['naive'] = {
            'authors': full_article.authors,
            'title': full_article.title,
            'text': full_article.text,
            'images': full_article.images,
        }

    return article
    

In [2]:
import requests
import time
def get_page_content(url, max_retries=3, delay=5, timeout=10):
    """Fetch ``url`` with ``requests`` and return the response body as text.

    The request is sent with a desktop-Chrome ``User-Agent`` header. Any
    ``requests`` error (including non-2xx statuses raised by
    ``raise_for_status``) is printed and the request is retried.

    Args:
        url: Address of the page to download.
        max_retries: Maximum number of attempts; values <= 0 make no attempt.
        delay: Seconds to pause between two consecutive attempts.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the call indefinitely.

    Returns:
        The response text on the first successful attempt, or ``None`` when
        every attempt failed.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the page: {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay returning None.
            if attempt < max_retries:
                time.sleep(delay)
    return None

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
import re


def extract_article_content_selenium(url, proxy=None, wait_timeout=10):
    """Load ``url`` in headless Chrome and return the article text as one line.

    Candidate article containers are probed in order; for each one the
    function waits up to ``wait_timeout`` seconds for it to appear and then
    joins the text of its ``<p>`` descendants. The first container that
    yields non-empty text wins. If none does, every ``<p>`` on the page is
    used instead.

    Args:
        url: Page to open.
        proxy: Optional value for Chrome's ``--proxy-server`` argument.
        wait_timeout: Seconds each explicit wait may block before giving up
            on a locator.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space and no leading/trailing whitespace; ``''`` when nothing was
        found.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36')
    if proxy is not None:
        options.add_argument(f'--proxy-server={proxy}')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Containers that commonly wrap article bodies, in probing order.
    # Class lookups match any element carrying the class regardless of its
    # tag, so div/section variants of the same class are a single probe;
    # listing them twice would only cost an extra full wait on a miss.
    locators = [
        (By.TAG_NAME, 'article'),
        (By.CLASS_NAME, 'article-body'),
        (By.CLASS_NAME, 'article-content'),
        (By.CLASS_NAME, 'entry-content'),
        (By.CLASS_NAME, 'post-content'),
        (By.CLASS_NAME, 'story-body'),
        (By.XPATH, "//*[@itemprop='articleBody']"),
        (By.ID, 'article-body'),
        (By.CLASS_NAME, 'article-text'),
        (By.CLASS_NAME, 'post-text'),
        (By.CLASS_NAME, 'post-body'),
        (By.CLASS_NAME, 'rich-text'),
    ]

    article_content = ''
    try:
        driver.get(url)
        wait = WebDriverWait(driver, wait_timeout)

        for locator in locators:
            try:
                container = wait.until(EC.presence_of_element_located(locator))
                paragraphs = container.find_elements(By.TAG_NAME, 'p')
                article_content = '\n'.join(p.text for p in paragraphs)
            except (TimeoutException, StaleElementReferenceException):
                # Locator absent (or element replaced mid-read): try the next one.
                continue
            if article_content:
                break

        if not article_content:
            # Last resort: take every paragraph on the page.
            try:
                paragraphs = wait.until(
                    EC.presence_of_all_elements_located((By.TAG_NAME, 'p'))
                )
                article_content = '\n'.join(p.text for p in paragraphs)
            except (TimeoutException, StaleElementReferenceException):
                pass
    finally:
        # Always shut the browser down, even when navigation or an
        # unexpected WebDriver error aborts the extraction.
        driver.quit()

    # Collapse all whitespace (newlines included) to single spaces and trim.
    return re.sub(r"\s+", " ", article_content).strip()

def extract_article_content_selenium_js(url, proxy=None, render_delay=5):
    """Load ``url`` in headless Chrome, pause for rendering, and return the article text.

    Unlike ``extract_article_content_selenium`` this variant sleeps a fixed
    ``render_delay`` after navigation and then queries the DOM once per
    locator without explicit waits. The first candidate container whose
    ``<p>`` descendants yield non-empty text wins; otherwise every ``<p>`` on
    the page is used.

    Args:
        url: Page to open.
        proxy: Optional value for Chrome's ``--proxy-server`` argument.
        render_delay: Seconds to sleep after navigation so client-side
            scripts can populate the page.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space and no leading/trailing whitespace; ``''`` when nothing was
        found.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode
    if proxy is not None:
        options.add_argument(f'--proxy-server={proxy}')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Containers that commonly wrap article bodies, in probing order.
    # Class lookups match any element carrying the class regardless of its
    # tag, so div/section variants of the same class are a single probe.
    locators = [
        (By.TAG_NAME, 'article'),
        (By.CLASS_NAME, 'article-body'),
        (By.CLASS_NAME, 'article-content'),
        (By.CLASS_NAME, 'entry-content'),
        (By.CLASS_NAME, 'post-content'),
        (By.CLASS_NAME, 'story-body'),
        (By.XPATH, "//*[@itemprop='articleBody']"),
        (By.ID, 'article-body'),
        (By.CLASS_NAME, 'article-text'),
        (By.CLASS_NAME, 'post-text'),
        (By.CLASS_NAME, 'post-body'),
        (By.CLASS_NAME, 'rich-text'),
    ]

    article_content = ''
    try:
        driver.get(url)

        # Give JavaScript time to render the content before querying the DOM.
        time.sleep(render_delay)

        for locator in locators:
            try:
                container = driver.find_element(*locator)
                paragraphs = container.find_elements(By.TAG_NAME, 'p')
                article_content = '\n'.join(p.text for p in paragraphs)
            except (NoSuchElementException, StaleElementReferenceException):
                # Locator absent (or element replaced mid-read): try the next one.
                continue
            if article_content:
                break

        if not article_content:
            # Last resort: take every paragraph on the page.
            try:
                paragraphs = driver.find_elements(By.TAG_NAME, 'p')
                article_content = '\n'.join(p.text for p in paragraphs)
            except (NoSuchElementException, StaleElementReferenceException):
                pass
    finally:
        # Always shut the browser down, even when navigation or an
        # unexpected WebDriver error aborts the extraction.
        driver.quit()

    # Collapse all whitespace (newlines included) to single spaces and trim.
    return re.sub(r"\s+", " ", article_content).strip()

In [4]:
def scrape_news_article(article):
    """Attach the scraped body text to ``article`` under the ``'content'`` key.

    The URL in ``article['url']`` is first fetched with ``get_page_content``;
    a falsy result means the page is treated as unavailable and ``'content'``
    is set to ``None``. Otherwise the text comes from
    ``extract_article_content_selenium``, falling back to
    ``extract_article_content_selenium_js`` when that text is blank.

    Args:
        article: Dict with at least a ``'url'`` key. It is updated in place.

    Returns:
        The same ``article`` dict that was passed in.
    """
    target_url = article['url']

    # Guard: nothing to scrape when the plain HTTP fetch produced nothing.
    if not get_page_content(target_url):
        article['content'] = None
        return article

    body_text = extract_article_content_selenium(target_url)
    if not body_text.strip():
        # Blank result: retry with the variant that pauses for JS rendering.
        body_text = extract_article_content_selenium_js(target_url)

    article['content'] = body_text
    return article

In [5]:
from gnews import GNews

# Configure the Google News client used by the rest of the notebook:
# language 'es', country 'Mexico', at most 10 results. start_date/end_date
# are passed as None, so `period` is the only time filter given here
# ('14d' presumably means the last 14 days — confirm against the GNews docs).
google_news = GNews(language='es', 
                    country='Mexico', 
                    period='14d', 
                    start_date=None, 
                    end_date=None, 
                    max_results=10, 
                    # Optional arguments, currently disabled:
                    #exclude_websites=['yahoo.com', 'cnn.com'],
                    #proxy=proxy
                    )
# Run the search and keep the results in `news` for the cells below.
news = google_news.get_news('Crisis del agua cdmx claudia sheinbaum')
# Bare expression so the notebook displays the results.
news

[{'title': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica - Muy Interesante',
  'description': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica  Muy Interesante',
  'published date': 'Mon, 11 Mar 2024 09:00:00 GMT',
  'url': 'https://news.google.com/rss/articles/CBMiNWh0dHBzOi8vd3d3Lm11eWludGVyZXNhbnRlLmNvbS5teC9zb2NpZWRhZC8zODk1NC5odG1s0gEA?oc=5&hl=en-US&gl=US&ceid=US:en',
  'publisher': {'href': 'https://www.muyinteresante.com.mx',
   'title': 'Muy Interesante'}},
 {'title': 'Clara Brugada llama a plan megalopolitano y a largo plazo para resolver crisis agua en la CDMX - Político MX',
  'description': 'Clara Brugada llama a plan megalopolitano y a largo plazo para resolver crisis agua en la CDMX  Político MX',
  'published date': 'Wed, 28 Feb 2024 08:00:00 GMT',
  'url': 'https://news.google.com/rss/articles/CBMicmh0dHBzOi8vcG9saXRpY28ubXgvY2xhcmEtYnJ1Z2FkYS1sbGFtYS1hLXBsYW4tbWVnYWxvcG9saXRhbm8teS1hLWxhcmdvLXBsYXpv

In [6]:
# Enrich the first search result (index 0) via crawler(), which returns a copy
# of the entry with the fetched article stored under the 'naive' key.
article = crawler(google_news,news,0)
# Bare expression so the notebook displays the enriched record.
article

{'title': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica - Muy Interesante',
 'description': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica  Muy Interesante',
 'published date': 'Mon, 11 Mar 2024 09:00:00 GMT',
 'url': 'https://news.google.com/rss/articles/CBMiNWh0dHBzOi8vd3d3Lm11eWludGVyZXNhbnRlLmNvbS5teC9zb2NpZWRhZC8zODk1NC5odG1s0gEA?oc=5&hl=en-US&gl=US&ceid=US:en',
 'publisher': {'href': 'https://www.muyinteresante.com.mx',
  'title': 'Muy Interesante'},
 'naive': {'authors': ['Publicado Por',
   'Andrea Fischer',
   'Editora Y Periodista Científica',
   'Talia Cohen',
   'Istock Photo.',
   'Sara Carrasco',
   'Jennifer Delgado',
   'Andrea Arzola',
   'Andrea Sirvent',
   'Glóbuloazul'],
  'title': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica',
  'text': 'Es innegable. A causa de la crisis climática global, la Ciudad de México se enfrenta a la escasez de agua más grave que

In [7]:
# Add the scraped body text under 'content'. scrape_news_article() writes into
# the dict it receives and returns that same dict, so `article` is rebound to
# the object it already referenced.
article = scrape_news_article(article)
# Bare expression so the notebook displays the updated record.
article

/bin/sh: line 1: google-chrome: command not found
/bin/sh: line 1: google-chrome: command not found
03/12/2024 11:38:11 AM - Get LATEST chromedriver version for google-chrome
03/12/2024 11:38:12 AM - Get LATEST chromedriver version for google-chrome
03/12/2024 11:38:12 AM - Driver [/home/uumami/.wdm/drivers/chromedriver/linux64/115.0.5790.170/chromedriver-linux64/chromedriver] found in cache


{'title': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica - Muy Interesante',
 'description': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica  Muy Interesante',
 'published date': 'Mon, 11 Mar 2024 09:00:00 GMT',
 'url': 'https://news.google.com/rss/articles/CBMiNWh0dHBzOi8vd3d3Lm11eWludGVyZXNhbnRlLmNvbS5teC9zb2NpZWRhZC8zODk1NC5odG1s0gEA?oc=5&hl=en-US&gl=US&ceid=US:en',
 'publisher': {'href': 'https://www.muyinteresante.com.mx',
  'title': 'Muy Interesante'},
 'naive': {'authors': ['Publicado Por',
   'Andrea Fischer',
   'Editora Y Periodista Científica',
   'Talia Cohen',
   'Istock Photo.',
   'Sara Carrasco',
   'Jennifer Delgado',
   'Andrea Arzola',
   'Andrea Sirvent',
   'Glóbuloazul'],
  'title': '¿Cómo enfrentará la CDMX la crisis de agua este 2024? Un especialista nos explica',
  'text': 'Es innegable. A causa de la crisis climática global, la Ciudad de México se enfrenta a la escasez de agua más grave que