In [1]:
import re
from typing import Optional

import requests
from bs4 import BeautifulSoup

Nota para Max:

La función `get_summary(url, class_name)` es equivalente a llamar a `multi_reader(url, method='div', class_name=class_name)`, es decir, a `multi_reader` con `method='div'` y el mismo nombre de clase.

In [2]:
def multi_reader(url: str, with_headers: bool = False, method: str = 'p',
                 class_name: Optional[str] = None, timeout: float = 30.0) -> Optional[str]:
    """Given a url, fetch the page and return the text of the news article.

    Args
    ==========
    url (str): web address of the page that contains the article.
    with_headers (bool): when true, requests.get() is called with a
    browser-like User-Agent header; otherwise no custom headers are sent.
    method (str): extraction strategy. 'p' concatenates the text of every
    <p> tag in the page; 'div' returns the text of the first <div> whose
    class is `class_name`.
    class_name (str): name of the class which contains the text of the
    news in the url. Used when method == 'div'.
    timeout (float): seconds passed to requests.get() as its timeout, so a
    stalled server cannot block the caller forever.

    Returns
    ==========
    output (str | None): text of the article. None is returned when
    `method` is not recognized, when no text / no matching <div> is found,
    or when the request or the parsing raises (the error is printed, not
    re-raised).
    """

    # Validate the method first so a typo does not cost a network round trip.
    if method not in ('p', 'div'):
        print(f'{method} not recognized.\nEnter either \'p\' or \'div\'')
        return None

    try:
        request_kwargs = {'timeout': timeout}
        if with_headers:
            request_kwargs['headers'] = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        page = requests.get(url, **request_kwargs)
        # An HTTP error page must not be parsed as if it were the article.
        page.raise_for_status()

        if method == 'p':
            # 'html' asks BeautifulSoup for whichever HTML parser is installed;
            # kept as in the original so extraction results do not shift.
            soup = BeautifulSoup(page.text, 'html')
            output = ''.join(str(tag.text) for tag in soup.find_all('p'))
            return output if output else None

        # method == 'div'
        soup = BeautifulSoup(page.content, 'html.parser')
        container = soup.find('div', class_=class_name)
        if container is None:
            print('nimais')
            return None
        return container.text

    except Exception as e:
        # Best-effort reader: report the failure and let the caller see None.
        print(f'An error ocurred: {e}')
        return None

In [3]:
def news_extractor(url: str) -> Optional[str]:
    """Return the cleaned article text for a handful of supported news sites.

    The site is identified from the leading 'http(s)://host/' part of `url`
    and compared against the hard-coded sites below; each site has its own
    multi_reader() call and its own clean-up of boilerplate text.

    Args
    ==========
    url (str): web address of the article.

    Returns
    ==========
    output (str | None): cleaned text of the article. None is returned (and
    a message printed) when the site cannot be identified from `url` or is
    not one of the supported sites, and also when multi_reader() returns
    None for the page.
    """
    site = re.search(r"http(s?):\/\/(www\.)?(.+?\.).+?\/", url)
    if site is None:
        print(f'Could not identify the site of: {url}')
        return None
    parenturl = site.group()

    if parenturl == 'https://www.yahoo.com/':
        return multi_reader(url, method='div', class_name='caas-body')

    if parenturl == 'https://www.themarketsdaily.com/':
        raw = multi_reader(url, method='div', class_name='entry')
        if raw is None:
            return None
        # Drop everything from the related-links section onwards.
        body = raw.split('Recommended Stories')[0]
        body = re.sub(r'(\n){2,}', '\n', body)
        # NOTE(review): this literal only matches articles about Aflac;
        # other articles presumably carry a different company name - confirm.
        body = re.sub('Get Aflac alerts:', '', body)
        # Collapse any run of two or more whitespace characters to one space.
        return re.sub(r'(\s){2,}', ' ', body)

    if parenturl == 'https://businessmirror.com.ph/':
        raw = multi_reader(url, with_headers=True, method='p')
        if raw is None:
            return None
        return raw.split('Be the First to #KnowMoreInput your search keywords and press Enter')[0]

    if parenturl == 'https://www.dailypolitical.com/':
        raw = multi_reader(url, method='p')
        if raw is None:
            return None
        # Strip the tab indentation that follows line breaks, then cut the
        # trailing newsletter sign-up text.
        return re.sub(r'\n(\t){1,}', '\n', raw).split('Receive News & Ratings')[0]

    print(f'Unsupported site: {parenturl}')
    return None

Podemos probar las funciones con artículos de 4 páginas diferentes:

In [4]:
# One sample article per supported site; change the key used below to try
# a different one.
SAMPLE_URLS = {
    'yahoo': 'https://www.yahoo.com/news/israeli-defence-minister-warns-iran-001132797.html',
    'themarketsdaily': 'https://www.themarketsdaily.com/2024/08/11/forthright-wealth-management-llc-raises-holdings-in-aflac-incorporated-nyseafl.html',
    'businessmirror': 'https://businessmirror.com.ph/2024/08/12/generika-drugstore-salutes-uniformed-and-non-uniformed-heroes-with-exclusive-program/',
    'dailypolitical': 'https://www.dailypolitical.com/2024/08/11/bank-of-america-increases-eli-lilly-and-company-nyselly-price-target-to-1125-00.html',
}

url = SAMPLE_URLS['dailypolitical']
text = news_extractor(url)
print(text)


Posted by Tyrone Williams on Aug 11th, 2024
Eli Lilly and Company (NYSE:LLY – Get Free Report) had its price objective hoisted by stock analysts at Bank of America  from $1,000.00 to $1,125.00 in a report issued on Friday, Benzinga reports. The firm currently has a “buy” rating on the stock. Bank of America‘s target price points to a potential upside of 26.17% from the company’s current price.LLY has been the topic of several other reports. The Goldman Sachs Group upped their price target on Eli Lilly and Company from $650.00 to $723.00 and gave the company a “neutral” rating in a research report on Thursday, April 11th. Barclays upped their price objective on Eli Lilly and Company from $913.00 to $1,025.00 and gave the stock an “overweight” rating in a research note on Wednesday, July 10th. Berenberg Bank upped their price objective on Eli Lilly and Company from $850.00 to $1,000.00 and gave the stock a “buy” rating in a research note on Thursday, July 11th. Argus upped their price o