# Web Scrapping

In [1]:
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

In [117]:
URL = "https://www.bbc.com/news"

In [118]:
try:
    res = rq.get(URL)
    res.raise_for_status()
    print("La solicitud fue exitosa")
except Exception as e:
    print(f"Error al obtener la página: {res.status_code}")

La solicitud fue exitosa


In [119]:
soup = BeautifulSoup(res.text, 'html.parser')

In [178]:
links = soup.find_all('a', href=True)
articulos = [link['href'] for link in links if '/news/articles/' in link['href']]
print(f"{len(articulos)} Artículos")
print(articulos[:3])

34 Artículos
['/news/articles/cp9yv1gnzyvo', '/news/articles/cevx7lkznm1o', '/news/articles/cvge4l109r3o']


In [179]:
def get_titulo(soup_articulo):
    return soup_articulo.find('h1').get_text(strip=True) if titulo else ""

def get_texto(soup_articulo):
    text_blocks = soup_articulo.find_all('div', {'data-component': 'text-block'})
    return ' '.join(
        p.get_text(strip=True) for block in text_blocks for p in block.find_all('p')
    )

def get_etiquetas(soup_articulo):
    tag_block = soup_articulo.find('div', {'data-component': 'tags'})
    nuevos_articulos = [a.get_text(strip=True) for a in tag_block.find_all('a')] if tag_block else []
    return ", ".join(nuevos_articulos)

def get_contribuyentes(soup_articulo):
    contributors_block = soup_articulo.find('div', {'data-testid': 'byline-new-contributors'})
    contribuyentes = [span.get_text(strip=True) for span in contributors_block.find_all('span', {'class': 'sc-b42e7a8f-7 kItaYD'})] if contributors_block else []
    return ", ".join(contribuyentes)

# def get_enlaces_topics(soup_articulo):
#     tag_block = soup_articulo.find('div', {'data-component': 'tags'})
#     return [a['href'] for a in tag_block.find_all('a', href=True)] if tag_block else []

# def get_enlaces(topics):
#     articulos = []
#     for t in topics:
#         links = soup.find_all('a', href=True)
#         articulos = [link['href'] for link in links if '/news/articles/' in link['href']]
#     return articulos

In [180]:
data = []
for articulo in articulos:
    link = f"https://www.bbc.com{articulo}"
    articulo_res = rq.get(link)
    soup_articulo = BeautifulSoup(articulo_res.text, 'html.parser')
    titulo = get_titulo(soup_articulo)
    texto = get_texto(soup_articulo)
    tags = get_etiquetas(soup_articulo)
    contribuyentes = get_contribuyentes(soup_articulo)
    if titulo and texto:
        data.append({
            'Titulo': titulo,
            'Texto': texto,
            'Etiquetas': tags,
            'Contribuyentes': contribuyentes,
            'URL': link,
        })
    # nuevos_articulos = get_enlaces(get_enlaces_topics(soup_articulo))

In [181]:
df = pd.DataFrame(data)

In [182]:
df

Unnamed: 0,Titulo,Texto,Etiquetas,Contribuyentes,URL
0,US deports hundreds of Venezuelans despite cou...,Planes carrying more than 200 Venezuelans depo...,"El Salvador, Venezuela, United States",Brandon Drenon,https://www.bbc.com/news/articles/cp9yv1gnzyvo
1,Tornadoes and dust storms leave at least 34 de...,At least 34 people have died in the US after v...,"Missouri, Tornadoes, Texas, United States, Okl...","Thomas Mackintosh, Lisa Lambert & Tiffany Wert...",https://www.bbc.com/news/articles/cevx7lkznm1o
2,Trump moves to close down Voice of America,US President Donald Trump has signed an order ...,"Donald Trump, United States",Thomas Mackintosh & Merlyn Thomas,https://www.bbc.com/news/articles/cvge4l109r3o
3,Fisherman rescued after 95 days adrift eating ...,A Peruvian fisherman who survived 95 days lost...,"Peru, Pacific Ocean, Ecuador",Rachel Hagan,https://www.bbc.com/news/articles/cj92438d3xmo
4,Tornadoes and dust storms leave at least 34 de...,At least 34 people have died in the US after v...,"Missouri, Tornadoes, Texas, United States, Okl...","Thomas Mackintosh, Lisa Lambert & Tiffany Wert...",https://www.bbc.com/news/articles/cevx7lkznm1o
5,Death toll from North Macedonia nightclub fire...,At least 59 people have been killed and more t...,North Macedonia,"Rachel Hagan, Guy Delauney",https://www.bbc.com/news/articles/c70wdedp20wo
6,How our noisy world is seriously damaging our ...,We are surrounded by an invisible killer. One ...,"Medical research, Health, Noise pollution",James Gallagher,https://www.bbc.com/news/articles/crmjdm2m4yjo
7,South Africa should not be 'bullied' by US in ...,Opposition parties in South Africa have called...,"South Africa, Africa",Danai Nesta Kupemba,https://www.bbc.com/news/articles/cge10nnynzlo
8,Pope Francis says he faces 'period of trial' i...,"Pope Francis has said he faces a ""period of tr...","Catholicism, Vatican City, Pope Francis, Roman...",Adam Goldsmith & Seher Asaf,https://www.bbc.com/news/articles/crrd17xp79zo
9,US launches wave of air strikes on Yemen's Hou...,"The US has launched a ""decisive and powerful"" ...","Middle East, Houthis, Donald Trump",Malu Cursino,https://www.bbc.com/news/articles/c05mvr3j3yro


In [185]:
n = 0
print(df['URL'][n])
print(df['Titulo'][n])
print(df['Contribuyentes'][n])
print(df['Etiquetas'][n])
print(df['Texto'][n])

https://www.bbc.com/news/articles/cp9yv1gnzyvo
US deports hundreds of Venezuelans despite court order
Brandon Drenon
El Salvador, Venezuela, United States
Planes carrying more than 200 Venezuelans deported by the US have landed in El Salvador, hours after a US judge ordered the Trump administration not to do so. El Salvador's president, Nayib Bukele, wrote on social media that 238 members of the Venezuelan gang Tren de Aragua had arrived, along with 23 members of the international MS-13 gang, on Sunday morning. Their arrival in the central American nation came aftera federal judge blocked US President Donald Trump from invoking a centuries-old wartime lawto justify the deportations - something Bukele made fun of in a later post. "Oopsie... Too late," he said. The move by the US to send alleged criminals from other countries to El Salvador was an arrangement US Secretary of State Marco Rubio previously called "the most unprecedented and extraordinary migratory agreement anywhere in the 

In [186]:
path = "./dataset/"
df.to_csv(path + "BBC_Noticias.csv", index=False)