# Web Scrapping

In [8]:
import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

In [1]:
URL = "https://www.bbc.com/news"

In [4]:
try:
    res = rq.get(URL)
    res.raise_for_status()
    print("La solicitud fue exitosa")
except Exception as e:
    print(f"Error al obtener la página: {res.status_code}")

La solicitud fue exitosa


In [5]:
soup = BeautifulSoup(res.text, 'html.parser')

In [12]:
links = soup.find_all('a', href=True)
articulos = [link['href'] for link in links if '/news/articles/' in link['href']]

In [9]:
data = []
for articulo in articulos:
    link = f"https://www.bbc.com{articulo}"
    articulo_res = rq.get(link)
    soup_articulo = BeautifulSoup(articulo_res.text, 'html.parser')
    titulo = soup_articulo.find('h1')
    texto =  ' '.join([palabra for p in soup_articulo.find_all('p') for palabra in p.text.strip().lower().split() if palabra.isalpha()])
    if titulo and texto:
        data.append({
            'Titulo': titulo.get_text(),
            'Texto': texto
        })

In [10]:
df = pd.DataFrame(data)

In [11]:
df

Unnamed: 0,Titulo,Texto
0,Putin sets out conditions for Ukraine ceasefire,russian president vladimir putin said he agree...
1,Is Putin ready for a ceasefire or playing for ...,russia is ready for a halt in says vladimir bu...
2,USAID kept Kajol alive – but after the cuts sh...,when kajol contracted tuberculosis in usaid ke...
3,Top Democrat Schumer backs Republican spending...,the us may avert a looming government shutdown...
4,Is Putin ready for a ceasefire or playing for ...,russia is ready for a halt in says vladimir bu...
5,Trump threatens 200% tariff on alcohol from EU,us president donald trump has threatened a tar...
6,'Killed in front of our eyes': How the Pakista...,mehboob hussain was riding the train home on t...
7,UN experts accuse Israel of sexual violence an...,un experts have accused israel of increasingly...
8,Donatella steps down as Versace creative director,donatella versace is to step down from her cre...
9,Judge orders Trump administration to give fire...,a judge has ordered several federal government...


In [13]:
path = "./dataset/"
df.to_csv(path + "BBC_Noticias.csv", index=False)