# Web Scraping

In [4]:
import os

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup

In [5]:
# Landing page that is downloaded and scanned for article links.
URL = "https://www.bbc.com/news"

In [6]:
# Download the landing page. Only request-level failures are handled here;
# the message reports the exception itself because `res` is not bound when
# `rq.get` raises before returning a response.
try:
    # A timeout keeps the cell from blocking forever on an unresponsive server.
    res = rq.get(URL, timeout=10)
    # Turn 4xx/5xx answers into exceptions so they are reported below.
    res.raise_for_status()
    print("La solicitud fue exitosa")
except rq.RequestException as e:
    print(f"Error al obtener la página: {e}")

La solicitud fue exitosa


In [7]:
# Parse the downloaded HTML using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(res.text, 'html.parser')

In [8]:
# Every anchor that carries an href attribute.
links = soup.find_all('a', href=True)
# Keep only the hrefs that point at article pages (repeats are kept as found).
articulos = [
    href
    for href in (anchor['href'] for anchor in links)
    if '/news/articles/' in href
]
print(f"{len(articulos)} Artículos")
print(articulos[:3])

39 Artículos
['/news/articles/c5y41z4351qo', '/news/articles/cnvz4097q8ro', '/news/articles/cj02033n18go']


In [14]:
def get_titulo(soup_articulo):
    """Return the stripped text of the first <h1>, or "" if there is none."""
    encabezado = soup_articulo.find('h1')
    if not encabezado:
        return ""
    return encabezado.get_text(strip=True)

def get_texto(soup_articulo):
    """Return the article body as one string.

    Collects the stripped text of every <p> found inside each
    <div data-component="text-block"> and joins the pieces with single
    spaces. Returns "" when no such paragraph exists.
    """
    parrafos = []
    for bloque in soup_articulo.find_all('div', {'data-component': 'text-block'}):
        for parrafo in bloque.find_all('p'):
            parrafos.append(parrafo.get_text(strip=True))
    return ' '.join(parrafos)

def get_etiquetas(soup_articulo):
    """Return the article's tag names as a single ", "-separated string.

    Tag names are the stripped texts of the <a> elements inside the first
    <div data-component="tags">. Returns "" when that div is absent.
    """
    contenedor = soup_articulo.find('div', {'data-component': 'tags'})
    if not contenedor:
        return ""
    return ", ".join(enlace.get_text(strip=True) for enlace in contenedor.find_all('a'))

def get_contribuyentes(soup_articulo):
    """Return the byline contributors as a single ", "-separated string.

    Looks inside the first <div data-testid="byline-new-contributors"> and
    collects the stripped text of each matching <span>. Returns "" when the
    byline div is absent.
    """
    firma = soup_articulo.find('div', {'data-testid': 'byline-new-contributors'})
    if not firma:
        return ""
    # NOTE(review): this class value looks auto-generated and may change when
    # the site is redeployed - confirm it still matches before relying on it.
    nombres = firma.find_all('span', {'class': 'sc-b42e7a8f-7 kItaYD'})
    return ", ".join(nombre.get_text(strip=True) for nombre in nombres)

# def get_enlaces_topics(soup_articulo):
#     tag_block = soup_articulo.find('div', {'data-component': 'tags'})
#     return [a['href'] for a in tag_block.find_all('a', href=True)] if tag_block else []

# def get_enlaces(topics):
#     articulos = []
#     for t in topics:
#         links = soup.find_all('a', href=True)
#         articulos = [link['href'] for link in links if '/news/articles/' in link['href']]
#     return articulos

In [15]:
# Download every article page and collect one record per article that has
# both a title and body text. Repeated hrefs in `articulos` are fetched as-is.
data = []
for articulo in articulos:
    link = f"https://www.bbc.com{articulo}"
    try:
        # Bound the wait and treat HTTP error statuses as failures, so a single
        # bad article is skipped instead of hanging or aborting the whole crawl.
        articulo_res = rq.get(link, timeout=10)
        articulo_res.raise_for_status()
    except rq.RequestException as e:
        print(f"Error al obtener {link}: {e}")
        continue
    soup_articulo = BeautifulSoup(articulo_res.text, 'html.parser')
    titulo = get_titulo(soup_articulo)
    texto = get_texto(soup_articulo)
    tags = get_etiquetas(soup_articulo)
    contribuyentes = get_contribuyentes(soup_articulo)
    # Pages without a headline or without body text are not kept.
    if titulo and texto:
        data.append({
            'Titulo': titulo,
            'Texto': texto,
            'Etiquetas': tags,
            'Contribuyentes': contribuyentes,
            'URL': link,
        })
    # nuevos_articulos = get_enlaces(get_enlaces_topics(soup_articulo))

In [16]:
# Build the table from the collected records: one row per article, columns
# taken from the record keys.
df = pd.DataFrame(data)

In [17]:
# Inspect the scraped table; it can still contain repeated articles here.
df

Unnamed: 0,Titulo,Texto,Etiquetas,Contribuyentes,URL
0,Canada PM Mark Carney says old relationship wi...,Canadian Prime Minister Mark Carney said that ...,"Trump tariffs, Donald Trump, Canada election 2...","Jessica Murphy & Ali Abbas Ahmadi, Bernd Debu...",https://www.bbc.com/news/articles/c5y41z4351qo
1,How will carmakers be affected by Trump's tari...,A day after US President Donald Trump said he ...,"Trump tariffs, Global trade, Car industry",Natalie Sherman,https://www.bbc.com/news/articles/cnvz4097q8ro
2,Australia to hold federal election on 3 May,Australia will head to the polls for a federal...,Australia,Tiffanie Turnbull,https://www.bbc.com/news/articles/cj02033n18go
3,Starmer accuses Putin of 'playing games' over ...,Western sanctions on Russia need to be increas...,"War in Ukraine, Volodymyr Zelensky, Keir Starm...",James Gregory and Laura Gozzi,https://www.bbc.com/news/articles/c78e2x7lz25o
4,How will carmakers be affected by Trump's tari...,A day after US President Donald Trump said he ...,"Trump tariffs, Global trade, Car industry",Natalie Sherman,https://www.bbc.com/news/articles/cnvz4097q8ro
5,Senior Trump officials ordered to preserve Sig...,A federal judge ordered White House officials ...,"Donald Trump, Tulsi Gabbard, US politics, Unit...",Kayla Epstein,https://www.bbc.com/news/articles/c9de770q9e0o
6,Marco Rubio says US revoked at least 300 forei...,US Secretary of State Marco Rubio said the US ...,"US immigration, Donald Trump, US politics",Madeline Halpert,https://www.bbc.com/news/articles/c75720q9d7lo
7,King experiences temporary side effects of can...,King Charles III spent a short period of time ...,King Charles III,"James Gregory and Sean Coughlan, royal corresp...",https://www.bbc.com/news/articles/c3vwgq24klxo
8,Australia to hold federal election on 3 May,Australia will head to the polls for a federal...,Australia,Tiffanie Turnbull,https://www.bbc.com/news/articles/cj02033n18go
9,Trump withdraws Elise Stefanik's nomination to...,President Donald Trump has pulled the nominati...,"United Nations, Donald Trump, US politics",Nada Tawfik,https://www.bbc.com/news/articles/c3rnge7ql7wo


In [21]:
# Drop rows that are exact repeats and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,Titulo,Texto,Etiquetas,Contribuyentes,URL
0,Canada PM Mark Carney says old relationship wi...,Canadian Prime Minister Mark Carney said that ...,"Trump tariffs, Donald Trump, Canada election 2...","Jessica Murphy & Ali Abbas Ahmadi, Bernd Debu...",https://www.bbc.com/news/articles/c5y41z4351qo
1,How will carmakers be affected by Trump's tari...,A day after US President Donald Trump said he ...,"Trump tariffs, Global trade, Car industry",Natalie Sherman,https://www.bbc.com/news/articles/cnvz4097q8ro
2,Australia to hold federal election on 3 May,Australia will head to the polls for a federal...,Australia,Tiffanie Turnbull,https://www.bbc.com/news/articles/cj02033n18go
3,Starmer accuses Putin of 'playing games' over ...,Western sanctions on Russia need to be increas...,"War in Ukraine, Volodymyr Zelensky, Keir Starm...",James Gregory and Laura Gozzi,https://www.bbc.com/news/articles/c78e2x7lz25o
4,Senior Trump officials ordered to preserve Sig...,A federal judge ordered White House officials ...,"Donald Trump, Tulsi Gabbard, US politics, Unit...",Kayla Epstein,https://www.bbc.com/news/articles/c9de770q9e0o
5,Marco Rubio says US revoked at least 300 forei...,US Secretary of State Marco Rubio said the US ...,"US immigration, Donald Trump, US politics",Madeline Halpert,https://www.bbc.com/news/articles/c75720q9d7lo
6,King experiences temporary side effects of can...,King Charles III spent a short period of time ...,King Charles III,"James Gregory and Sean Coughlan, royal corresp...",https://www.bbc.com/news/articles/c3vwgq24klxo
7,Trump withdraws Elise Stefanik's nomination to...,President Donald Trump has pulled the nominati...,"United Nations, Donald Trump, US politics",Nada Tawfik,https://www.bbc.com/news/articles/c3rnge7ql7wo
8,Can Vietnam golf its way out of new Trump tari...,Countries around the world are braced for Dona...,"Vietnam, International Business, Trump tariffs...",Annabelle Liang,https://www.bbc.com/news/articles/cge1r1ezw74o
9,Salman Rushdie to release first fiction since ...,Acclaimed author Sir Salman Rushdie is set to ...,"Salman Rushdie, Books",Malu Cursino,https://www.bbc.com/news/articles/c04zw0qvergo


In [22]:
# Print every field of the n-th record, one per line, as a spot check.
n = 0
for columna in ('URL', 'Titulo', 'Contribuyentes', 'Etiquetas', 'Texto'):
    print(df[columna][n])

https://www.bbc.com/news/articles/c5y41z4351qo
Canada PM Mark Carney says old relationship with US 'is over'
Jessica Murphy &  Ali Abbas Ahmadi, Bernd Debusmann
Trump tariffs, Donald Trump, Canada election 2025, US politics, Canada
Canadian Prime Minister Mark Carney said that Canada's old relationship with the United States, "based on deepening integration of our economies and tight security and military cooperation, is over". Speaking to reporters in Ottawa after a cabinet meeting, Carney said Canadians must "fundamentally reimagine our economy" in the face of US President Donald Trump's tariffs. He said Canada would respond with retaliatory tariffs that will have "maximum impact" on the US. Trump announced on Wednesday he would target imported vehicles and vehicle parts with a 25% tax, stating: "This is permanent." Carney, the Liberal Party leader, called the original Canada-US Automotive Products Agreement signed in 1965 the most important deal in his lifetime. "That's finished wit

In [23]:
# Write the de-duplicated table to disk without the index column.
path = "./dataset/"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(path, exist_ok=True)
df.to_csv(path + "BBC_Noticias.csv", index=False)