# HTML Request & BeautifulSoup

### Import Libraries

In [112]:
import requests
# Text Parser Identifier
from bs4 import BeautifulSoup
# Data Manipulation
import pandas as pd

In [14]:
# Getting URL
url = 'https://www.pagina12.com.ar'

In [15]:
# Retrieving URL
p12 = requests.get(url)

In [16]:
# Checking on Server Status Code
p12.status_code

200

In [17]:
print(p12.text)

<!DOCTYPE html><html class="no-js " lang="es"><head><meta charset="utf-8"><title>Página | 12: La otra mirada sobre Argentina y el mundo</title><meta name="google-site-verification" content="x6zSdT0DBcKDmridH4LpEVrCmxcOunR2dgBQVmuL6fg"><link rel="canonical" href="https://www.pagina12.com.ar"><script type="application/ld+json">{"@context": "http://schema.org","@type": "Organization","name": "Página12","url": "https://www.pagina12.com.ar","logo": {"@type": "ImageObject","url": "https://www.pagina12.com.ar/assets/media/logo_default_p12.png","width": "600","height": "60"},"sameAs":["https://twitter.com/pagina12","https://www.youtube.com/channel/UCJNDedOnljCssaiRZqg8-Dg","https://www.instagram.com/pagina12/","https://www.facebook.com/Pagina12ok/"]}</script><meta property="description" name="description" content="Información sobre Argentina y el mundo. Noticias en fotos y videos de los principales hechos y acontecimientos del país. Análisis, opinión y entrevistas."><meta property="fb:pages" n

### Looking for Title's Header

#### Many times the response to the request can be something other than text: an image, an audio file, a video, etc.

In [18]:
p12.content

b'<!DOCTYPE html><html class="no-js " lang="es"><head><meta charset="utf-8"><title>P\xc3\xa1gina | 12: La otra mirada sobre Argentina y el mundo</title><meta name="google-site-verification" content="x6zSdT0DBcKDmridH4LpEVrCmxcOunR2dgBQVmuL6fg"><link rel="canonical" href="https://www.pagina12.com.ar"><script type="application/ld+json">{"@context": "http://schema.org","@type": "Organization","name": "P\xc3\xa1gina12","url": "https://www.pagina12.com.ar","logo": {"@type": "ImageObject","url": "https://www.pagina12.com.ar/assets/media/logo_default_p12.png","width": "600","height": "60"},"sameAs":["https://twitter.com/pagina12","https://www.youtube.com/channel/UCJNDedOnljCssaiRZqg8-Dg","https://www.instagram.com/pagina12/","https://www.facebook.com/Pagina12ok/"]}</script><meta property="description" name="description" content="Informaci\xc3\xb3n sobre Argentina y el mundo. Noticias en fotos y videos de los principales hechos y acontecimientos del pa\xc3\xads. An\xc3\xa1lisis, opini\xc3\xb3n

#### Let's look at other elements of the response. Response headers

In [19]:
p12.headers

{'Date': 'Fri, 05 May 2023 15:56:43 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'vary': 'Accept-Encoding', 'x-dns-prefetch-control': 'off', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-download-options': 'noopen', 'x-content-type-options': 'nosniff', 'x-xss-protection': '1; mode=block', 'x-etag': 'W/"712d0-87f0fDHn2qvuFQg79awZstDu+Kc"', 'x-backend': 'frontend', 'x-type': 'Dynamic URI', 'x-backend-ttl': '120.000', 'age': '66', 'grace': '86400.000 none', 'ttl': '53.717', 'x-instance': 'cache-front-prod-varnish-779f5667bd-2q89p', 'x-restarts': '0', 'x-cache': 'hit cached', 'CF-Cache-Status': 'DYNAMIC', 'Server': 'cloudflare', 'CF-RAY': '7c2a2ccfec121f3d-DEN', 'Content-Encoding': 'br'}

#### Header's Requests 

In [20]:
p12.request.headers

{'User-Agent': 'python-requests/2.28.1', 'Accept-Encoding': 'gzip, deflate, br', 'Accept': '*/*', 'Connection': 'keep-alive'}

#### The request's content recetly performed warned us about python library uses & that it is not a conventional browser. Can be modified

In [21]:
p12.request.method

'GET'

In [22]:
p12.request.url

'https://www.pagina12.com.ar/'

#### Since HTML code has already retrieved. Let's extract the desired information

In [24]:
# Parser Application with lxml
s = BeautifulSoup(p12.text, 'lxml')

In [25]:
# Verifying Data Type
type(s)

bs4.BeautifulSoup

In [29]:
# Let's look into this information 
print(s.prettify())

<!DOCTYPE html>
<html class="no-js" lang="es">
 <head>
  <meta charset="utf-8"/>
  <title>
   Página | 12: La otra mirada sobre Argentina y el mundo
  </title>
  <meta content="x6zSdT0DBcKDmridH4LpEVrCmxcOunR2dgBQVmuL6fg" name="google-site-verification"/>
  <link href="https://www.pagina12.com.ar" rel="canonical"/>
  <script type="application/ld+json">
   {"@context": "http://schema.org","@type": "Organization","name": "Página12","url": "https://www.pagina12.com.ar","logo": {"@type": "ImageObject","url": "https://www.pagina12.com.ar/assets/media/logo_default_p12.png","width": "600","height": "60"},"sameAs":["https://twitter.com/pagina12","https://www.youtube.com/channel/UCJNDedOnljCssaiRZqg8-Dg","https://www.instagram.com/pagina12/","https://www.facebook.com/Pagina12ok/"]}
  </script>
  <meta content="Información sobre Argentina y el mundo. Noticias en fotos y videos de los principales hechos y acontecimientos del país. Análisis, opinión y entrevistas." name="description" property="des

In [32]:
# Getting information from tag's retrievier

# Let's store it within a variable
sections = s.find('ul', attrs={'class':'horizontal-list main-sections hide-on-dropdown'}).find_all('li')
sections

[<li class="p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/el-pais">El país</a></li>,
 <li class="p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/economia">Economía</a></li>,
 <li class="p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/sociedad">Sociedad</a></li>,
 <li class="no-separator-on-1040 p12-separator--right--blue"><a href="https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos">Espectáculos</a></li>,
 <li class="hide-on-1040 p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/deportes">Deportes</a></li>,
 <li class="hide-on-1040 p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/el-mundo">El mundo</a></li>,
 <li class="hide-on-1040"><a href="https://www.pagina12.com.ar/secciones/ciencia">Ciencia</a></li>]

In [34]:
# Let's get the first element
section = sections[0]
section

<li class="p12-separator--right--blue"><a href="https://www.pagina12.com.ar/secciones/el-pais">El país</a></li>

In [36]:
section.find('a')

<a href="https://www.pagina12.com.ar/secciones/el-pais">El país</a>

In [37]:
# Checking the tect information within
section.a.get_text()

'El país'

In [38]:
# Getting on "string" mode 
section.a.get('href')

'https://www.pagina12.com.ar/secciones/el-pais'

In [61]:
# Accessing by List Comprehension

links_sections = [section.a.get('href') for section in sections if section.a is not None]
links_sections

['https://www.pagina12.com.ar/secciones/el-pais',
 'https://www.pagina12.com.ar/secciones/economia',
 'https://www.pagina12.com.ar/secciones/sociedad',
 'https://www.pagina12.com.ar/suplementos/cultura-y-espectaculos',
 'https://www.pagina12.com.ar/secciones/deportes',
 'https://www.pagina12.com.ar/secciones/el-mundo',
 'https://www.pagina12.com.ar/secciones/ciencia']

In [62]:
# New request & retrieve onf this special link

sec = requests.get(links_sections[0])

In [63]:
# Checking on sec status
sec.status_code

200

In [76]:
# Parsing on this new section

s_section = BeautifulSoup(sec.text, 'lxml')

In [77]:
# Retrieving on prettifying information

print(s_section.prettify())

<!DOCTYPE html>
<html amp="" lang="es">
 <head>
  <meta charset="utf-8"/>
  <title>
   El país | Página12
  </title>
  <!-- DUST PATH: /usr/src/app/src/widgets/fc_jsonLD.dust/ -->
  <script type="application/ld+json">
   {"@context": "http://schema.org","@type": "Organization","name": "Página12","url": "https://www.pagina12.com.ar","logo": {"@type": "ImageObject","url": "https://www.pagina12.com.ar/assets/media/logo_default_p12.png","width": "600","height": "60"},"sameAs":["https://twitter.com/pagina12","https://www.youtube.com/channel/UCJNDedOnljCssaiRZqg8-Dg","https://www.instagram.com/pagina12/","https://www.facebook.com/Pagina12ok/"]}
  </script>
  <script type="application/ld+json">
   {"@context": "http://schema.org","@type": "NewsArticle","mainEntityOfPage": {"@type": "WebPage","@id": "https://www.pagina12.com.ar/secciones/el-pais"},"headline": "El país | Página12","isAccessibleForFree": "False","image": {"@type": "ImageObject","url": "https://images.pagina12.com.ar/styles/focal

In [78]:
featured_article = s_section.find('div', attrs={'class':'article-item__content'})
featured_article

<div class="article-item__content"><!-- Title --><h2 class="h1 title-list"><a href="/546393-casacion-rechazo-un-nuevo-pedido-de-libertad-del-tigre-acost">Casación rechazó un nuevo pedido de libertad del "Tigre" Acosta</a></h2><!-- Kicker --><p><a href="/546393-casacion-rechazo-un-nuevo-pedido-de-libertad-del-tigre-acost">El genocida está detenido hace 25 años</a></p></div>

In [81]:
featured_article.a.get('href')

'/546393-casacion-rechazo-un-nuevo-pedido-de-libertad-del-tigre-acost'

In [86]:
article_list = s_section.find('div', attrs={'class':'articles-list'})

In [87]:
article_list

<div class="articles-list is-grid-col2 grid-mobile-row"><!-- DUST PATH: /usr/src/app/src/templates/partials/amp/lists/articles_list_item.dust/ --><!-- DUST PATH: /usr/src/app/src/templates/partials/amp/articles/featured_article.dust/ --><article class="article-item article-item--featured deco-bar opinion"><!-- Image --><div class="article-item__header deco-bar-here-bottom is-mobile-left"><a href="/546387-despues-del-capitalismo-que"><!-- DUST PATH: /usr/src/app/node_modules/frontend-core/views/widgets/fc_displayImg_amp.dust/ --><amp-img alt="" class="" height="313" layout="responsive" src="https://images.pagina12.com.ar/styles/focal_3_2_470x313/public/2023-05/722804-toroimages-20-281-29.jpg?h=eea869cb&amp;itok=1vgopfVh" srcset="https://images.pagina12.com.ar/styles/focal_3_2_470x313/public/2023-05/722804-toroimages-20-281-29.jpg?h=eea869cb&amp;itok=1vgopfVh 470w, https://images.pagina12.com.ar/styles/focal_3_2_300x200/public/2023-05/722804-toroimages-20-281-29.jpg?h=eea869cb&amp;itok=Y

#### Dealing with Errors

In [102]:
link_seccion='https://www.pagina13.com.ar/secciones/el-pais'

In [110]:
def obtener_noticias(link_seccion):
    # Setting a URL's List
    lista_noticias=[]
    try:
        url=requests.get(link_seccion)
        # preguntamos si nuestra peticion nos devuelve como resultado 200 entonces parseamos la web
        if url.status_code== 200:
            soup=BeautifulSoup(url.text, 'lxml')
            
            #obtener noticias promocionados
            noticias_promocionadas=soup.find('div', attrs={'class':'article-item__content'})
            if noticias_promocionadas:
                lista_noticias.append(link_seccion+noticias_promocionadas.a.get('href'))

            #obtener resto de noticias
            resto_noticias=soup.find('section', attrs={'class':'list-content'})
            for i in resto_noticias.find_all('div', attrs={'class':'article-item__header'}):
                if i.a:
                    lista_noticias.append(link_seccion+i.a.get('href'))
    except Exception as e:
            # en caso de que haya un error en la peticion, se activa el bloque EXCEPT
            print('Server Response Error')
            print(e)
            print ('\n')
            lista_noticias.append(e)

    return lista_noticias

In [111]:
lista_noticias = obtener_noticias(link_seccion)
lista_noticias

Server Response Error
HTTPSConnectionPool(host='www.pagina13.com.ar', port=443): Max retries exceeded with url: /secciones/el-pais (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f936074c1c0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))




[requests.exceptions.ConnectionError(urllib3.exceptions.MaxRetryError("HTTPSConnectionPool(host='www.pagina13.com.ar', port=443): Max retries exceeded with url: /secciones/el-pais (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f936074c1c0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))"))]

In [105]:
def obtener_notas(soup):
    '''
    Función que recibe un objeto de BeautifulSoup de una página de una sección
    y devuelve una lista de URLs a las notas de esa sección
    '''
    lista_notas = []
    
    # Obtengo el artículo promocionado
    featured_article = soup.find('div', attrs={'class':'featured-article__container'})
    if featured_article:
        lista_notas.append(featured_article.a.get('href'))
    
    # Obtengo el listado de artículos
    article_list = soup.find('ul', attrs={'class':'article-list'})
    for article in article_list.find_all('li'):
        if article.a:
            lista_notas.append(article.a.get('href'))
    
    return lista_notas

In [107]:
lista_notas = obtener_notas(s_seccion)
lista_notas

AttributeError: 'NoneType' object has no attribute 'find_all'

In [None]:
r = requests.get(url)

In [None]:
url_mala = url.replace('2','3')
url_mala

In [None]:
try:
    requests.get(url_mala)
except Exception as e:
    print('error en la request')
    print(e)
    print('\n')

In [None]:
featured_article.b.get('href')

In [None]:
try:
    featured_article.b.get('href')
except:
    pass
print('continua el codigo')

In [None]:
url_nota = lista_notas[0]

In [None]:
try:
    nota = requests.get(url_nota)
    if nota.status_code == 200:
        s_nota = BeautifulSoup(nota.text, 'lxml')
        # Extraemos el titulo
        titulo = s_nota.find('div', attrs={'class':'article-title'})
        print(titulo.text)
        # Extraer la fecha
        fecha = s_nota.find('span', attrs={'pubdate':'pubdate'}).get('datetime')
        print(fecha)
        # Extraer la volanta
        volanta = s_nota.find('div', attrs={'class':'article-prefix'})
        print(volanta.get_text())
except Exception as e:
    print('Error:')
    print(e)
    print('\n')

#### Multimedia Content

In [None]:
media = s_nota.find('div', attrs={'class':'article-main-media-image'})

In [None]:
imagenes = media.find_all('img')
imagenes

In [None]:
if len(imagenes) == 0:
    print('no se encontraron imágenes')
else:
    imagen = imagenes[-1]
    img_src = imagen.get('data-src')
    print(img_src)

In [None]:
img_req = requests.get(img_src)

In [None]:
img_req.status_code

In [None]:
from IPython.display import Image

In [None]:
Image(img_req.content)

#### Scraper Unifier 

In [None]:
def obtener_info(s_nota):
    
    # Creamos un diccionario vacío para poblarlo con la información
    ret_dict = {}
    
    # Extraemos la fecha
    fecha = s_nota.find('span', attrs={'pubdate':'pubdate'})
    if fecha:
        ret_dict['fecha'] = fecha.get('datetime')
    else:
        ret_dict['fecha'] = None
    
    # Extraemos el título
    titulo = s_nota.find('div', attrs={'class':'article-title'})
    if titulo:
        ret_dict['titulo'] = titulo.text
    else:
        ret_dict['titulo'] = None

    # Extraemos la volanta
    volanta = s_nota.find('div', attrs={'class':'article-prefix'})
    if volanta:
        ret_dict['volanta'] = volanta.get_text()
    else:
        ret_dict['volanta'] = None
    
    # Extraemos el copete
    copete = s_nota.find('div', attrs={'class':'article-summary'})
    if copete:
        ret_dict['copete'] = volanta.get_text()
    else:
        ret_dict['copete'] = None
    
    autor = s_nota.find('div', attrs={'class':'article-author'})
    if autor:
        ret_dict['autor'] = autor.a.get_text()
    else:
        ret_dict['autor'] = None
    
    # Extraemos la imagen
    media = s_nota.find('div', attrs={'class':'article-main-media-image'})
    if media:
        imagenes = media.find_all('img')
        if len(imagenes) == 0:
            print('no se encontraron imágenes')
        else:
            imagen = imagenes[-1]
            img_src = imagen.get('data-src')
            try:
                img_req = requests.get(img_src)
                if img_req.status_code == 200:
                    ret_dict['imagen'] = img_req.content
                else:
                    ret_dict['imagen'] = None
            except:
                print('No se pudo obtener la imagen')
    else:
        print('No se encontró media')
    
    # Extraemos el cuerpo de la nota
    cuerpo = s_nota.find('div', attrs={'class':'article-text'})
    if cuerpo:
        ret_dict['texto'] = cuerpo.get_text()
    else:
        ret_dict['texto'] = None
    
    return ret_dict

In [None]:
def scrape_nota(url):
    try:
        nota = requests.get(url)
    except Exception as e:
        print('Error scrapeando URL', url)
        print(e)
        return None
    
    if nota.status_code !=200:
        print(f'Error obteniendo nota {url}')
        print(f'stuts Code = {nota.status_code}')
        return None
    
    s_nota = BeautifulSoup(nota.text, 'lxml')
    
    ret_dict = obtener_info(s_nota)
    ret_dict['url'] = url
    
    return ret_dict

In [None]:
scrape_nota(url_nota)

In [None]:
links_secciones

In [None]:
notas = []
for link in links_secciones:
    try:
        r = requests.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'lxml')
            notas.extend(obtener_notas(soup))
        else:
            print('No se pudo obtener la sección', link)
    except:
        print('No se pudo obtener la sección', link)

In [None]:
notas

In [None]:
data = []
for i, nota in enumerate(notas):
    print(f'Scrapeando nota{i}/{len(notas)}')
    data.append(scrape_nota(nota))

In [None]:
len(data)

In [None]:
df = pd.DataFrame(data)

In [None]:
df.head()

In [None]:
df.to_csv('Notas Pagina12.csv')