In [None]:
# -*- coding: utf-8 -*-
import urllib.request
import re
from bs4 import BeautifulSoup
import pickle
import os.path
import json

def gets_html(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header instead
    of urllib's default one.

    Args:
        url: Address to download; anything ``urllib.request.Request`` accepts.

    Returns:
        str: The full response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the resource cannot be opened (this includes
            its subclass ``HTTPError``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'})
    # The context manager closes the underlying connection as soon as the
    # body has been read, so long crawls do not accumulate open handles.
    with urllib.request.urlopen(req) as response:
        return response.read().decode("utf-8")

def gets_article_links(html):
    """Collect the article URLs referenced by a listing page.

    Every ``<article class="nota">`` element is inspected and the ``href`` of
    its first ``<a>`` descendant is prefixed with the site origin.

    Args:
        html: Markup of the listing page, as text.

    Returns:
        list[str]: One URL per article element that has a link, in document
        order. Elements without an ``<a>`` tag or without an ``href``
        attribute are skipped instead of aborting the whole extraction.
    """
    soup = BeautifulSoup(html, "lxml")
    links = []
    for article in soup.find_all("article", class_="nota"):
        anchor = article.a  # None when the element contains no <a> tag
        href = anchor.get('href') if anchor is not None else None
        if href:
            # The origin is prepended verbatim, so hrefs are assumed to be
            # site-relative paths starting with "/" -- TODO confirm.
            links.append('https://www.lanacion.com.ar' + href)
    return links

def gets_article(html, separator=''):
    """Extract date, title and body text from an article page.

    Args:
        html: Markup of the article page, as text.
        separator: String inserted between the kept paragraphs when building
            the body text. The default ``''`` reproduces the original
            behaviour (paragraphs concatenated with nothing in between); pass
            e.g. ``'\\n'`` to keep paragraph boundaries.

    Returns:
        tuple[str, str, str]: ``(fecha, titulo, texto)`` where ``fecha`` is
        the stripped text of the first element with class ``fecha``,
        ``titulo`` the stripped text of the first ``<h1 class="titulo">`` and
        ``texto`` the joined text of every ``<p>`` longer than 100 characters.

    Raises:
        IndexError: If the title or the date element is missing. The
            exception type matches what the previous ``find_all(...)[0]``
            lookup raised, but now carries a descriptive message.
    """
    soup = BeautifulSoup(html, "lxml")

    title_tag = soup.find("h1", class_='titulo')
    if title_tag is None:
        raise IndexError("no <h1 class='titulo'> element found in article HTML")

    # Omitting the tag name is the documented way to match any element by
    # class. The original passed an empty-string name instead.
    # NOTE(review): confirm both forms agree on the installed bs4 version.
    date_tag = soup.find(class_='fecha')
    if date_tag is None:
        raise IndexError("no element with class 'fecha' found in article HTML")

    titulo = title_tag.text.strip()
    fecha = date_tag.text.strip()
    # Short paragraphs (100 characters or fewer) are dropped.
    textos = [p.text for p in soup.find_all("p") if len(p.text) > 100]
    texto = separator.join(textos)
    return fecha, titulo, texto

# Brando magazine section page on lanacion.com.ar. Presumably meant as the
# argument for scrap_url(); nothing in the visible code reads this variable.
url = 'https://www.lanacion.com.ar/revista-brando-t61735'
def scrap_url(url):
    """Scrape a listing page and every article it links to.

    Args:
        url: Address of the listing page whose article links are followed.

    Returns:
        dict[str, list[str]]: Maps each article URL to ``[titulo, texto]``.

    Raises:
        Whatever ``gets_html`` / ``gets_article`` raise; nothing is caught
        here, so one failing article aborts the scrape.
    """
    index_html = gets_html(url)
    articles = {}
    for link in gets_article_links(index_html):
        # gets_article returns (fecha, titulo, texto). Unpacking only two
        # names raised ValueError on the first article; the date is received
        # but not stored so the documented value shape stays [titulo, texto].
        _fecha, titulo, texto = gets_article(gets_html(link))
        articles[link] = [titulo, texto]
    return articles

# JSON API endpoint templates. crawl() fills the two `{}` placeholders (the
# `p` and `c` query parameters) through str.format.
# Alternative endpoint, currently disabled:
#ohlala_url = 'https://api-contenidos.lanacion.com.ar/json/v2/tema/50784?p={}&c={}&s=1'
brando_url = 'https://api-contenidos.lanacion.com.ar/json/v2/tema/61735?p={}&c={}&s=1'
# Template actually handed to crawl() further down.
base_url = brando_url

def _load_articles(file_path):
    """Return the article dict pickled at ``file_path``, or ``{}`` if no such file exists."""
    if not os.path.isfile(file_path):
        return {}
    # NOTE(security): unpickling can execute arbitrary code; only point this
    # at cache files that crawl() itself has written.
    with open(file_path, "rb") as cache_file:
        return pickle.load(cache_file)


def _save_articles(articles, file_path):
    """Pickle ``articles`` to ``file_path`` via a temp file so the cache is never left half-written."""
    tmp_path = '{}.tmp'.format(file_path)
    with open(tmp_path, 'wb') as cache_file:
        # Pickle the dictionary using the highest protocol available.
        pickle.dump(articles, cache_file, pickle.HIGHEST_PROTOCOL)
    os.replace(tmp_path, file_path)


def crawl(file_path, base_url, articles_request=30, calls = 100, start_page=3000):
    """Collect articles by paging through the JSON API and cache them on disk.

    Each call formats ``base_url`` with ``(start_page + call_number,
    articles_request)``, reads the ``url`` field of every entry under the
    response's ``items`` key, downloads the articles that are not cached yet,
    parses them with ``gets_article`` and rewrites the pickle at ``file_path``.

    Args:
        file_path: Pickle file holding previously parsed articles. It is
            loaded if it exists and (re)written after every batch that
            produced at least one new article.
        base_url: API URL template with two ``{}`` placeholders.
        articles_request: Value for the second placeholder (how many articles
            are requested per call).
        calls: Number of API calls to make.
        start_page: Offset added to the 1-based call number to build the
            first placeholder. Defaults to 3000, the value that used to be
            hard-coded.

    Returns:
        dict[str, list[str]]: Maps article URL to ``[fecha, titulo, texto]``,
        containing both the cached and the newly parsed articles.

    Raises:
        Errors from the API request itself (network failure, invalid JSON)
        propagate. A failure on a single article is reported and skipped.
    """
    articles = _load_articles(file_path)
    # A set keeps the "already parsed?" test O(1) as the cache grows.
    parsed_urls = set(articles)

    empty_responses = 0
    for t in range(calls):
        # Get URLs from the API.
        api_call = base_url.format(start_page+t+1, articles_request)
        print('Getting {} call'.format(t+1))
        json_str = gets_html(api_call)
        # A response without 'items' is treated as an empty page.
        json_articles = json.loads(json_str).get('items') or []
        urls = ['https://www.lanacion.com.ar/'+item['url'] for item in json_articles]
        print('URLs retrieved:')
        print(*urls, sep='\n')

        # Keep only URLs that are not cached yet.
        urls = [url for url in urls if url not in parsed_urls]
        print('URLs to parse: {:02d}'.format(len(urls)))

        if len(urls) == 0:
            # The counter is never reset and is compared with `calls`, so this
            # early return only fires when every call came back with nothing
            # new. Kept as is: resuming a crawl starts with a long run of
            # already-cached pages and must not stop there.
            empty_responses += 1
            if empty_responses == calls:
                print('No more urls to parse')
                return articles
            print('{:02d} responses with no new URLs'.format(empty_responses))
            continue

        # Download and parse one URL at a time so a single bad article does
        # not discard the rest of the batch.
        stored_new = False
        for p, url in enumerate(urls, start=1):
            print('parsing {:02d} URL'.format(p))
            try:
                fecha, titulo, texto = gets_article(gets_html(url))
            except Exception as exc:
                # `Exception` rather than a bare except: Ctrl-C and
                # SystemExit must still be able to stop a long crawl.
                print('Skipping {}: {!r}'.format(url, exc))
                continue
            articles[url] = [fecha, titulo, texto]
            parsed_urls.add(url)
            stored_new = True

        if stored_new:
            try:
                _save_articles(articles, file_path)
            except (OSError, pickle.PicklingError) as exc:
                # Keep crawling with the in-memory results, but say so.
                print('Could not save {}: {!r}'.format(file_path, exc))
    return articles

# Run the crawl against `base_url`, asking for 2 articles per API call for up
# to 50000 calls; results are cached in the pickle file given first.
articles = crawl(
    '/home/fer/text_mining/brando.p',
    base_url,
    articles_request=2,
    calls=50000,
)
# Disabled alternative run (different cache file, 10 calls):
#articles = crawl('/home/fer/text_mining/ohlala.p',base_url, articles_request=2, calls = 10)



Getting 1 call
URLs retrieved:
https://www.lanacion.com.ar/1325111-nueva-sony-cybershot-dsc-hx5
https://www.lanacion.com.ar/1326009-la-c-de-la-gastronomia-afrodisiaca
URLs to parse: 00
01 responses with no new URLs
Getting 2 call
URLs retrieved:
https://www.lanacion.com.ar/1325989-5-cachetazos-y-algo-mas-que-quedaran-en-el-recuerdo
https://www.lanacion.com.ar/1325087-donde-se-disfrutan-los-mejores-cigarros-de-buenos-aires
URLs to parse: 00
02 responses with no new URLs
Getting 3 call
URLs retrieved:
https://www.lanacion.com.ar/1325722-pablo-fazio-cervecero-por-opcion
https://www.lanacion.com.ar/1325720-gustavo-fernandez-protomastro-chatarrero-profesional
URLs to parse: 00
03 responses with no new URLs
Getting 4 call
URLs retrieved:
https://www.lanacion.com.ar/1325714-eduardo-costantini-h-el-productor-detras-del-cine-de-autor
https://www.lanacion.com.ar/1325710-javier-otaegui-el-mago-de-la-computadora
URLs to parse: 00
04 responses with no new URLs
Getting 5 call
URLs retrieved:
https:/

URLs retrieved:
https://www.lanacion.com.ar/1314892-toti-pasman-vs-maradona-a-un-ano-del-la-tenes-adentro-quien-tenia-razon
https://www.lanacion.com.ar/1314888-los-simpsons-por-bansky
URLs to parse: 00
31 responses with no new URLs
Getting 36 call
URLs retrieved:
https://www.lanacion.com.ar/1314882-adidas
https://www.lanacion.com.ar/1314537-como-pegarte-un-faltazo-en-la-oficina-sin-ser-descubierto-vos-como-haces
URLs to parse: 00
32 responses with no new URLs
Getting 37 call
URLs retrieved:
https://www.lanacion.com.ar/1313078-brando-maps-un-recorrido-por-lo-mejor-de-boedo
https://www.lanacion.com.ar/1314548-mercedes-oviedo-el-lado-sexy-de-la-chica-inocente
URLs to parse: 00
33 responses with no new URLs
Getting 38 call
URLs retrieved:
https://www.lanacion.com.ar/1314246-las-mejores-etiquetas-argentinas-de-tannat
https://www.lanacion.com.ar/1314218-windows-phone-7-la-nueva-apuesta-de-microsoft-para-smartphones
URLs to parse: 00
34 responses with no new URLs
Getting 39 call
URLs retrieve

parsing 01 URL
parsing 02 URL
Getting 68 call
URLs retrieved:
https://www.lanacion.com.ar/1289676-transition
https://www.lanacion.com.ar/1302615-5-sitios-para-ver-videos-on-line-alternativos-a-youtube
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 69 call
URLs retrieved:
https://www.lanacion.com.ar/1301949-pam-anderson-vs-carmen-electra-con-cual-de-las-dos-conejitas-te-quedarias
https://www.lanacion.com.ar/1302295-duke-nukem-forever-el-mito-del-video-juego-imposible
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 70 call
URLs retrieved:
https://www.lanacion.com.ar/1302258-test-drive-bmw-x1-20-una-suv-equipado-para-el-asfalto
https://www.lanacion.com.ar/1298715-adidas-edicion-limitada
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 71 call
URLs retrieved:
https://www.lanacion.com.ar/1300973-sexo-por-dinero-el-trabajo-mas-antiguo-de-la-naturaleza
https://www.lanacion.com.ar/1300960-ryders-los-skaters-vuelven-de-la-mano-de-las-longboards
URLs to parse: 02
parsing 

parsing 01 URL
parsing 02 URL
Getting 100 call
URLs retrieved:
https://www.lanacion.com.ar/1288352-5-formas-alternativas-en-las-que-boca-le-tendria-que-haber-ofrecido-a-riquelme-el-nuevo-contrato
https://www.lanacion.com.ar/1287574-5-bigotes-que-no-deberian-ser-afeitados
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 101 call
URLs retrieved:
https://www.lanacion.com.ar/1287555-mito-o-realidad-twitter-facebook-etc-solo-sirven-para-levantar-mujeres-vos-que-pensas
https://www.lanacion.com.ar/1280895-anteojos-oakley
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 102 call
URLs retrieved:
https://www.lanacion.com.ar/1287191-que-tiene-en-su-ipod-favio-posca
https://www.lanacion.com.ar/1286634-que-actriz-deberia-hacer-topless-en-el-cine
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 103 call
URLs retrieved:
https://www.lanacion.com.ar/1286895-u2-es-la-banda-que-mas-facturo-en-2010-y-encabeza-el-negocio-de-la-musica
https://www.lanacion.com.ar/1286571-donde-comes-las-

parsing 01 URL
parsing 02 URL
Getting 133 call
URLs retrieved:
https://www.lanacion.com.ar/1274034-delivery-mundial
https://www.lanacion.com.ar/1274013-zaira-se-puso-la-camiseta
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 134 call
URLs retrieved:
https://www.lanacion.com.ar/1274007-opciones-para-seguir-el-mundial-en-la-web
https://www.lanacion.com.ar/1273660-las-botineras-campeonas
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 135 call
URLs retrieved:
https://www.lanacion.com.ar/1271765-puma-maxx
https://www.lanacion.com.ar/1272728-el-mundial-de-las-marcas-las-campanas-de-ypf-y-quilmes
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 136 call
URLs retrieved:
https://www.lanacion.com.ar/1273329-para-vos-cual-fue-el-mundial-mas-memorable
https://www.lanacion.com.ar/1273327-llega-el-peugeot-3008
URLs to parse: 02
parsing 01 URL
parsing 02 URL
Getting 137 call
URLs retrieved:
https://www.lanacion.com.ar/1273070-las-mejores-pastas-para-comer-en-casa
https://www.

In [14]:
# Reload the crawl cache from disk and inspect one entry.
# NOTE(security): pickle.load can execute arbitrary code; only load cache
# files produced by this notebook.
with open('/home/fer/text_mining/brando.p', "rb") as cache_file:  # closed on exit
    articles = pickle.load(cache_file)
print(len(articles))
articles['https://www.lanacion.com.ar/1415461-la-politica-viste-a-la-moda-lilita-carrio']

5154


['17  de octubre de 2011 \xa0• 14:38',
 'La política viste a la moda: Lilita Carrio',
 '\n\nLilita Carrio, nacida Elisa María Avelina Carrió, es una de las figuras políticas más erráticas del escenario nacional: de los 23 puntos de 2007 se deslizó en caída libre al penoso 3,30 por ciento de las recientes internas abiertas, \r\n resultado que la obligó  a hacer un \r\n mea culpa  como buena cristiana ( \r\n "Soy la razón de la derrota"  ). Su biografía, extensa y notoria, tiene un pico de popularidad durante la crisis del menemismo, el gobierno de la Alianza y 2001, cuando el espíritu de hastío colectivo y derrumbe social fogoneó su rol de custodia de los otrora tibios valores de la honestidad, la decencia y la administración.Los contornos complejos de su posición pública, a la vez, refuerzan este rasgo institucionalista puro con una clara vocación profética que pegó bastante en momentos de la historia argentina que tenían mucho sabor a fin del mundo y Apocalipsis. En efecto, \r\n Lilit