In [2]:
import scrapy

In [6]:
class Spider12(scrapy.Spider):
    """Spider that walks pagina12.com.ar section listings and scrapes articles.

    Starting from the section pages in ``start_urls``, ``parse`` follows the
    featured article, every article in the listing and the "next page" link.
    Each article page is handled by ``parse_nota``, which yields one dict per
    article. Items are exported according to ``custom_settings``.
    """

    name = 'spider12'
    allowed_domains = ['pagina12.com.ar']
    # Per-spider settings: export items as JSON to 'resultdados.json' and cap
    # the crawl depth at 3.
    custom_settings = {'FEED_FORMAT': 'json',
                       'FEED_URI': 'resultdados.json',
                       'DEPTH_LIMIT': 3}

    start_urls = ['https://www.pagina12.com.ar/secciones/el-pais',
                  'https://www.pagina12.com.ar/secciones/economia',
                  'https://www.pagina12.com.ar/secciones/sociedad',
                  'https://www.pagina12.com.ar/secciones/cultura-y-espectaculos',
                  'https://www.pagina12.com.ar/secciones/ciencia',
                  'https://www.pagina12.com.ar/secciones/el-mundo',
                  'https://www.pagina12.com.ar/secciones/deportes',
                  'https://www.pagina12.com.ar/secciones/contratapa']

    def parse(self, response):
        """Parse a section listing page.

        Yields requests for the featured article (if any), for every article
        link in the listing, and for the next listing page (if any).
        """
        # Featured article
        nota_promocionada = response.xpath(
            '//div[@class="featured-article__container"]/h2/a/@href').get()
        if nota_promocionada is not None:
            yield response.follow(nota_promocionada, callback=self.parse_nota)

        # Article listing
        notas = response.xpath('//ul[@class="article-list"]//li//a/@href').getall()
        for nota in notas:
            yield response.follow(nota, callback=self.parse_nota)

        # Link to the next page. `.get()` is required here: without it the
        # value is a SelectorList, which is never None and is not accepted by
        # response.follow().
        next_page = response.xpath('//a[@class="pagination-btn-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_nota(self, response):
        """Parse a single article page and yield its fields as a dict.

        Fields that are not found on the page are emitted as ``None``
        (``body`` is an empty string when no paragraphs match).
        """
        title = response.xpath('//div[@class="article-title"]/text()').get()
        date = response.xpath('//span[@pubdate="pubdate"]/@datetime').get()
        summary = response.xpath('//div[@class="article-summary"]/text()').get()
        prefix = response.xpath('//div[@class="article-prefix"]/text()').get()

        # Keep the last matching image source; guard against articles without
        # a main media image instead of raising IndexError.
        media_sources = response.xpath(
            '//div[@class="article-main-media-image"]/@data-src').getall()
        media = media_sources[-1] if media_sources else None

        # Extract the paragraph strings first (getall), then join them.
        paragraphs = response.xpath(
            '//div[@class="article-body"]//div[@class="article-text"]//p/text()'
        ).getall()
        body = "\n\n".join(paragraphs)

        author = response.xpath('//div[@class="article-author"]//span//a/text()').get()
        yield {'url': response.url,
               'titulo': title,
               'date': date,
               'summary': summary,
               'prefix': prefix,
               'media': media,
               'body': body,
               'author': author}

In [7]:
from scrapy.crawler import CrawlerProcess

In [9]:
# Run the spider from inside the notebook: create a CrawlerProcess,
# register Spider12 with it and start it.
# NOTE(review): the recorded output of this cell ends with
# ReactorNotRestartable, which suggests it was executed more than once in the
# same kernel -- restart the kernel before re-running this cell (confirm).
process = CrawlerProcess()
process.crawl(Spider12)
process.start()

2019-12-15 13:11:39 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-12-15 13:11:39 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.3 (default, Mar 27 2019, 16:54:48) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-17.7.0-x86_64-i386-64bit
2019-12-15 13:11:39 [scrapy.crawler] INFO: Overridden settings: {'DEPTH_LIMIT': 3, 'FEED_FORMAT': 'json', 'FEED_URI': 'resultdados.json'}
2019-12-15 13:11:39 [scrapy.extensions.telnet] INFO: Telnet Password: 923fb828e130c891
2019-12-15 13:11:39 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2019-12-15 13:11:39 [scrapy.middleware] INFO: Enabled downloader middle

ReactorNotRestartable: 

In [11]:
# Stop the crawler process; the recorded output shows the spider being closed
# with finish_reason 'shutdown'.
# NOTE(review): looks like interactive cleanup after the failed run above --
# confirm whether this cell is still needed in a clean top-to-bottom run.
process.stop()

2019-12-15 13:12:27 [scrapy.core.engine] INFO: Closing spider (shutdown)
2019-12-15 13:12:27 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'finish_reason': 'shutdown',
 'finish_time': datetime.datetime(2019, 12, 15, 19, 12, 27, 279235),
 'log_count/INFO': 9,
 'memusage/max': 71290880,
 'memusage/startup': 71290880,
 'start_time': datetime.datetime(2019, 12, 15, 19, 11, 39, 842009)}
2019-12-15 13:12:27 [scrapy.core.engine] INFO: Spider closed (shutdown)


<DeferredList at 0x10d27fcf8 current result: [(True, None)]>