# Using Scrapy to scrape a website

### Command line version

In [None]:
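# Run this spider from inside a Scrapy project with:
#     scrapy crawl blogspider -o output.json
# This saves the scraped items to output.json. Note that -o appends to an
# existing file (which yields invalid JSON on re-runs); use -O to overwrite it.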


import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from w3lib.html import remove_tags

class BlogSpider(CrawlSpider):
    """Crawl ``website.com`` and emit one item per followed page.

    Links extracted from each crawled page are followed and passed to
    ``parse_item``, which yields the page URL, the first ``<h1>`` text, the
    first ``<time datetime=...>`` value and the text inside ``<article>``.
    """

    name = "blogspider"
    allowed_domains = ["website.com"]
    start_urls = ['http://website.com/']

    rules = (
        # The pattern '/' matches every URL, so every extracted link is
        # followed. To crawl only post pages, use allow=('/posts/',) instead.
        Rule(LinkExtractor(allow=('/',)), callback='parse_item', follow=True),
    )

    # Settings that keep the crawl polite and reduce the chance of being blocked.
    custom_settings = {
        'DOWNLOAD_DELAY': 5,  # delay between requests
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',  # mimic a browser user-agent
        'CONCURRENT_REQUESTS': 2,  # reduce the number of concurrent requests
        'AUTOTHROTTLE_ENABLED': True,  # enable auto-throttling
        'AUTOTHROTTLE_START_DELAY': 5,  # initial download delay
        'AUTOTHROTTLE_MAX_DELAY': 30,  # maximum download delay
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,  # average number of requests to send in parallel to each remote server
        'DEPTH_LIMIT': 2,  # follow links at most two levels deep from the start URLs
    }

    def parse_item(self, response):
        """Yield one dict describing the crawled page.

        Args:
            response: The downloaded page handed over by the crawl rule.

        Yields:
            A dict with the keys ``url``, ``title``, ``date`` and ``content``.
            ``title`` and ``date`` come from the first matching element;
            ``content`` is the article's text nodes joined by single spaces.
        """
        # ``::text`` already returns bare text nodes, so no tag stripping is
        # needed. Running remove_tags() over them could only delete literal
        # text that happens to look like markup (e.g. "<div>" in a code sample).
        article_text = " ".join(response.css('article *::text').getall())

        yield {
            'url': response.url,
            'title': response.css('h1::text').get(),
            'date': response.css('time::attr(datetime)').get(),
            'content': article_text,
        }


### Jupyter Notebook version

In [None]:
# Jupyter Notebook version

from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from w3lib.html import remove_tags

class BlogSpider(CrawlSpider):
    """Crawl ``blueswivel.com`` and export one item per followed page.

    Links extracted from each crawled page are followed and passed to
    ``parse_item``. The yielded items are written to ``output.json`` through
    the ``FEEDS`` setting, so no command-line output option is needed.
    """

    name = "blogspider"
    allowed_domains = ["blueswivel.com"]
    start_urls = ['http://blueswivel.com/']

    rules = (
        # The pattern '/' matches every URL, so every extracted link is followed.
        Rule(LinkExtractor(allow=('/',)), callback='parse_item', follow=True),
    )

    # Settings that keep the crawl polite and configure the JSON export.
    custom_settings = {
        'DOWNLOAD_DELAY': 3,  # delay between requests
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'CONCURRENT_REQUESTS': 2,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 5,
        'AUTOTHROTTLE_MAX_DELAY': 60,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
        'DEPTH_LIMIT': 2,  # follow links at most two levels deep from the start URLs
        # This stores the output in a JSON file.
        'FEEDS': {
            'output.json': {
                'format': 'json',
                'encoding': 'utf8',
                'store_empty': False,
                'fields': None,
                'indent': 4,
                # Local files are appended to by default, and appending a
                # second JSON array to an existing file makes it invalid JSON.
                # Overwrite instead so the notebook can be re-run safely.
                'overwrite': True,
                'item_export_kwargs': {
                    'export_empty_fields': False,
                },
            },
        },
    }

    def parse_item(self, response):
        """Yield one dict describing the crawled page.

        Args:
            response: The downloaded page handed over by the crawl rule.

        Yields:
            A dict with the keys ``url``, ``title``, ``date`` and ``content``.
            ``title`` and ``date`` come from the first matching element;
            ``content`` is the article's text nodes joined by single spaces.
        """
        # ``::text`` already returns bare text nodes, so no tag stripping is
        # needed. Running remove_tags() over them could only delete literal
        # text that happens to look like markup (e.g. "<div>" in a code sample).
        article_text = " ".join(response.css('article *::text').getall())

        yield {
            'url': response.url,
            'title': response.css('h1::text').get(),
            'date': response.css('time::attr(datetime)').get(),
            'content': article_text,
        }




In [None]:
# Need this to run the spider inside the Jupyter notebook (instead of the
# `scrapy crawl` command line).
# CrawlerProcess() is created without explicit settings, so the crawl relies
# on BlogSpider.custom_settings for its configuration.
# NOTE(review): the underlying Twisted reactor generally cannot be restarted
# in the same process, so re-running this cell likely requires a kernel
# restart - confirm.
process = CrawlerProcess()
process.crawl(BlogSpider)  # register the spider class defined in the cell above
process.start()  # presumably blocks until the crawl has finished - confirm