In [1]:
# Importing in each cell because of the kernel restarts.
import scrapy
import re
from scrapy.crawler import CrawlerProcess

class ESSpider(scrapy.Spider):
    """Scrape posts from everydaysexism.com, one listing page at a time.

    Every ``<article>`` element on a page is yielded as a dict with the keys
    ``name``, ``date``, ``text`` and ``tags``.  After a page is processed the
    spider follows the "nav-previous" link, stopping when there is no such
    link or when the linked page number would exceed ``max_pages``.
    """

    # Naming the spider is important if you are running more than one spider of
    # this class simultaneously.
    name = "ESS"

    # URL(s) to start with.
    start_urls = [
        'http://www.everydaysexism.com',
    ]

    # There are a LOT of pages here.  For our example, we only follow links up
    # to this page number (inclusive).
    max_pages = 9

    # Use XPath to parse the response we get.
    def parse(self, response):
        """Yield one dict per article, then a request for the next page.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            dict: ``name`` and ``date`` (first match or ``None``) plus
                ``text`` and ``tags`` (lists of every matching text node).
            scrapy.Request: a follow-up request handled by this same method,
                emitted only while the page limit has not been reached.
        """
        # Iterate over every <article> element on the page.
        for article in response.xpath('//article'):

            # Yield a dictionary with the values we want.
            yield {
                'name': article.xpath('header/h2/a/@title').extract_first(),
                'date': article.xpath('header/section/span[@class="entry-date"]/text()').extract_first(),
                'text': article.xpath('section[@class="entry-content"]/p/text()').extract(),
                'tags': article.xpath('*/span[@class="tag-links"]/a/text()').extract()
            }

        # Get the URL of the previous page.
        next_page = response.xpath('//div[@class="nav-previous"]/a/@href').extract_first()

        # No "nav-previous" link on this page: nothing left to follow.  This
        # must be checked BEFORE looking for the page number, because running
        # a regex over None raises TypeError.
        if next_page is None:
            return

        # The first run of digits in the link is taken as the page number.
        # If the link carries no number at all we cannot enforce the limit,
        # so stop rather than crawl without bound.
        page_match = re.search(r'\d+', next_page)
        if page_match is None:
            return

        # int() around max_pages tolerates the limit being supplied as a string.
        if int(page_match.group()) <= int(self.max_pages):
            # Request the next page and recursively parse it the same way we did above.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

# Configure the crawler process.  The first three settings control output and
# logging; the remaining ones have to do with scraping etiquette.
process = CrawlerProcess(settings=dict(
    FEED_FORMAT='json',            # Store data in JSON format.
    FEED_URI='data.json',          # Name our storage file.
    LOG_ENABLED=False,             # Turn off logging for now.
    ROBOTSTXT_OBEY=True,
    USER_AGENT='ThinkfulDataScienceBootcampCrawler (thinkful.com)',
    AUTOTHROTTLE_ENABLED=True,
    HTTPCACHE_ENABLED=True,
))

# Register our spider with the process and run the crawl.
process.crawl(ESSpider)
process.start()
print('Success!')

Success!


In [2]:
import pandas as pd

# Load the scraped records back in and inspect them: the row count shows
# whether we got data from all 9 pages, and the head shows the parsed fields.
ESSdf = pd.read_json(path_or_buf='data.json', orient='records')
print(ESSdf.shape, ESSdf.head(), sep='\n')

(90, 4)
        date                                   name  \
0 2019-05-03                               Veronica   
1 2019-05-03                                   Zara   
2 2019-05-03                                   Zara   
3 2019-05-03  Veronika Didusenko, Miss Ukraine 2018   
4 2019-05-03                                  Kelly   

                                                tags  \
0                                             [Home]   
1                                     [Public space]   
2                         [Public space, University]   
3  [Media, missuniverse, missworld, singlemoms, W...   
4       [Friendships, Home, Public space, Workplace]   

                                                text  
0  [My brother says that there should be an All M...  
1  [When I was fifteen I went skating with my sis...  
2  [Two stories in different places, but they hap...  
3  [I am a professional model and have been walki...  
4  [I got called the mother of the business as on