In [1]:
# Import scrapy
import scrapy
# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess
#Pandas for organizing the received data
import pandas as pd
import time

In [2]:
# Seed URL for the crawl: the first Indeed results page (start=0) for the
# query "data scientist" in the San Francisco Bay Area with radius=50.
url_short = (
    'https://www.indeed.com/jobs'
    '?q=data+scientist'
    '&l=San+Francisco+Bay+Area,+CA'
    '&radius=50'
    '&start=0'
)

# Create the Spider class
class SFO_DS_Spider(scrapy.Spider):
    """Crawl Indeed search results and collect the text of every job posting.

    The crawl starts at the module-level ``url_short`` results page, follows each
    job link found on it, and then follows the "Next" link until there is none.
    Scraped text is appended to the module-level ``dc_dict`` lists (keys
    ``'title'``, ``'body'``, ``'bullets'``) instead of being yielded as Scrapy
    items, so ``dc_dict`` must exist before the crawl is started.
    """

    name = "sfo_ds_spider"

    # Throttling is delegated to Scrapy: DOWNLOAD_DELAY spaces out consecutive
    # requests to the site, so the callbacks never need to sleep themselves.
    custom_settings = {
        'COOKIES_ENABLED': False,
        'DOWNLOAD_DELAY': 4
    }

    def start_requests(self):
        """Yield the single seed request for the first search-results page."""
        # dont_filter=True keeps the duplicate filter from dropping the seed URL.
        yield scrapy.Request(url = url_short, callback = self.parse, dont_filter=True)

    def parse(self, response):
        """Handle one search-results page.

        Yields one request per job link on the page (handled by
        ``parse_pages``) and, if a "Next" link is present, one request for the
        following results page (handled by this method again).
        """
        for url in response.css('h2.title a::attr(href)').getall():
            yield response.follow(url = url, callback = self.parse_pages)

        next_page = response.xpath('//a[@aria-label="Next"]/@href').get()
        if next_page is not None:
            # Deliberately no time.sleep() here: a blocking sleep inside a
            # callback freezes the Twisted reactor and stalls every in-flight
            # download. Pacing is handled by DOWNLOAD_DELAY in custom_settings.
            yield scrapy.Request(response.urljoin(next_page), self.parse)

    def parse_pages(self, response):
        """Extract title, description and list items from one job posting page.

        Appends exactly one entry to each of ``dc_dict['title']``,
        ``dc_dict['body']`` and ``dc_dict['bullets']`` so the three lists stay
        the same length. Returns nothing (no Scrapy items are produced).
        """
        # Run every extraction before touching dc_dict, so a failure part-way
        # through cannot leave the three lists with different lengths.
        job_title = response.xpath('//h3[contains(@class,"jobsearch-JobInfoHeader-title")]/text()').getall()
        job_body = response.xpath('//div[contains(@class,"jobsearch-jobDescriptionText")]//text()').getall()
        job_bullets = response.xpath('//li//text()').getall()

        # The title is stored as the list of matched text nodes (not a single
        # string), which is the shape consumers of dc_dict already receive.
        dc_dict['title'].append(job_title)
        # Description text nodes are stripped and joined into one string.
        dc_dict['body'].append(" ".join([b.strip() for b in job_body]))
        # Every <li> text node on the page, stripped, kept as a list.
        dc_dict['bullets'].append([b.strip() for b in job_bullets])

# Initialize the dictionary **outside** of the Spider class so the spider's
# callbacks can append to it. Every key gets its own, independent empty list.
# A comprehension is used so no loop variable is left behind in the notebook
# namespace.
dc_dict = {key: [] for key in ('title', 'body', 'bullets')}

# Launch the crawl: create a crawler process, register the spider class with
# it, then start it. start() blocks this cell until the crawl has finished.
# NOTE(review): Scrapy's docs say the Twisted reactor cannot be restarted, so
# re-running this cell in the same kernel is expected to fail - confirm.
process = CrawlerProcess()
process.crawl(SFO_DS_Spider)
process.start(stop_after_crawl=True)

2020-05-24 19:01:03 [scrapy.utils.log] INFO: Scrapy 2.0.1 started (bot: scrapybot)
2020-05-24 19:01:03 [scrapy.utils.log] INFO: Versions: lxml 4.4.2.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.20.0, Twisted 20.3.0, Python 3.7.6 | packaged by conda-forge | (default, Mar 23 2020, 22:45:16) - [Clang 9.0.1 ], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.9.2, Platform Darwin-19.4.0-x86_64-i386-64bit
2020-05-24 19:01:03 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2020-05-24 19:01:03 [scrapy.crawler] INFO: Overridden settings:
{'COOKIES_ENABLED': False, 'DOWNLOAD_DELAY': 4}
2020-05-24 19:01:03 [scrapy.extensions.telnet] INFO: Telnet Password: 033f5e480e1eb3dc
2020-05-24 19:01:03 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-05-24 19:01:03 [scrapy.mid

2020-05-24 19:02:39 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.indeed.com/viewjob?jk=544e3faafc2bf1d7&from=serp&vjs=3> from <GET https://us.conv.indeed.com/rc/clk?jk=544e3faafc2bf1d7&ctk=1e93e2u8gf33j802&t=cr&rctype=oth&orgclktk=1e93e2u8kf33j801&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1>
2020-05-24 19:02:43 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.indeed.com/viewjob?jk=43f922f9974f06bb&from=serp&vjs=3> from <GET https://us.conv.indeed.com/rc/clk?jk=43f922f9974f06bb&ctk=1e93e337obmn5802&t=cr&rctype=oth&orgclktk=1e93e337sbmn5801&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1>
2020-05-24 19:02:48 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.indeed.com/viewjob?jk=3289d2026bac7001&from=serp&vjs=3> (referer: https://www.indeed.com/jobs?q=data+scientist&l=San+Francisco+Bay+Area,+CA&radius=50&start=0)
2020-05-24 19:02:53 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.indeed.com/view

2020-05-24 19:05:03 [scrapy.extensions.logstats] INFO: Crawled 20 pages (at 7 pages/min), scraped 0 items (at 0 items/min)
2020-05-24 19:05:03 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.indeed.com/viewjob?jk=23eda85606ddabf1&from=serp&vjs=3> (referer: https://www.indeed.com/jobs?q=data+scientist&l=San+Francisco+Bay+Area%2C+CA&radius=50&start=10)
2020-05-24 19:05:07 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://us.conv.indeed.com/rc/clk?jk=70388dfaf7fc2709&ctk=1e93e9qrrf1fa802&t=cr&rctype=oth&orgclktk=1e93e9qs0f1fa800&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1> from <GET https://www.indeed.com/rc/clk?jk=70388dfaf7fc2709&fccid=1b866506aec22461&vjs=3>
2020-05-24 19:05:12 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://us.conv.indeed.com/rc/clk?jk=593e22de9139a6f1&ctk=1e93e9vvpnhi6802&t=cr&rctype=oth&orgclktk=1e93e9vvunhi6800&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1> from <GET https://www.indeed.com/r

2020-05-24 19:07:23 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.indeed.com/viewjob?jk=351c40eeba1585a5&from=serp&vjs=3> (referer: https://www.indeed.com/jobs?q=data+scientist&l=San+Francisco+Bay+Area%2C+CA&radius=50&start=30)
2020-05-24 19:07:27 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.indeed.com/viewjob?jk=eec051c0dd0eb1d2&from=serp&vjs=3> from <GET https://us.conv.indeed.com/rc/clk?jk=eec051c0dd0eb1d2&ctk=1e93ebm0hf37n802&t=cr&rctype=oth&orgclktk=1e93ebm0kf37n801&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1>
2020-05-24 19:07:33 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GET https://www.indeed.com/viewjob?jk=c52ddc743a13f227&from=serp&vjs=3> from <GET https://us.conv.indeed.com/rc/clk?jk=c52ddc743a13f227&ctk=1e93ebqfjnguf802&t=cr&rctype=oth&orgclktk=1e93ebqfnnguf801&vjs=3&wwwho=4m_xAU4HGQbAlJdZTkhlA8hEG_rFObn1>
2020-05-24 19:07:37 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (302) to <GE

In [None]:
# Sanity-check the scrape without dumping every posting into the cell output:
# show how many entries were collected under each key (the counts should match).
{key: len(values) for key, values in dc_dict.items()}

In [4]:
# Tabulate the scraped results: each dc_dict key becomes a column and each
# list position becomes a row.
df_raw = pd.DataFrame(data=dc_dict)

In [5]:
# (rows, columns) of the frame built from dc_dict: one row per list entry,
# one column per dictionary key.
df_raw.shape

(40, 3)