In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess
import pandas as pd
from twisted.internet import reactor

class BestJobsSpider(scrapy.Spider):
    """Scrape the U.S. News "Best Jobs" search page and print it as a table.

    Four groups of text nodes (job title, median salary, satisfaction score
    and growth rate) are selected independently from the response and combined
    column-wise into a pandas DataFrame.
    """

    name = 'best_jobs'
    start_urls = ['https://money.usnews.com/careers/best-jobs/search']

    def start_requests(self):
        """Yield one request per start URL, sent with a browser-like User-Agent header."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        for url in self.start_urls:
            yield scrapy.Request(url, headers=headers, callback=self.parse)

    def parse(self, response):
        """Extract the four text columns from ``response`` and print them.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        The columns are matched by separate CSS selectors, so they can come
        back with different lengths; shorter columns are padded with NaN
        instead of letting the DataFrame constructor raise ValueError.
        """
        columns = {
            'Job Title': response.css('.job-title::text').getall(),
            'Median Salary': response.css(
                '.data-table .median-salary .salary-amount::text').getall(),
            'Job Satisfaction Score': response.css(
                '.data-table .satisfaction-score::text').getall(),
            'Job Growth Rate': response.css(
                '.data-table .job-growth-rate .rate::text').getall(),
        }

        # Surface a mismatch: rows are paired by position only, so unequal
        # counts mean the values in a row may not belong to the same job.
        lengths = {label: len(values) for label, values in columns.items()}
        if len(set(lengths.values())) > 1:
            self.logger.warning(
                'Selectors matched different numbers of elements: %s', lengths)

        # Building from Series (rather than raw lists) aligns on the index and
        # fills missing positions with NaN, so unequal lengths no longer crash.
        df = pd.DataFrame({
            label: pd.Series(values, dtype=object)
            for label, values in columns.items()
        })

        # Print the DataFrame (optional)
        print(df)

        # To persist the table, call df.to_csv('best_jobs_data.csv', index=False) here.

        # No reactor.stop() here: the crawler process that runs this spider
        # owns the Twisted reactor and shuts it down once the crawl finishes.

# Run the spider inside this process (CrawlerProcess does not fork).
# process.start() starts the Twisted reactor itself, blocks until the crawl
# has finished and then stops the reactor. A stopped reactor cannot be started
# again, so calling reactor.run() afterwards would raise ReactorNotRestartable.
process = CrawlerProcess()
process.crawl(BestJobsSpider)
process.start()


2024-03-30 11:50:12 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-03-30 11:50:12 [scrapy.utils.log] INFO: Versions: lxml 5.1.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (main, Mar 24 2024, 11:54:47) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-6.1.75-060175-generic-x86_64-with-glibc2.35
2024-03-30 11:50:12 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-03-30 11:50:12 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-03-30 11:50:12 [scrapy.extensions.telnet] INFO: Telnet Password: acd5226d44fbaf8e
2024-03-30 11:50:12 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.Mem

In [1]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

class BestJobsSpider(scrapy.Spider):
    """Yield title, median salary and description for the first five list entries."""

    name = 'best_jobs_spider'

    # Page whose list entries are scraped
    start_urls = ['https://money.usnews.com/careers/best-jobs/search']

    def parse(self, response):
        """Yield one dict per matched list entry, limited to the first five.

        Each value is the first text node found by its selector with the
        surrounding whitespace removed, or None when the selector produced
        no text.
        """
        def cleaned(text):
            # None and the empty string both map to None; anything else is trimmed
            if not text:
                return None
            return text.strip()

        entries = response.css('.List__StyledList-hn9llu-0.cSUItl li')

        # Only the leading five entries are processed
        for entry in entries[:5]:
            heading = entry.css('a ::text').get()
            salary = entry.css('.jsx-2431333955 ::text').get()
            blurb = entry.css('p ::text').get()

            yield {
                'Title': cleaned(heading),
                'Median Salary': cleaned(salary),
                'Job Description': cleaned(blurb),
            }

# Create a Scrapy CrawlerProcess that exports every yielded item to best_jobs.csv.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, which is deprecated and
# triggers a ScrapyDeprecationWarning when the feed exporter is created.
process = CrawlerProcess(settings={
    'FEEDS': {
        'best_jobs.csv': {
            'format': 'csv',
            # Local files are appended to by default; overwrite instead so that
            # re-running this cell does not pile up duplicate rows.
            'overwrite': True,
        },
    },
})

# Start the crawling process with our spider.
# start() blocks until the crawl finishes and runs the Twisted reactor, which
# cannot be restarted: this can be executed only once per kernel session.
process.crawl(BestJobsSpider)
process.start()


2024-03-30 11:55:59 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-03-30 11:55:59 [scrapy.utils.log] INFO: Versions: lxml 5.1.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (main, Mar 24 2024, 11:54:47) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-6.1.75-060175-generic-x86_64-with-glibc2.35
2024-03-30 11:55:59 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-03-30 11:55:59 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-03-30 11:55:59 [scrapy.extensions.telnet] INFO: Telnet Password: 88449e8704ca0779
  exporter = cls(crawler)

2024-03-30 11:55:59 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scr

In [2]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    """Scrape the first few job listings from the U.S. News "Best Jobs" search page.

    Each listing is yielded as an item, so the CSV feed configured on the
    crawler process is what writes the output file. The spider no longer
    writes the same file itself with pandas, which competed with the feed
    exporter for one path and crashed when the selectors matched different
    numbers of elements.
    """

    name = 'myspider'

    # Define the URL to scrape
    start_urls = ['https://money.usnews.com/careers/best-jobs/search']

    # Maximum number of matches taken from each selector
    MAX_LISTINGS = 5

    def parse(self, response):
        """Yield one item per listing position found in ``response``.

        Args:
            response: The Scrapy response for one of ``start_urls``.

        Yields:
            dict: Keys 'Job Title', 'Median Salary' and 'Job Score'. The three
            fields are selected independently and paired by position; a field
            with fewer matches than the longest one is filled with None.
        """
        limit = self.MAX_LISTINGS
        columns = {
            'Job Title': response.css('a.job-title-link::text')[:limit].getall(),
            'Median Salary': response.css('div.salary-container::text')[:limit].getall(),
            'Job Score': response.css('div.job-score::text')[:limit].getall(),
        }

        # Iterate up to the longest column so no extracted value is dropped.
        row_count = max(len(values) for values in columns.values())
        for row in range(row_count):
            yield {
                label: values[row] if row < len(values) else None
                for label, values in columns.items()
            }

# Create a Scrapy CrawlerProcess that exports every yielded item to job_data.csv.
# FEEDS replaces the FEED_FORMAT / FEED_URI pair, which is deprecated and
# triggers a ScrapyDeprecationWarning when the feed exporter is created.
process = CrawlerProcess(settings={
    'FEEDS': {
        'job_data.csv': {
            'format': 'csv',
            # Local files are appended to by default; overwrite instead so that
            # re-running this cell does not pile up duplicate rows.
            'overwrite': True,
        },
    },
})

# Start the crawling process with our spider.
# NOTE: start() runs the Twisted reactor, which cannot be restarted. If another
# crawl has already run in this kernel, this call raises ReactorNotRestartable;
# restart the kernel before executing this cell.
process.crawl(MySpider)
process.start()


2024-03-30 11:57:15 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-03-30 11:57:15 [scrapy.utils.log] INFO: Versions: lxml 5.1.1.0, libxml2 2.12.6, cssselect 1.2.0, parsel 1.9.0, w3lib 2.1.2, Twisted 24.3.0, Python 3.12.2 (main, Mar 24 2024, 11:54:47) [GCC 11.4.0], pyOpenSSL 24.1.0 (OpenSSL 3.2.1 30 Jan 2024), cryptography 42.0.5, Platform Linux-6.1.75-060175-generic-x86_64-with-glibc2.35
2024-03-30 11:57:15 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-03-30 11:57:15 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2024-03-30 11:57:15 [scrapy.extensions.telnet] INFO: Telnet Password: 476e4f401c338c35
  exporter = cls(crawler)

2024-03-30 11:57:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scr

ReactorNotRestartable: 