In [None]:
# Step 1: Define the scraper class
# The module source is kept in a string and written to disk so that the test
# module (which does `from glassdoor_job_scraper import ...`) can import it.
# NOTE: the string is delimited with triple double quotes, so the embedded
# code uses ''' for its docstrings and must not contain backslash escapes.
glassdoor_job_scraper = """
from urllib.parse import quote_plus

from seleniumbase import BaseCase
import pandas as pd
import time

# Upper bound on attempts to read a listing's required details; without it a
# stale selector would make the scraper retry forever.
MAX_DETAIL_ATTEMPTS = 3

# Value recorded for any optional field that cannot be read from the page.
MISSING = -1

# DataFrame column name -> label text looked up in the company overview panel.
OVERVIEW_LABELS = {
    'Headquarters': 'Headquarters',
    'Size': 'Size',
    'Founded': 'Founded',
    'Type of ownership': 'Type',
    'Industry': 'Industry',
    'Sector': 'Sector',
    'Revenue': 'Revenue',
    'Competitors': 'Competitors',
}


class GlassdoorJobScraper(BaseCase):
    '''SeleniumBase test case that scrapes Glassdoor job listings.'''

    def _click_if_present(self, selector):
        '''Best-effort click: return True on success, False if the click failed.'''
        try:
            self.click(selector)
            return True
        except Exception:
            return False

    def _text_or_default(self, selector, default=MISSING):
        '''Return the text of selector, or default when it cannot be read.'''
        try:
            return self.get_text(selector)
        except Exception:
            return default

    def _collect_required_details(self):
        '''Read the fields every row must have.

        Retries up to MAX_DETAIL_ATTEMPTS times (sleeping 5s between tries)
        and returns None when the details are still unreadable.
        '''
        for attempt in range(1, MAX_DETAIL_ATTEMPTS + 1):
            try:
                return {
                    'Company Name': self.get_text('div.d-flex.justify-content-between.align-items-start a'),
                    'Location': self.get_text('div.d-flex.justify-content-between.align-items-start span.pr-xxsm'),
                    'Job Title': self.get_text('div.css-1vg6q84.e1tk4kwz4'),
                    'Job Description': self.get_text('div.jobDescriptionContent.desc'),
                }
            except Exception:
                if attempt < MAX_DETAIL_ATTEMPTS:
                    print("Error collecting job details. Retrying...")
                    self.sleep(5)
        return None

    def _collect_company_overview(self):
        '''Open the overview tab and read each company field (MISSING when absent).'''
        if not self._click_if_present('//div[@class="tab" and @data-tab-type="overview"]'):
            # No overview tab for this listing: every company field is missing.
            return {column: MISSING for column in OVERVIEW_LABELS}
        return {
            column: self._text_or_default(
                f'//div[@class="infoEntity"]//label[text()="{label}"]//following-sibling::*'
            )
            for column, label in OVERVIEW_LABELS.items()
        }

    def scrape_jobs(self, keyword, num_jobs, verbose):
        '''Gathers jobs as a dataframe, scraped from Glassdoor.

        Args:
            keyword: Plain-text search term. It is URL-encoded here, so pass
                it unencoded (e.g. 'data scientist').
            num_jobs: Target number of listings to collect.
            verbose: When truthy, print every collected field.

        Returns:
            pandas.DataFrame with one row per collected listing. It can hold
            fewer than num_jobs rows when listings or pages run out. Optional
            fields that could not be read are recorded as -1.
        '''
        # Encode the keyword so spaces and characters such as & or # cannot
        # corrupt the query string.
        encoded_keyword = quote_plus(keyword)

        # Open the Glassdoor job search page
        url = f'https://www.glassdoor.com/Job/jobs.htm?sc.keyword={encoded_keyword}&locT=C&locId=1147401&locKeyword=San%20Francisco,%20CA&jobType=all&fromAge=-1&minSalary=0&includeNoSalaryJobs=true&radius=100&cityId=-1&minRating=0.0&industryId=-1&sgocId=-1&seniorityType=all&companyId=-1&employerSizes=0&applicationType=0&remoteWorkType=0'
        self.open(url)
        self.set_window_size(1120, 1000)
        jobs = []

        while len(jobs) < num_jobs:
            self.sleep(4)

            # Best-effort clicks whose failures are ignored; presumably they
            # dismiss a pop-up / modal that may or may not be shown.
            self._click_if_present('div.selected')
            self.sleep(0.1)
            self._click_if_present('button.ModalStyle__xBtn___29PT9')

            try:
                job_buttons = self.find_elements('li.JobsList_jobListItem__wjTHv')  # Updated class name for job listings
            except Exception:
                job_buttons = []
            # An empty result is treated like a lookup failure: there is
            # nothing to scrape on this page, so stop instead of paging on.
            if not job_buttons:
                print("No job listings found on the page.")
                break
            print(f"Found {len(job_buttons)} job listings on the page.")

            for job_button in job_buttons:
                print(f"Progress: {len(jobs)}/{num_jobs}")
                if len(jobs) >= num_jobs:
                    break

                try:
                    job_button.click()
                    self.sleep(1)
                except Exception:
                    print("Element not clickable.")
                    continue

                required = self._collect_required_details()
                if required is None:
                    print("Skipping listing: required job details could not be read.")
                    continue

                job_title = required['Job Title']
                job_description = required['Job Description']
                company_name = required['Company Name']
                location = required['Location']
                salary_estimate = self._text_or_default('span.css-1imh2hq')
                rating = self._text_or_default('span.css-1m5m32b')

                if verbose:
                    print(f"Job Title: {job_title}")
                    print(f"Salary Estimate: {salary_estimate}")
                    print(f"Job Description: {job_description[:500]}")
                    print(f"Rating: {rating}")
                    print(f"Company Name: {company_name}")
                    print(f"Location: {location}")

                overview = self._collect_company_overview()

                if verbose:
                    print(f"Headquarters: {overview['Headquarters']}")
                    print(f"Size: {overview['Size']}")
                    print(f"Founded: {overview['Founded']}")
                    print(f"Type of Ownership: {overview['Type of ownership']}")
                    print(f"Industry: {overview['Industry']}")
                    print(f"Sector: {overview['Sector']}")
                    print(f"Revenue: {overview['Revenue']}")
                    print(f"Competitors: {overview['Competitors']}")
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                # Key order fixes the DataFrame column order.
                row = {"Job Title": job_title,
                       "Salary Estimate": salary_estimate,
                       "Job Description": job_description,
                       "Rating": rating,
                       "Company Name": company_name,
                       "Location": location}
                row.update(overview)
                jobs.append(row)

            # Target reached: do not page further (and do not report a
            # shortfall that did not happen).
            if len(jobs) >= num_jobs:
                break

            if not self._click_if_present('//li[@class="next"]//a'):
                print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
                break

        return pd.DataFrame(jobs)
"""
# Save the script as a .py file
with open('glassdoor_job_scraper.py', 'w', encoding='utf-8') as file:
    file.write(glassdoor_job_scraper)

In [None]:
# Step 2: Define the test class
# The test module source is kept in a string and written to disk for pytest.
# Docstrings inside it use ''' because the enclosing string is delimited with
# triple double quotes.
test_glassdoor_job_scraper = """
from seleniumbase import BaseCase
from glassdoor_job_scraper import GlassdoorJobScraper


class TestGlassdoorJobScraper(GlassdoorJobScraper):
    '''Smoke test that runs the scraper for a small number of listings.'''

    def test_scraping(self):
        '''Scrape up to 5 listings verbosely and require at least one row.'''
        df = self.scrape_jobs("data scientist", 5, True)
        print(df)
        assert len(df) > 0


# Lets the file be run directly with `python test_glassdoor_job_scraper.py`:
# BaseCase.main() hands the file to pytest when executed as a script and is a
# no-op when the module is merely imported (e.g. during pytest collection).
BaseCase.main(__name__, __file__)
"""
# Save the script as a .py file
with open('test_glassdoor_job_scraper.py', 'w', encoding='utf-8') as file:
    file.write(test_glassdoor_job_scraper)

In [None]:
# Run pytest (via IPython's `!` shell escape) on the created test file.
#   --maxfail=1         stop after the first failing test
#   --disable-warnings  omit the warnings summary from the report
#   -q                  quieter, more compact output
# NOTE(review): pytest captures stdout by default, so the scraper's print()
# output is presumably hidden unless the test fails; add -s to see it — confirm.
!pytest test_glassdoor_job_scraper.py --maxfail=1 --disable-warnings -q