In [2]:
from core import CompanyProduct

from glassdoor.search import find_review
from glassdoor.scraper import scrape_reviews, scrape_jobs, Url

# Company under study; CompanyProduct.same is defined in `core` (not shown in this notebook).
target = CompanyProduct.same("Instacart")
# Locate the review page for the target; the result exposes a `.link` that is reused below
# and again when scraping reviews.
review_page = find_review(target)
# Split the review link into the two values that Url.jobs(...) is later called with.
company, company_id = Url.parse_review_url(review_page.link)


In [3]:
# Scrape sequentially: job listings first, then reviews. Top-level `await` requires an
# async-capable kernel (e.g. IPython/Jupyter).
# NOTE(review): the captured log for this cell reports "scraped 80 jobs ... in 6 pages"
# even though max_pages=1 is passed — confirm how scrape_jobs interprets max_pages.
glassdoor_job_results = await scrape_jobs(Url.jobs(company, company_id), max_pages=1)
glassdoor_review_results = await scrape_reviews(review_page.link, max_pages=1)


[32m2024-08-05 20:27:07.950[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_jobs[0m:[36m82[0m - [1mscraping job listings from https://www.glassdoor.com/Jobs/Instacart-Jobs-E714486.htm?[0m
[32m2024-08-05 20:27:19.878[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_jobs[0m:[36m90[0m - [1mscraped first page of jobs of https://www.glassdoor.com/Jobs/Instacart-Jobs-E714486.htm?, scraping remaining 5 pages[0m
[32m2024-08-05 20:27:22.394[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_jobs[0m:[36m97[0m - [1mscraped 80 jobs from https://www.glassdoor.com/Jobs/Instacart-Jobs-E714486.htm? in 6 pages[0m
[32m2024-08-05 20:27:22.403[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m110[0m - [1mscraping reviews from https://www.glassdoor.com/Reviews/Instacart-Reviews-E714486.htm[0m
[32m2024-08-05 20:27:33.319[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m118[0m - 

In [7]:
from typing import Optional
from pydantic import BaseModel

class GlassdoorJob(BaseModel):
    """Basic job listing info from the company page on Glassdoor.

    Field names mirror the keys of a raw scraped job result so that a result
    dict can be unpacked straight into the constructor
    (``GlassdoorJob(**result)``).

    The pay-related fields are nullable. They are given an explicit ``None``
    default so a listing that omits those keys still validates: pydantic v2
    treats ``Optional[X]`` without a default as a *required* field, whereas
    pydantic v1 already implied a ``None`` default.
    """
    # Fields expected on every listing.
    ageInDays: int
    goc: str  # e.g. 'data scientist' in sample output; exact meaning not documented here
    jobTitleText: str
    locationName: str
    payCurrency: str

    # Pay information; may be null or absent in the raw result.
    payPercentile10: Optional[int] = None
    payPercentile50: Optional[int] = None
    payPercentile90: Optional[int] = None
    payPeriod: Optional[str] = None  # sample output shows 'HOURLY' and 'ANNUAL'
    salarySource: Optional[str] = None  # sample output shows 'EMPLOYER_PROVIDED' and 'ESTIMATED'

    seoJobLink: str

    
# Validate every raw listing and order the models alphabetically by title in one pass.
jobs = sorted(
    (GlassdoorJob(**raw_listing) for raw_listing in glassdoor_job_results),
    key=lambda listing: listing.jobTitleText,
)

# Bare last expression so the notebook displays the sorted list.
jobs

[GlassdoorJob(ageInDays=146, goc='data scientist', jobTitleText='Decision Scientist Contractor, NextGen', locationName='Remote', payCurrency='USD', payPercentile10=146848, payPercentile50=154856, payPercentile90=162864, payPeriod='HOURLY', salarySource='EMPLOYER_PROVIDED', seoJobLink='https://www.glassdoor.com/job-listing/decision-scientist-contractor-nextgen-instacart-JV_KO0,37_KE38,47.htm?jl=1009182685437'),
 GlassdoorJob(ageInDays=280, goc='field service technician', jobTitleText='Field Technician - Los Angeles, CA, United States (flex hours/contract role)', locationName='Los Angeles, CA', payCurrency='USD', payPercentile10=41743, payPercentile50=51254, payPercentile90=62933, payPeriod='ANNUAL', salarySource='ESTIMATED', seoJobLink='https://www.glassdoor.com/job-listing/field-technician-los-angeles-ca-united-states-flex-hours-contract-role-instacart-JV_IC1146821_KO0,70_KE71,80.htm?jl=1008952310073'),
 GlassdoorJob(ageInDays=280, goc='field service technician', jobTitleText='Field Te

In [21]:
from typing import NamedTuple, Optional
from datetime import datetime
from pydantic import BaseModel

class JobTitle(BaseModel):
    """Nested job-title record attached to a review via ``GlassdoorReview.jobTitle``."""
    id: int  # numeric identifier; presumably Glassdoor's own job-title id — TODO confirm
    text: str  # title text

class GlassdoorReview(BaseModel):
    """Wrapper around a Glassdoor review to make autocomplete easier.

    Field names match the keys of a raw review dict, so a review can be built
    with ``GlassdoorReview(**review)``. The "processed" fields rely on pydantic
    to turn the raw values into a ``JobTitle`` model and a ``datetime``.

    Nullable fields default to ``None`` explicitly so a review that omits them
    still validates: pydantic v2 treats ``Optional[X]`` without a default as
    required, whereas pydantic v1 already implied a ``None`` default.
    """
    # raw fields
    advice: Optional[str] = None
    cons: Optional[str] = None
    lengthOfEmployment: int
    pros: Optional[str] = None
    ratingOverall: int
    reviewId: int
    summary: str

    # processed fields
    jobTitle: Optional[JobTitle] = None
    reviewDateTime: Optional[datetime] = None

    @classmethod
    def parse_reviews(cls, raw_results: dict) -> "list[GlassdoorReview]":
        """Parse Glassdoor reviews from the raw API response.

        Args:
            raw_results: Raw scraper response; must contain a ``"reviews"`` key
                holding an iterable of review dicts.

        Returns:
            The parsed reviews ordered oldest-first by ``reviewDateTime``.
            Reviews without a timestamp keep their input order and are placed
            after all dated reviews.

        Raises:
            KeyError: If ``raw_results`` has no ``"reviews"`` key.
            pydantic.ValidationError: If a review dict fails validation.
        """
        parsed_reviews = [cls(**review) for review in raw_results["reviews"]]

        # reviewDateTime is Optional and None cannot be ordered against a
        # datetime, so only the dated reviews are sorted; sorting the full list
        # would raise TypeError as soon as one review lacks a timestamp.
        dated = [review for review in parsed_reviews if review.reviewDateTime is not None]
        undated = [review for review in parsed_reviews if review.reviewDateTime is None]
        dated.sort(key=lambda review: review.reviewDateTime)

        return dated + undated

# Parse the scraped review payload into models and display them.
reviews = GlassdoorReview.parse_reviews(glassdoor_review_results)
reviews

# Debugging aid (disabled): validate the raw reviews one at a time and print the
# first validation error together with the offending raw dict, then stop.
# from pydantic import ValidationError
# from pprint import pprint
# for review in glassdoor_review_results["reviews"]:
#     try:
#         parsed_review = GlassdoorReview(**review)
#     except ValidationError as e:
#         print(e)
#         pprint(review)
#         break

[GlassdoorReview(advice=None, cons='Inconsistent orders, lots of $0 tips', lengthOfEmployment=0, pros='Freedom to create own schedule', ratingOverall=5, reviewId=86685677, summary='Would Recommend', jobTitle=None, reviewDateTime=datetime.datetime(2024, 4, 25, 11, 39, 49, 743000)),
 GlassdoorReview(advice="To improve employee satisfaction and productivity, it is crucial to address favoritism and ensure fair treatment for all employees. Managers should receive proper training to understand their team's roles and responsibilities better, providing the necessary support and guidance. Enhancing communication between departments and protecting employees from cross-functional conflicts will create a more harmonious and efficient work environment. Focus on offering clear career growth paths to retain and develop talent within the company.", cons='-"Unlimited PTO" is misleading when you\'re constantly covering for others\' workloads\n-Favoritism prevalent in the workplace\n-Significant changes 