In [2]:
from typing import Optional

from core import CompanyProduct
from search import search, SearchResult

def find_indeed_jobs(target: CompanyProduct) -> Optional[SearchResult]:
    """Find the Indeed company page for ``target.company``.

    Runs a single-result search restricted to ``www.indeed.com/cmp`` with the
    company name as a quoted phrase.

    Args:
        target: Company/product pair; only ``target.company`` is used here.

    Returns:
        The first search result, or ``None`` when the search yields nothing.
    """
    results = list(search(f'site:www.indeed.com/cmp "{target.company}"', num=1, debug=False))
    if not results:
        # Make the "nothing found" outcome explicit so callers know to check
        # for it before dereferencing the result.
        return None
    return results[0]

# Look up the Indeed company page for Rad AI. find_indeed_jobs yields None when
# the search comes back empty, so the value shown below may be None.
company_search_results = find_indeed_jobs(CompanyProduct('Rad AI', 'Omni'))
# Bare name as the last expression so the notebook displays the result.
company_search_results

SearchResult(title='RAD AI Jobs and Careers | Indeed.com', link='https://www.indeed.com/cmp/Rad-Ai/jobs', snippet='14 RAD AI jobs. Apply to the latest jobs near you. Learn about salary, employee reviews, interviews, benefits, and work-life balance.', formattedUrl='https://www.indeed.com/cmp/Rad-Ai/jobs')

In [3]:
from pprint import pprint
import src.scrapfly_scrapers.indeed as indeed
# Set the "cache" entry of the scraper module's BASE_CONFIG before any request
# is made (presumably enables response caching on the scraping backend -- confirm).
indeed.BASE_CONFIG["cache"] = True

# Scrape the job listings reachable from the company page link found earlier.
# NOTE(review): this dereferences .link, so it fails if the search found nothing.
company_result = await indeed.scrape_search(company_search_results.link)
# pprint(company_result)  # uncomment to inspect the raw scrape output


[32m2024-08-06 17:01:38.458[0m | [1mINFO    [0m | [36mindeed[0m:[36mscrape_search[0m:[36m49[0m - [1mscraping search: https://www.indeed.com/cmp/Rad-Ai/jobs[0m
[32m2024-08-06 17:01:39.534[0m | [1mINFO    [0m | [36mindeed[0m:[36mscrape_search[0m:[36m64[0m - [1mfound total pages 0 search pages[0m


scraping remaining -1.0 pages


In [4]:
from pydantic import BaseModel, model_validator
from typing import List, Dict, Optional

class Salary(BaseModel):
    """Salary snippet nested inside a job overview record.

    ``source`` and ``text`` default to ``None`` so records that omit those keys
    still validate. Using field defaults (instead of a "before" validator that
    inserts the keys) leaves the caller's input dict untouched and also works
    when the input is not a plain dict.
    """

    # Currency code; may be an empty string when no salary is listed.
    currency: str
    salaryTextFormatted: bool
    # Optional pieces of the snippet; absent keys become None.
    source: Optional[str] = None
    text: Optional[str] = None

class Attribute(BaseModel):
    """A single labelled attribute of a job listing."""
    # Human-readable label, e.g. 'Full-time' or 'Health insurance'.
    label: str
    # Short identifier supplied with the label (e.g. 'CF3CP'); meaning is not
    # visible here -- presumably Indeed's internal id, confirm.
    suid: str

class Attributes(BaseModel):
    """A named group of Attribute entries."""
    # Members of the group; may be empty.
    attributes: List[Attribute]
    # Group name, e.g. 'job-types', 'shifts' or 'remote'.
    label: str

class JobOverview(BaseModel):
    """One job entry from the company job-search scrape.

    Every field is required; only the fields this notebook uses are declared.
    """
    # Integer timestamp; sample values look like epoch milliseconds -- confirm.
    createDate: int
    displayTitle: str
    expired: bool
    # Location as display text, e.g. 'Remote'.
    formattedLocation: str
    # Relative age as display text, e.g. '30+ days ago'.
    formattedRelativeTime: str
    jobLocationCity: str
    # Key later passed to the job-detail scraper.
    jobkey: str
    # Integer timestamp; sample values look like epoch milliseconds -- confirm.
    pubDate: int
    remoteLocation: bool
    title: str
    # Nested salary snippet, validated as a Salary model.
    salarySnippet: Salary
    truncatedCompany: str
    # Grouped attributes (job types, shifts, remote, benefits, ...).
    taxonomyAttributes: List[Attributes]

# Validate each scraped record against JobOverview; each record is unpacked as
# keyword arguments, so it must be a mapping.
job_overviews = [JobOverview(**job) for job in company_result]
# Bare name as the last expression so the notebook displays the parsed models.
job_overviews

[JobOverview(createDate=1705091739000, displayTitle='[ Choose Your Own Role ]', expired=False, formattedLocation='Remote', formattedRelativeTime='30+ days ago', jobLocationCity='Remote', jobkey='3b224a1fc510b30f', pubDate=1705039200000, remoteLocation=True, title='[ Choose Your Own Role ]', salarySnippet=Salary(currency='', salaryTextFormatted=False, source=None, text=None), truncatedCompany='Rad AI', taxonomyAttributes=[Attributes(attributes=[Attribute(label='Full-time', suid='CF3CP')], label='job-types'), Attributes(attributes=[], label='shifts'), Attributes(attributes=[Attribute(label='Remote', suid='DSQF7')], label='remote'), Attributes(attributes=[Attribute(label='Health savings account', suid='7KV6C'), Attribute(label='Health insurance', suid='EY33Q'), Attribute(label='Dental insurance', suid='FQJ2X'), Attribute(label='Flexible spending account', suid='G85UP'), Attribute(label='Paid time off', suid='HW4J4'), Attribute(label='Vision insurance', suid='RZAT2'), Attribute(label='401(

In [5]:
# Collect the job key of every parsed overview.
jobKeys = [job.jobkey for job in job_overviews]
jobKeys  # not the cell's last expression, so by default nothing is displayed for it

# jobs = jobKeys[:2]
# Fetch the detail payload for every collected job key.
job_detail_results = await indeed.scrape_jobs(jobKeys)


[32m2024-08-06 17:01:39.572[0m | [1mINFO    [0m | [36mindeed[0m:[36mscrape_jobs[0m:[36m89[0m - [1mscraping 14 job listings[0m


In [6]:
from pprint import pprint

# Pretty-print the first raw detail record to see which keys it provides.
pprint(job_detail_results[0])

{'companyImagesModel': {'ejiBannerAsBackground': False,
                        'enhancedJobDescription': False,
                        'featuredEmployer': False,
                        'headerImageUrl': None,
                        'logoAltText': 'Rad AI logo',
                        'logoImageOverlayLower': False,
                        'logoUrl': None,
                        'showBannerTop': False,
                        'showEnhancedJobImp': False,
                        'showIconInTitle': False},
 'companyName': 'Rad AI',
 'companyOverviewLink': 'https://www.indeed.com/cmp/Rad-Ai?campaignid=mobvjcmp&from=mobviewjob&tk=1i4kg5vhvjiik800&fromjk=29e297a73748ec9b',
 'companyReviewLink': 'https://www.indeed.com/cmp/Rad-Ai/reviews?campaignid=mobvjcmp&cmpratingc=mobviewjob&from=mobviewjob&tk=1i4kg5vhvjiik800&fromjk=29e297a73748ec9b&jt=Principal+Machine+Learning+Research+Scientist',
 'companyReviewModel': None,
 'description': '<div>\n'
                ' <h2 class="jobSectionHeader

In [7]:
import urllib.parse

class JobDetails(BaseModel):
    """Fields of a scraped job-detail record that this notebook relies on."""

    companyName: str
    # Link to the company overview; its query string carries the job key.
    companyOverviewLink: str
    companyReviewLink: str
    # Job description body, HTML formatted.
    description: str
    formattedLocation: str
    jobNormTitle: Optional[str]
    jobTitle: str
    jobType: str
    jobTypes: Optional[List[str]]
    location: Optional[str]
    remoteLocation: bool
    remoteWorkModel: Dict
    salaryCurrency: Optional[str]
    salaryMax: Optional[int]
    salaryMin: Optional[int]
    salaryType: Optional[str]
    subtitle: str

    @property
    def job_key(self):
        """Job key taken from the ``fromjk`` query parameter of ``companyOverviewLink``.

        The record has no dedicated field for the key, so it is recovered from
        the link. A link without a ``fromjk`` parameter raises ``KeyError``.
        """
        overview_url = urllib.parse.urlparse(self.companyOverviewLink)
        query_params = urllib.parse.parse_qs(overview_url.query)
        # parse_qs maps each parameter to a list of values; use the first one.
        fromjk_values = query_params["fromjk"]
        return fromjk_values[0]

    @property
    def job_link(self):
        """Permalink to the job listing, built from ``job_key``."""
        return f"https://www.indeed.com/viewjob?jk={self.job_key}"


# Validate each raw detail record against JobDetails; each record is unpacked
# as keyword arguments, so it must be a mapping.
job_details = [JobDetails(**job) for job in job_detail_results]


from markdownify import markdownify as md

# Print a short preview per job: title, company, location, job type, and the
# first 300 characters of the description after converting its HTML to Markdown.
for job_detail in job_details:
    print(f"""
# {job_detail.jobTitle} at {job_detail.companyName}
- {job_detail.formattedLocation}
- {job_detail.jobType}

{md(job_detail.description)[:300]}
""")


# Principal Machine Learning Research Scientist at Rad AI
- Remote
- Full-time


**About Rad AI**
----------------


 We have raised $80\+ million to date from venture funds and just closed on our series B financing with investors Khosla Ventures, Gradient (Google’s AI fund) and ARTIS. We’ve also formed a partnership with Google to collaborate on the future of generative AI to 


# Accounting Manager at Rad AI
- Remote
- Full-time


**About Rad AI**
----------------


 We have raised $80\+ million to date from venture funds and just closed on our series B financing with investors Khosla Ventures, Gradient (Google’s AI fund) and ARTIS. We’ve also formed a partnership with Google to collaborate on the future of generative AI to 


# Legal Counsel at Rad AI
- Remote
- Full-time


**About Rad AI**
----------------


 We have raised $80\+ million to date from venture funds and just closed on our series B financing with investors Khosla Ventures, Gradient (Google’s AI fund) and ARTIS. We’ve

In [8]:
# Spot-check the permalink derived for the first job.
job_details[0].job_link

'https://www.indeed.com/viewjob?jk=29e297a73748ec9b'

In [9]:
# Build one Markdown document per job: a heading that links the title to the
# job permalink, followed by the full description converted from HTML.
jd_markdowns = [f"""
# [{job_detail.jobTitle}]({job_detail.job_link}) at {job_detail.companyName}

{md(job_detail.description)}
""" for job_detail in job_details]

In [10]:
# Join all per-job documents and report the total size in characters, as a
# rough check of how much text will be handed to the model.
unified_markdown = "\n".join(jd_markdowns)
print(f"Unified Markdown:\n{len(unified_markdown):,} characters")

Unified Markdown:
68,196 characters


In [11]:
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages.ai import AIMessage
from langchain_openai import ChatOpenAI

from core import CompanyProduct
from dotenv import load_dotenv

# Load settings from a .env file (presumably supplies the API key needed by
# ChatOpenAI -- confirm).
load_dotenv()


# Chat prompt for the job-description summary. The system message gives the
# summarization instructions; the human message is filled in with three
# template variables: {company_name}, {candidate_title} and {text}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You're an expert in gleaning information from corporate job descriptions, and you'll be provided with several open job descriptions from a single company in markdown format. 
You'll also be provided with a recent job title of a prospective candidate.

Review all job descriptions and summarize key information and insights that may be relevant for this candidate.
Examples of information that would be useful include:
- For highly relevant roles, a summary of what's special or unique about the roles at this company compared to other companies working in the same field
- If there are different seniority levels of relevant roles, a summary of general expectations for each level
- In a software engineering role, a summary of technologies used or skills required separated by type (e.g, machine learning, data engineering, backend engineering, frontend engineering)
- A summary of any unique benefits or perks offered by the company
- A summary of the company's culture and values as reflected in the job descriptions
- A summary of the company's growth and expansion plans as reflected in the job descriptions

Format the output as a markdown document.
When summarizing, reference the source of the information with a markdown link, as in ([Job Title](https://permalink)).

At the end of the document, include a list of the sources that were used to generate the summary as a list of markdown links.
            """,
        ),
        (
            "human",
            """
            Company: {company_name}
            Recent job title(s) of the candidate: {candidate_title}
            
            Job descriptions: 
            {text}
            """,
        ),
    ]
)


def summarize(
    target: CompanyProduct,
    candidate_title: str,
    job_description_markdowns: List[str],
    debug=True,
    model: str = "gpt-4o-mini",
) -> AIMessage:
    """Summarize a company's open job descriptions for a prospective candidate.

    Args:
        target: Company/product pair; only ``target.company`` is sent to the model.
        candidate_title: Recent job title(s) of the candidate, passed to the prompt as-is.
        job_description_markdowns: One Markdown document per job description.
        debug: When true, print the size of the combined context in characters.
        model: Name of the OpenAI chat model to use.

    Returns:
        The model's reply message; the Markdown summary is in its ``content``.
    """
    # Separate the documents with a blank line so they do not run together.
    unified_markdown = "\n\n".join(job_description_markdowns)

    if debug:
        print(f"{len(unified_markdown):,} characters in unified context")

    # temperature=0 keeps the summary as repeatable as the model allows.
    llm = ChatOpenAI(model=model, temperature=0)

    runnable = prompt | llm
    return runnable.invoke(
        {
            "text": unified_markdown,
            "company_name": target.company,
            "candidate_title": candidate_title,
        }
    )

# Summarize the Rad AI job descriptions for a candidate with these recent titles.
summary = summarize(CompanyProduct('Rad AI', 'Omni'), "Senior Machine Learning Engineer, Principal Research Scientist", jd_markdowns)
# The reply is a message object; its content attribute holds the Markdown text.
print(summary.content)

68,209 characters in unified context
# Summary of Key Information for Rad AI

## Company Overview
Rad AI is a rapidly growing healthcare AI company that has raised over $80 million in funding, including a recent Series B round. The company has formed a partnership with Google to advance generative AI in healthcare and is recognized as a leading innovator in the field. Rad AI's mission is to empower physicians with AI to improve patient care, reduce burnout, and enhance diagnostic accuracy.

## Unique Aspects of Roles at Rad AI
- **Interdisciplinary Collaboration**: Many roles emphasize collaboration between technical teams and clinical experts, highlighting the importance of integrating AI solutions into real-world healthcare settings.
- **Focus on Innovation**: The company encourages a culture of innovation, with roles designed to push the boundaries of AI applications in healthcare.
- **Remote-First Culture**: Rad AI operates as a remote-first company, allowing for location flexibili

In [14]:
# Save the generated summary for later evaluation. The encoding is pinned to
# UTF-8 because the text contains non-ASCII characters (e.g. curly apostrophes)
# and the platform default encoding may not be able to write them.
# The "evaluation" directory must already exist.
with open("evaluation/job_description_summary_radai.md", "w", encoding="utf-8") as f:
    f.write(summary.content)