In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

In [None]:
# Get job listings from LinkedIn
def getListingsInPage(jobTitle, location, num):
    """Fetch one page of LinkedIn guest job-search results.

    Args:
        jobTitle: Search keywords, sent as the ``keywords`` query parameter.
        location: Location filter, sent as the ``location`` query parameter.
        num: Result offset, sent as the ``start`` query parameter.

    Returns:
        The ``<li>`` elements found in the response HTML, or an empty list
        when the server answers with an error status.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    listUrl = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"

    # Hand the values to requests via ``params`` so they are percent-encoded.
    # Pasting them straight into the URL breaks on input such as "C#" or
    # "R&D", where "#" and "&" are interpreted as URL syntax.
    query = {
        "keywords": jobTitle,
        "location": location,
        "geoId": "",
        "trk": "public_jobs_jobs-search-bar_search-submit",
        "start": num,
    }

    # A timeout keeps a stalled connection from hanging the whole run.
    response = requests.get(listUrl, params=query, timeout=30)
    if not response.ok:
        # Do not parse an error page as if it contained job listings.
        return []

    listSoup = BeautifulSoup(response.text, "html.parser")
    return listSoup.find_all("li")

In [None]:
# Extract the ID, title, posting date and location from each job listing
def getIdTitleDateLoc(pageListings):
    """Extract ID, title, posting date and location from listing elements.

    Args:
        pageListings: Iterable of parsed listing elements, each supporting
            ``find(name, attrs)`` (e.g. the ``<li>`` tags returned by
            ``getListingsInPage``).

    Returns:
        A list with one ``[jobID, jobTitle, postingDate, location]`` entry
        per usable listing. ``jobID`` is the fourth colon-separated field of
        the card's ``data-entity-urn`` attribute; the other three values are
        ``None`` when the corresponding element is missing. Listings without
        a usable ID are skipped.
    """
    idTitleDateLoc = []
    for job in pageListings:
        baseCardDiv = job.find("div", {"class": "base-card"})
        urn = baseCardDiv.get("data-entity-urn") if baseCardDiv is not None else None
        urnParts = urn.split(":") if urn else []
        if len(urnParts) < 4:
            # No card, no URN, or an unexpected URN shape: there is no job ID
            # to work with, so skip this element instead of failing the page.
            continue
        jobID = urnParts[3]

        titleTag = job.find("h3", {"class": "base-search-card__title"})
        jobTitle = titleTag.text.strip() if titleTag is not None else None

        dateTag = job.find("time", {"class": "job-search-card__listdate"})
        postingDate = dateTag.get("datetime") if dateTag is not None else None

        locationTag = job.find("span", {"class": "job-search-card__location"})
        location = locationTag.text.strip() if locationTag is not None else None

        idTitleDateLoc.append([jobID, jobTitle, postingDate, location])
    return idTitleDateLoc

In [None]:
def getRequirements(soup):
    """Return the bullet items of the first recognised requirements section.

    Known section headings are tried in order. For each one, the first
    ``strong``/``h2``/``h3``/``h4`` element whose text contains the heading
    (case-insensitive) is located, and the next ``<ul>`` after it supplies
    the result.

    Args:
        soup: Parsed job-posting document.

    Returns:
        The stripped text of each ``<li>`` in the matched list, or an empty
        list when no heading leads to a ``<ul>``.
    """
    headingTags = ("strong", "h2", "h3", "h4")
    knownHeadings = (
        "Minimum Requirements",
        "Qualifications",
        "Job Requirements",
        "Minimum Qualifications",
        "Skills",
        "Experience Required",
        "Skills Required",
        "Requirements",
        "Preferred Qualifications",
        "You will be a great fit if you have:",
        "Must Have",
        "Skills and Qualifications",
        "What you bring to the table?",
        "Required Skills",
        "What we are looking for in you",
        "Skills and Responsibilities:",
        "Who You Might Be",
        "Experience",
        "Job Description",
        "What You'll Need",
        "What We Look For",
        "What you bring",
        "Who you are",
        "Basic Qualifications",
        "Your Qualifications",
        "Required experience and skills",
    )

    for heading in knownHeadings:

        def isSectionHeading(tag):
            # Substring match, ignoring case, restricted to heading-like tags.
            return tag.name in headingTags and heading.lower() in tag.text.lower()

        anchor = soup.find(isSectionHeading)
        if not anchor:
            continue

        bulletList = anchor.find_next("ul")
        if not bulletList:
            # Heading found but no list follows it; try the next heading.
            continue

        return [entry.text.strip() for entry in bulletList.find_all("li")]

    return []


In [None]:
def getCriteria(soup):
    """Collect the job-criteria entries of a posting into a dict.

    Args:
        soup: Parsed job-posting document.

    Returns:
        A dict mapping each criteria subheader (stripped and lower-cased) to
        its stripped value text. Items lacking either the subheader or the
        value are left out; the dict is empty when the criteria list is not
        present.
    """
    listTag = soup.find("ul", {"class": "description__job-criteria-list"})
    if not listTag:
        return {}

    found = {}
    rows = listTag.find_all("li", {"class": "description__job-criteria-item"})
    for row in rows:
        label = row.find("h3", {"class": "description__job-criteria-subheader"})
        detail = row.find(
            "span",
            {"class": "description__job-criteria-text description__job-criteria-text--criteria"},
        )
        if not (label and detail):
            continue
        key = label.text.strip().lower()
        found[key] = detail.text.strip()
    return found


In [None]:
def addToJson(idTitleDateLoc, file):
    """Fetch details for not-yet-stored jobs and persist them to a JSON file.

    The file at ``/dbfs/mnt/data/{file}`` is read if it exists. For every
    entry whose job ID is not already stored, the job-posting page is
    fetched, parsed and appended. The full list is then written back.

    Args:
        idTitleDateLoc: Iterable of ``[jobID, jobTitle, postingDate,
            location]`` entries, as produced by ``getIdTitleDateLoc``.
        file: File name inside ``/dbfs/mnt/data/``.

    Returns:
        The complete list of job-posting dicts that was written to the file.

    Raises:
        json.JSONDecodeError: If the existing file does not contain valid
            JSON; it is left untouched in that case.
    """
    dbfsPath = f"/dbfs/mnt/data/{file}"
    jobPostings = []
    existingIds = set()

    try:
        with open(dbfsPath, 'r', encoding="utf-8") as jsonFile:
            # NOTE(review): this message is printed when the existing file is
            # opened, before anything has been added -- confirm the intended
            # wording/placement.
            print("Added.")
            jobPostings = json.load(jsonFile)
            existingIds = {job["jobID"] for job in jobPostings}
    except FileNotFoundError:
        print("No existing data found. Starting fresh.")

    for job in idTitleDateLoc:
        jobID = job[0]
        if jobID in existingIds:
            continue

        jobUrl = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{jobID}"
        try:
            # A timeout keeps one stalled request from hanging the batch.
            jobResponse = requests.get(jobUrl, timeout=30)
        except requests.RequestException as error:
            # Losing one posting is better than losing the whole batch.
            print(f"Skipping job {jobID}: {error}")
            continue
        if not jobResponse.ok:
            # Storing a record built from an error page would save an empty
            # posting that the ID check above would never refresh.
            print(f"Skipping job {jobID}: HTTP {jobResponse.status_code}")
            continue

        jobSoup = BeautifulSoup(jobResponse.text, "html.parser")

        jobPost = {
            "jobID": jobID,
            "jobTitle": job[1],
            "DatePosted": job[2],
            "Location": job[3]
        }

        companyLink = jobSoup.find("a", {"class": "topcard__org-name-link topcard__flavor--black-link"})
        jobPost["companyName"] = companyLink.text.strip() if companyLink is not None else None

        jobPost["requirements"] = getRequirements(jobSoup)
        jobPost.update(getCriteria(jobSoup))

        jobPostings.append(jobPost)
        # Remember the ID so a duplicate later in this same batch is not
        # fetched and stored a second time.
        existingIds.add(jobID)

    # Write the updated job postings to the file once after processing all jobs
    with open(dbfsPath, 'w', encoding="utf-8") as jsonFile:
        json.dump(jobPostings, jsonFile, indent=4)

    return jobPostings


In [None]:
def getDataForJobTitle(title, loc, end, f):
    """Scrape successive result pages for a job title and store them.

    Requests result offsets 10, 20, ... up to but not including ``end``.
    Each page is fetched with ``getListingsInPage``, reduced with
    ``getIdTitleDateLoc`` and persisted with ``addToJson``.

    Args:
        title: Job title to search for.
        loc: Location to search in.
        end: Exclusive upper bound (int) for the result offset.
        f: Name of the JSON file passed on to ``addToJson``.
    """
    # A bounded range always terminates. The previous ``while n != end`` loop
    # never ended when ``end`` was below 10 or not a multiple of 10, and its
    # ``except: continue`` skipped the increment, retrying a failing page
    # forever.
    # NOTE(review): the first offset requested is 10, so results at offset 0
    # are never fetched -- confirm this is intended.
    for n in range(10, end, 10):
        try:
            pageListings = getListingsInPage(title, loc, str(n))
            details = getIdTitleDateLoc(pageListings)
            addToJson(details, f)
        except Exception as error:
            # ``Exception`` (not a bare except) lets Ctrl-C still stop the run.
            print(f"Skipping results at offset {n}: {error}")
    print("Data successfully added for "+title+ ' postings in ' + loc)