### Scraping NodeFlair Job Postings using BeautifulSoup

In [9]:
# !pip install beautifulsoup4
# !pip install requests
# !pip install pandas

# !pip install selenium
# !pip install webdriver-manager

In [2]:
import time
from datetime import datetime, timedelta
import requests 
import pandas as pd

from bs4 import BeautifulSoup 
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

#### Input Search URL and get HTML
- Use selenium to load the page because the webpage loads the data dynamically (i.e. the full HTML will not be returned by using BeautifulSoup)

In [93]:
# chromedriver_path = "C:/Users/user/chromedriver.exe"
driver = webdriver.Chrome(executable_path = "C:/Users/user/chromedriver.exe")

  driver = webdriver.Chrome(executable_path = "C:/Users/user/chromedriver.exe")


In [119]:
num_pages = 247

# Initialise list to store details of each individual job listing post
metadata = []

# Iterate through all pages
for page_num in range(1, num_pages+1):
    url = f"https://www.nodeflair.com/jobs?query=cloud&page={page_num}&sort_by=recent#"
    driver.get(url) 
    time.sleep(3)

    # Get Soup using BeautifulSoup after loading complete HTML
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    # Find all job posting cards
    job_listings = soup.find_all("div", class_="col-12 col-sm-12 col-md-6 col-lg-4")
    
    # Iterate through all (12) job listings on a page to get link to individual listing and date posted
    for listing in job_listings:
        job_url = listing.find("a").get("href")
        date = listing.find("p", {"style": "font-style: italic; color: rgb(131, 131, 131);"}).get_text()

        metadata.append([job_url, date])

In [121]:
# Convert each job posting to a row in a DataFrame
job_listings_df = pd.DataFrame(data = metadata, columns=["URL", "Date posted"])
job_listings_df

Unnamed: 0,URL,Date posted
0,/jobs/53907,about 11 hours ago
1,/jobs/53898,about 11 hours ago
2,/jobs/53894,about 11 hours ago
3,/jobs/53893,about 11 hours ago
4,/jobs/53891,about 11 hours ago
...,...,...
2959,/jobs/46008,about 2 months ago
2960,/jobs/46001,about 2 months ago
2961,/jobs/45989,about 2 months ago
2962,/jobs/45982,about 2 months ago


In [122]:
job_listings_df.to_csv("nodeflair_links.csv")

#### Function to get data from each job posting
- Job Title ✔
- Company ✔
- Salary ✔
- Job Type ✔
- Seniority ✔
- Years of Experience ✔
- Tech Stack ✔
- Job Description ✔
  - Click Read More 

In [88]:
def getJobTitle(soup):
    job_title = soup.find("h1", class_="job-title-name").get_text()
    return job_title

def getCompanyName(soup):
    company_name = soup.find("h2", class_="company-title-name").get_text()
    return company_name

def getSeniorityTechStacks(soup):
    # Both Seniority and Tech Stack have the same HTML class 
    tags = [tag.get_text() for tag in soup.find_all("div", class_="grey-tag")]
    cleaned_tags = [tag.replace("\n", "").replace(" ", "") for tag in tags]

    seniority_tags = ["Mid", "Senior", "Lead", "Manager", "Director", "Principal", "Intern", "Junior"]
    seniority = []
    tech_stack = []

    # Differentiating between tags for Seniority and Tech Stack
    for tag in cleaned_tags:
        if tag in seniority_tags:
            seniority.append(tag)
        else:
            tech_stack.append(tag)

    return seniority, tech_stack

def getSalaryJobTypeYears(soup):
    # Salary, Job Type, Years of Experience do not have classes tagged to them 
    # Some information can be missing. Find if the headers exist first
    all_b = soup.find_all("b")

    # Can only be identified by text between <br> tags
    all_br = soup.find_all("br")

    required_headers = ["Salary", "Job Type", "Years of Experience"]
    headers = []
    for header in all_b:
        text = header.get_text()
        if text in required_headers:
            headers.append(text)

    output = []
    for b in all_br:
        # Content after the <br> tag
        next_s = b.nextSibling

        if str(type(next_s)) == '<class \'bs4.element.NavigableString\'>':
            text = next_s.get_text()
            if text != "\n":
                output.append(text.strip())

    salary = '-'
    job_type = '-'
    years_of_experience = '-'

    for i in range(len(headers)):
        if headers[i] == "Salary":
            salary = output[i]
        elif headers[i] == "Job Type":
            job_type = output[i]
        elif headers[i] == "Years of Experience":
            years_of_experience = output[i]
    
    return salary, job_type, years_of_experience


def getJobDesc(soup):
    job_description_container = soup.find("div", id="job-description")

    text_list = []
    for text in job_description_container.find_all("div"):
        text = text.get_text()
        if text != "":
            text_list.append(text)

    for text in job_description_container.find_all("ul"):
        points = text.find_all("li")
        for point in points:
            if point != "":
                text_list.append(point.get_text())

    return " ".join(text_list)

In [47]:
def getData(soup):
    job_title = getJobTitle(soup)
    company_name = getCompanyName(soup)
    seniority, techstack = getSeniorityTechStacks(soup)
    salary, job_type, years_of_experience = getSalaryJobTypeYears(soup)
    

    job_desc = getJobDesc(soup)

    data = [job_title, company_name, seniority, salary, job_type, years_of_experience, techstack, job_desc]

    return data

In [89]:
def scrapePost(job_url):
    print("scraping:", job_url)

    try: 
        new_url = f"https://www.nodeflair.com/{job_url}"
        driver.get(new_url) 

        # Manually expand the size of the container to display all text
        expand_text_toggle = driver.execute_script("document.getElementById('job-description').style.height = '1000px';")

        # Manually display all hidden text 
        read_more_toggle = driver.execute_script("document.getElementById('job-description').getAttribute('aria-expanded').innerHTML = 'true';")
        time.sleep(3)
    
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        data = getData(soup)
    
    except Exception as e:
        print(e)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")

        data = getData(soup)
        
    print(" done scraping")
    return data

In [155]:
def getDatePosted(text):
    date_of_scraping = datetime.now()
    date_posted = ''

    if "hours" in text:
        date = date_of_scraping + timedelta(days=-1)
        date_posted = date.date().strftime('%y-%m-%d')
        
    elif "months" in text:
        n_months = text.split(" ")[1]
        date = date_of_scraping + timedelta(months=-n_months)
        date_posted = date.date().strftime('%y-%m-%d')

    return date_posted

In [131]:
job_listings_df = pd.read_csv("../data/nodeflair/nodeflair_links.csv", index_col=0)
job_listings_df.head()

Unnamed: 0,URL,Date posted
0,/jobs/53907,about 11 hours ago
1,/jobs/53898,about 11 hours ago
2,/jobs/53894,about 11 hours ago
3,/jobs/53893,about 11 hours ago
4,/jobs/53891,about 11 hours ago


In [114]:
def format_df(df):
    df["Job Title"] = df["data"].apply(lambda x: x[0])
    df["Company Name"] = df["data"].apply(lambda x: x[1])
    df["Seniority"] = df["data"].apply(lambda x: x[2])
    df["Salary"] = df["data"].apply(lambda x: x[3])
    df["Job Type"] = df["data"].apply(lambda x: x[4])
    df["Years of Experience"] = df["data"].apply(lambda x: x[5])
    df["Tech Stack"] = df["data"].apply(lambda x: x[6])
    df["Job Desc"] = df["data"].apply(lambda x: x[7])
    # job_listings_df["Date posted"] = job_listings_df["Date posted"].apply(getDatePosted)
    return df

In [136]:
print("Date of scraping:", datetime.now())
output_df = pd.DataFrame()
start = 0
end = 100

while end <= len(job_listings_df):
    try:
        driver = webdriver.Chrome(executable_path = "C:/Users/user/chromedriver.exe")
        job_listings_df_100 = job_listings_df[start:end+1]
        job_listings_df_100["data"] = job_listings_df_100["URL"].apply(scrapePost)
        print(f"{end} done")
        
        output_df = pd.concat([output_df, format_df(job_listings_df_100)])
        start = end+1
        end += 100

    except:
        driver = webdriver.Chrome(executable_path = "C:/Users/user/chromedriver.exe")
        job_listings_df_100 = job_listings_df[start:end+1]
        job_listings_df_100["data"] = job_listings_df_100["URL"].apply(scrapePost)
        print(f"{end} done")
        
        output_df = pd.concat([output_df, format_df(job_listings_df_100)])
        start = end+1
        end += 100

scraping: /jobs/53907
 done scraping
scraping: /jobs/53898
 done scraping
scraping: /jobs/53894
 done scraping
scraping: /jobs/53893
 done scraping
scraping: /jobs/53891
 done scraping
scraping: /jobs/53885
 done scraping
scraping: /jobs/53879
 done scraping
scraping: /jobs/53876
 done scraping
scraping: /jobs/53870
 done scraping
scraping: /jobs/53869
 done scraping
scraping: /jobs/53862
 done scraping
scraping: /jobs/53861
 done scraping
scraping: /jobs/53860
 done scraping
scraping: /jobs/53856
 done scraping
scraping: /jobs/53852
 done scraping
scraping: /jobs/53849
 done scraping
scraping: /jobs/53847
 done scraping
scraping: /jobs/53845
 done scraping
scraping: /jobs/53842
 done scraping
scraping: /jobs/53839
 done scraping
scraping: /jobs/53838
 done scraping
scraping: /jobs/53837
 done scraping
scraping: /jobs/53836
 done scraping
scraping: /jobs/53835
 done scraping
scraping: /jobs/53833
 done scraping
scraping: /jobs/53832
 done scraping
scraping: /jobs/53830
 done scraping
s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  job_listings_df["data"] = job_listings_df["URL"].apply(scrapePost)


In [140]:
output_df

Unnamed: 0,URL,Date posted,data,Job Title,Company Name,Seniority,Salary,Job Type,Years of Experience,Tech Stack,Job Desc
0,/jobs/53907,about 11 hours ago,"[ReactJS Developer (Full Stack), Apar Technolo...",ReactJS Developer (Full Stack),Apar Technologies,"[Mid, Junior]","$6,419 - $8,819 SGD / Monthly",Permanent,2-3 years,"[Docker, CloudFoundry, Spring, SonarQube, TDD,...",We are looking for a candidate to fill in the ...
1,/jobs/53898,about 11 hours ago,"[Manager, SRE, Rakuten Viki, [Manager], -, Per...","Manager, SRE",Rakuten Viki,[Manager],-,Permanent,Information not provided,"[Docker, API, PagerDuty, GKE, ELK, Sprint, UNI...",The SRE team at Viki is responsible for buildi...
2,/jobs/53894,about 11 hours ago,"[DevOps Engineer, GovTech, [Junior], $5,800 - ...",DevOps Engineer,GovTech,[Junior],"$5,800 - $9,600 SGD / Monthly",Permanent,At least 2 years,"[Docker, DockerCompose, Fluentd, Clair, Packer...",Our team in GovTech works on highly impactful ...
3,/jobs/53893,about 11 hours ago,"[VP, System Analyst, United Overseas Bank Limi...","VP, System Analyst",United Overseas Bank Limited (UOB),[Senior],-,Permanent,10-15 years,"[ETL, Oracle, Experian, Strategy, Teradata, Ql...",The Technology and Operations function is comp...
4,/jobs/53891,about 11 hours ago,"[DevOps & Lab Manager (SG - Edge), Dell Techno...",DevOps & Lab Manager (SG - Edge),Dell Technologies,[Manager],"$11,000 - $22,000 SGD / Monthly",Permanent,At least 12 years,"[Docker, Strategy, Container, Microsoft, CI, N...",Dell Technologies is seeking an entrepreneuria...
...,...,...,...,...,...,...,...,...,...,...,...
2896,/jobs/46156,about 2 months ago,"[DevOps Engineer, 2C2P, [Mid], $7,000 - $10,00...",DevOps Engineer,2C2P,[Mid],"$7,000 - $10,000 SGD / Monthly",Permanent,At least 3 years,"[GitLab, HTTP, UDP, TCP, ShellScript, GitLabCI...",2C2P is looking for a .NET DevOps Engineer to ...
2897,/jobs/46153,about 2 months ago,"[DevOps Engineer, FINXFLO, [], -, Permanent, I...",DevOps Engineer,FINXFLO,[],-,Permanent,Information not provided,"[Strategy, ShellScript, CI, Shell, UNIX, JavaS...",Alpha Stone Capital is looking for an amazing ...
2898,/jobs/46151,about 2 months ago,"[DevOps Engineer, Quilt.AI, [Mid], -, Permanen...",DevOps Engineer,Quilt.AI,[Mid],-,Permanent,At least 3 years,"[Next.js, Docker, Cloudflare, DockerSwarm, Str...",As part of a growing team consisting of ML exp...
2899,/jobs/46149,about 2 months ago,"[Software Engineer, Zoku Integrated Commerce, ...",Software Engineer,Zoku Integrated Commerce,[Mid],-,Permanent,At least 3 years,"[API, Magento, CI, DOM, Node.js, NoSQL, JavaSc...",We are hiring software engineers with expertis...


In [139]:
output_df.to_csv("nodeflair_jobpostings.csv")