In [50]:
# Import the necessary Libraries
from typing import Optional, Dict, Tuple
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from rich import print
import urllib

In [51]:
# Add settings and Configurations
chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

In [53]:
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options = chrome_options)

In [54]:
def build_indeed_job_search_url(job_title, location, radius=None):
    """
    Builds a URL for searching jobs on Indeed with a specified job title, location, and optional radius.

    Args:
      job_title: The job title for the search (string).
      location: The desired location for the job search (city, state, or zip code).
      radius: The search radius in miles (optional, integer).

    Returns:
      A string containing the formatted Indeed job search URL.
    """

    # Indeed base URL for job search
    indeed_base_url = "https://www.indeed.com/jobs?"

    # Encode job title and location for safe URL inclusion
    encoded_job_title = urllib.parse.quote(job_title)
    encoded_location = urllib.parse.quote(location)

    # Build the URL with job title and location parameters
    search_url = indeed_base_url + f"q={encoded_job_title}&l={encoded_location}"

    # Add radius parameter if provided
    if radius:
        search_url += f"&radius={radius}"
    return search_url

In [55]:
search_params = [
    'Dental Jobs#Boston, MA# 100 miles',
    'Dental Jobs#Houston, TX# 100 miles',
    'Dental Jobs#Greensboro, NC# 50 miles',
    'Dental Jobs#High point, NC# 50 miles', 
    'Dental Jobs#Wintson-Salem, NC# 50 miles',
    'Dental Jobs#Los Angeles, CA# 100 miles',
    'Dental Jobs#Cleveland OH# 100 miles',
]

In [57]:
def preprocess_links(search_param):
    job, location, distance = search_param.split('#')
    dist= int(distance.strip().split(' ')[0])
    return (job, location, dist)

In [58]:
parsed_output = [preprocess_links(search_param) for search_param in search_params]

links = []
for parsed in parsed_output:
    job, location, distance = parsed
    link = build_indeed_job_search_url(job, location, distance)
    links.append(link)
    
print(links)

In [59]:
def modify_job_link_id(original_url, new_id):
  """
  Modifies the job link in the provided URL by replacing the last ID with a new ID.

  Args:
      original_url: The original job link URL.
      new_id: The new ID to use in the modified link.

  Returns:
      The modified job link URL with the new ID.
  """

  # Split the URL at the last '&' to separate parameters
  url_parts = original_url.split('&')[:-1]  # Exclude the last part containing the ID

  # Extract the last ID (assuming it's after 'vjk=')
  last_id = url_parts[-1].split('=')[-1]

  # Rebuild the URL with the new ID
  modified_url = '&'.join(url_parts) + '&vjk=' + new_id

  return modified_url

In [61]:
def get_data(job_listing):
    # Extract job title
    title = job_listing.find("a").find("span").text.strip()
    
    # Extract company name if available, otherwise assign an empty string
    try:
        company = job_listing.find('span', class_='css-92r8pb eu4oa1w0').text.strip()
    except AttributeError:
        company = ''
    
    # Extract job location if available, otherwise assign an empty string
    try:
        location  = job_listing.find('div', class_='css-1p0sjhy eu4oa1w0').text.strip()
    except AttributeError:
        location = ''
        
    # Extract salary information if available, otherwise assign an empty string
    try:
        salary  = job_listing.find('div', class_='metadata salary-snippet-container css-5zy3wz eu4oa1w0').text.strip()
    except AttributeError:
        salary = ''
    
    # Extract job type if available, otherwise assign an empty string
    try:
        job_type = job_listing.find('div', class_='metadata css-5zy3wz eu4oa1w0').text.strip()
    except AttributeError:
        job_type = ''
    
    # Extract date posted
    date_posted = job_listing.find('span', class_='css-qvloho eu4oa1w0').text.strip()
    
    # Extract job summary
    summary = job_listing.find('div', class_='css-9446fg eu4oa1w0').text.strip()
    
    # Extract Job Link
    link_id = job_listing.find('a').get('data-jk')
    link = modify_job_link_id(current_url, link_id)
    
    
    # Return a tuple containing all the extracted information
    return (title, company, location, salary, job_type, date_posted, summary, link)

In [76]:
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options = chrome_options)
job_list = []
j = 0
for job_search in links:
    print(f'Scraping Link {j+1}')
    website = job_search
    i = 0
    while True:
        browser.get(website)

        # Get the HTML source code of the page after it has fully loaded
        html = browser.page_source

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        current_url = browser.current_url

        job_listings = [get_data(i)for i in soup.find_all('div', class_='job_seen_beacon')]
        job_list.extend(job_listings)
        
        # Extract the next page link from the href attribute
        # Search for a with specific attributes
        desired_attributes = {
            "aria-label": "Next Page",
            "data-testid": "pagination-page-next"  # Optional, if present in your HTML
        }

        
        try:
            next_page_button = soup.find('a', desired_attributes).get('href')
            website = modify_job_link_id(current_url, next_page_button)
        except:
            break
        i+=1
        print(f'Scraping Page {i+1}')
        print(website)
    j+=1
    print(f'Scraped page {j+1}')

# Convert list of records into a DataFrame
print('Converting to Dataframe')
df = pd.DataFrame(job_list, columns=['Title', 'Company', 'Location', 'Salary', 'Job Type', 'Date Posted', 'Summary', 'Job Link'])

# Save DataFrame to a CSV file
df.to_csv('jobs_data.csv', index=False)

print("Data saved to indeeds_job_data.csv")

In [81]:
df = pd.read_csv('jobs_data.csv')
df.head()

Unnamed: 0,Title,Company,Location,Salary,Job Type,Date Posted,Summary,Job Link
0,Dental Technician,Tufts University,"Boston, MA 02111 (Chinatown area)",,,PostedPosted 30+ days ago,A formal education in dental laboratory techno...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
1,Call Center Coordinator - School of Dental Med...,Tufts University,"Boston, MA 02111 (Chinatown area)",,,PostedPosted 1 day ago,The Dental School Call Center department is re...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
2,Research Assistant - School of Dental Medicine,Tufts University,"Boston, MA 02111 (Chinatown area)",,,PostedPosted 30+ days ago,Tufts University School of Dental Medicine see...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
3,Dental Assistant,Massachusetts General Hospital(MGH),"Boston, MA 02113 (North End area)",,,PostedPosted 7 days ago,Stocks dental operatories and maintains dental...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
4,Adjunct Clinical Instructor - Dental Hygiene,MCPHS,"Boston, MA 02115 (Fenway area)",,,PostedPosted 30+ days ago,Understands the scope of practice for dental h...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...


In [79]:
df = df.drop(columns=['Salary', 'Job Type'])
df.head()

Unnamed: 0,Title,Company,Location,Date Posted,Summary,Job Link
0,Dental Technician,Tufts University,"Boston, MA 02111 (Chinatown area)",PostedPosted 30+ days ago,A formal education in dental laboratory techno...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
1,Call Center Coordinator - School of Dental Med...,Tufts University,"Boston, MA 02111 (Chinatown area)",PostedPosted 1 day ago,The Dental School Call Center department is re...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
2,Research Assistant - School of Dental Medicine,Tufts University,"Boston, MA 02111 (Chinatown area)",PostedPosted 30+ days ago,Tufts University School of Dental Medicine see...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
3,Dental Assistant,Massachusetts General Hospital(MGH),"Boston, MA 02113 (North End area)",PostedPosted 7 days ago,Stocks dental operatories and maintains dental...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...
4,Adjunct Clinical Instructor - Dental Hygiene,MCPHS,"Boston, MA 02115 (Fenway area)",PostedPosted 30+ days ago,Understands the scope of practice for dental h...,https://www.indeed.com/jobs?q=Dental+Jobs&l=Bo...


In [80]:
df.to_csv('cleaned job posting.csv')