In [2]:
# Import the necessary Libraries
import urllib
import urllib.parse
from time import sleep
from typing import Optional, Dict, Tuple

import pandas as pd
from bs4 import BeautifulSoup
from rich import print
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Chrome launch configuration used when the notebook creates a driver.
# '--headless' is intentionally left out of the tuple (it was disabled before);
# add it there to enable it.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [4]:
# Start a Chrome session: the value returned by ChromeDriverManager().install()
# is handed to the Chrome service, together with the options configured above.
driver_path = ChromeDriverManager().install()
browser = webdriver.Chrome(service=ChromeService(driver_path), options=chrome_options)

In [5]:
def build_indeed_job_search_url(job_title, 
                                location, 
                                radius=None, 
                                fromage=None):
    """
    Build an Indeed job-search URL for a job title and location.

    Args:
      job_title: The job title / query text for the search (string).
      location: The desired location for the job search (city, state, or zip code).
      radius: Optional value for the ``radius`` query parameter (the notebook
        passes a distance in miles). ``None`` omits the parameter; ``0`` is kept.
      fromage: Optional value for the ``fromage`` query parameter (the notebook
        passes a number of days). ``None`` omits the parameter; ``0`` is kept.

    Returns:
      A string containing the formatted Indeed job search URL.
    """
    # Indeed base URL for job search
    indeed_base_url = "https://www.indeed.com/jobs?"

    # Percent-encode the free-text parts so spaces, commas, '&', '#', etc. cannot
    # break the query string.
    encoded_job_title = urllib.parse.quote(job_title)
    encoded_location = urllib.parse.quote(location)

    search_url = indeed_base_url + f"q={encoded_job_title}&l={encoded_location}"

    # Compare against None rather than truthiness: 0 is a value a caller may
    # legitimately pass, and a truthiness test would silently drop it.
    if radius is not None:
        search_url += f"&radius={urllib.parse.quote(str(radius))}"
    if fromage is not None:
        search_url += f"&fromage={urllib.parse.quote(str(fromage))}"
    return search_url

In [14]:
# Searches to run, one per entry, written as '<job query>#<location>#<N> miles'
# (three '#'-separated fields, which is what preprocess_links() unpacks).
search_params = [
    'Dental Practices#Boston, MA# 100 miles',
    'Dental Practices#Houston, TX# 100 miles',
    'Dental Practices#Greensboro, NC# 50 miles',
    'Dental Practices#High point, NC# 50 miles',
    'Dental Practices#Winston-Salem, NC# 50 miles',  # was misspelled "Wintson-Salem"
    'Dental Practices#Los Angeles, CA# 100 miles',
    'Dental Practices#Cleveland OH# 100 miles',
]

In [15]:
def preprocess_links(search_param, fromage=7):
    """
    Parse one '<job>#<location>#<N> miles' search string.

    Args:
      search_param: String with exactly three '#'-separated fields; the third
        field must start with an integer (e.g. ' 100 miles').
      fromage: Value returned unchanged as the last tuple element. Defaults to
        7, the value that used to be hard-coded here.

    Returns:
      A tuple ``(job, location, radius, fromage)`` where ``radius`` is an int.
      ``job`` and ``location`` are returned exactly as written in the input.

    Raises:
      ValueError: If the string does not have exactly three fields, or the
        third field is blank or does not start with an integer.
    """
    job, location, distance = search_param.split('#')

    # split() with no argument tolerates leading/trailing and repeated
    # whitespace (including tabs) around the number.
    distance_tokens = distance.split()
    if not distance_tokens:
        raise ValueError(f"missing radius in search parameter: {search_param!r}")
    radius = int(distance_tokens[0])

    return (job, location, radius, fromage)

In [16]:
# Parse every search string into the tuple returned by preprocess_links().
parsed_output = list(map(preprocess_links, search_params))

# One search URL per parsed tuple; the tuple is unpacked positionally into
# build_indeed_job_search_url(job_title, location, radius, fromage).
links = [build_indeed_job_search_url(*parsed) for parsed in parsed_output]

print(links)

In [11]:
def modify_job_link_id(original_url, new_id):
    """
    Return ``original_url`` with its trailing ``vjk`` parameter set to ``new_id``.

    If the last query parameter of ``original_url`` is ``vjk=...`` it is
    replaced; otherwise every existing parameter is kept and ``vjk`` is
    appended. URLs with no query string get ``?vjk=<new_id>``.

    Args:
        original_url: The original job link URL (string).
        new_id: The new ID to use in the modified link (string). It is appended
            verbatim, without percent-encoding.

    Returns:
        The modified job link URL with the new ID.

    Raises:
        TypeError: If ``new_id`` is not a string (e.g. ``None``).
    """
    # Separate the query string from everything before the first '?'.
    path, _, query = original_url.partition('?')
    params = query.split('&') if query else []

    # Only discard the trailing parameter when it really is the old job id;
    # dropping it unconditionally would lose unrelated parameters.
    if params and params[-1].startswith('vjk='):
        params.pop()

    params.append('vjk=' + new_id)
    return path + '?' + '&'.join(params)

In [17]:
def _text_or_empty(node):
    """Return the stripped text of a parsed node, or '' when the lookup found nothing."""
    return node.text.strip() if node is not None else ''


def get_data(job_listing, page_url=None):
    """
    Extract the fields of one job posting from a search-result card.

    Every field is best-effort: when an element is not found in the card the
    corresponding value is an empty string, so a single incomplete card does
    not abort the scrape.

    Args:
        job_listing: Parsed node for one result card (the scraping loop passes
            each ``div`` with class ``job_seen_beacon``).
        page_url: URL of the results page the card came from, used to build
            the job link. When omitted, the module-level ``current_url`` set by
            the scraping loop is used.

    Returns:
        A tuple ``(title, company, location, salary, job_type, date_posted,
        summary, link)`` of strings.
    """
    if page_url is None:
        # Fallback for existing one-argument callers; requires the scraping
        # loop to have assigned `current_url` first.
        page_url = current_url

    # NOTE(review): the class names below look auto-generated and may change
    # whenever the site is redeployed - confirm against current markup.
    anchor = job_listing.find('a')
    title = _text_or_empty(anchor.find('span') if anchor is not None else None)
    company = _text_or_empty(job_listing.find('span', class_='css-92r8pb eu4oa1w0'))
    location = _text_or_empty(job_listing.find('div', class_='css-1p0sjhy eu4oa1w0'))
    salary = _text_or_empty(
        job_listing.find('div', class_='metadata salary-snippet-container css-5zy3wz eu4oa1w0')
    )
    job_type = _text_or_empty(job_listing.find('div', class_='metadata css-5zy3wz eu4oa1w0'))
    # The raw text is kept without cleanup (saved output shows values such as
    # 'PostedPosted 3 days ago').
    date_posted = _text_or_empty(job_listing.find('span', class_='css-qvloho eu4oa1w0'))
    summary = _text_or_empty(job_listing.find('div', class_='css-9446fg eu4oa1w0'))

    # The job id is read from the first anchor's data-jk attribute; without it
    # no link can be built.
    link_id = anchor.get('data-jk') if anchor is not None else None
    link = modify_job_link_id(page_url, link_id) if link_id else ''

    # Return a tuple containing all the extracted information
    return (title, company, location, salary, job_type, date_posted, summary, link)

In [19]:
# Launch the browser used for this scrape. NOTE: this rebinds `browser`; a
# driver started by an earlier cell is not closed by this cell.
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options = chrome_options)
job_list = []

# Attributes identifying the "next page" pagination anchor. Both must match,
# because they are passed together as one attribute filter.
desired_attributes = {
    "aria-label": "Next Page",
    "data-testid": "pagination-page-next"
}

try:
    for link_number, job_search in enumerate(links, start=1):
        print(f'Scraping Link {link_number}')
        website = job_search
        page_number = 1
        # Guards against a pagination cycle turning into an endless loop.
        visited_pages = set()
        while website not in visited_pages:
            visited_pages.add(website)
            browser.get(website)

            # Parse the HTML of the loaded page using BeautifulSoup
            soup = BeautifulSoup(browser.page_source, 'html.parser')

            # get_data() reads this module-level name to build each job link,
            # so it must be assigned before the cards are processed.
            current_url = browser.current_url

            job_listings = [get_data(card) for card in soup.find_all('div', class_='job_seen_beacon')]
            job_list.extend(job_listings)

            # No "next" anchor (or no href on it) means this was the last page.
            next_page_anchor = soup.find('a', desired_attributes)
            next_href = next_page_anchor.get('href') if next_page_anchor is not None else None
            if not next_href:
                break

            # Resolve the (possibly relative) href against the current page
            # instead of embedding it in another query parameter.
            website = urllib.parse.urljoin(current_url, next_href)
            page_number += 1
            print(f'Scraping Page {page_number}')
            print(website)
        print(f'Scraped Link {link_number}')
finally:
    # Always release the browser process, even if scraping fails part-way.
    browser.quit()

# Convert list of records into a DataFrame
print('Converting to Dataframe')
df = pd.DataFrame(job_list, columns=['Title', 'Company', 'Location', 'Salary', 'Job Type', 'Date Posted', 'Summary', 'Job Link'])

# Save DataFrame to a CSV file; the message reuses the same name so it cannot
# drift from the file actually written.
output_csv = 'indeeds_jobs_data.csv'
df.to_csv(output_csv, index=False)

print(f"Data saved to {output_csv}")

In [20]:
# Reload the scraped postings from the CSV file and display the frame.
scraped_csv_path = 'indeeds_jobs_data.csv'
df = pd.read_csv(scraped_csv_path)
df

Unnamed: 0,Title,Company,Location,Salary,Job Type,Date Posted,Summary,Job Link
0,Production Worker - Purelink - 1st shift,Kayem Foods,"Chelsea, MA 02150",,,PostedPosted 3 days ago,SUMMARY: Responsible for producing product in ...,https://www.indeed.com/jobs?q=Dental%20Practic...
1,Employee Relations Officer,Boston College,"Chestnut Hill, MA 02467",,,PostedPosted 2 days ago,"Boston College Introduction Founded in 1863, B...",https://www.indeed.com/jobs?q=Dental%20Practic...
2,"Compliance Officer, Assistant Vice President, ...",State Street,"Boston, MA 02111 (Central area)",,,PostedPosted 1 day ago,Who we are looking for We are seeking to recru...,https://www.indeed.com/jobs?q=Dental%20Practic...
3,Patient Experience Representative (Call center...,Boston Children's Hospital,"Brookline, MA",,,PostedPosted 6 days ago,"At Boston Children’s Hospital, the quality of ...",https://www.indeed.com/jobs?q=Dental%20Practic...
4,Dental Assistant,South Boston Community Health Center,"Boston, MA 02127 (South Boston area)",,,PostedToday,"Date Posted: April 25, 2024 Job Summary The de...",https://www.indeed.com/jobs?q=Dental%20Practic...
...,...,...,...,...,...,...,...,...
4428,Machine Operator/Welding (2nd Shift),Welding Alloys USA,"New Castle, PA 16105",,,EmployerActive 2 days ago,Welding Alloys is a global leader in the produ...,https://www.indeed.com/jobs?q=Dental%20Practic...
4429,Certified Recovery Coach,Detroit Recovery Project,"Detroit, MI 48203 (Greenfield Park area)",,,PostedPosted 2 days ago,As a Certified Community Behavioral Health Cli...,https://www.indeed.com/jobs?q=Dental%20Practic...
4430,AIM Palliative RN,Empatia,"Clinton Township, MI 48038",,,PostedToday,Current RN license to practice in the state.\n...,https://www.indeed.com/jobs?q=Dental%20Practic...
4431,Medical Assistant,Dr. Rebecca D Lashbrook,"Meadville, PA 16335",,,PostedToday,The ideal candidate will provide support to me...,https://www.indeed.com/jobs?q=Dental%20Practic...


In [21]:
# Remove the Salary and Job Type columns (blank in every row of the preview
# shown earlier). errors='ignore' keeps this cell re-runnable: because `df` is
# rebound here, a second run would otherwise raise KeyError for the columns
# that are already gone.
df = df.drop(columns=['Salary', 'Job Type'], errors='ignore')
df.head()

Unnamed: 0,Title,Company,Location,Date Posted,Summary,Job Link
0,Production Worker - Purelink - 1st shift,Kayem Foods,"Chelsea, MA 02150",PostedPosted 3 days ago,SUMMARY: Responsible for producing product in ...,https://www.indeed.com/jobs?q=Dental%20Practic...
1,Employee Relations Officer,Boston College,"Chestnut Hill, MA 02467",PostedPosted 2 days ago,"Boston College Introduction Founded in 1863, B...",https://www.indeed.com/jobs?q=Dental%20Practic...
2,"Compliance Officer, Assistant Vice President, ...",State Street,"Boston, MA 02111 (Central area)",PostedPosted 1 day ago,Who we are looking for We are seeking to recru...,https://www.indeed.com/jobs?q=Dental%20Practic...
3,Patient Experience Representative (Call center...,Boston Children's Hospital,"Brookline, MA",PostedPosted 6 days ago,"At Boston Children’s Hospital, the quality of ...",https://www.indeed.com/jobs?q=Dental%20Practic...
4,Dental Assistant,South Boston Community Health Center,"Boston, MA 02127 (South Boston area)",PostedToday,"Date Posted: April 25, 2024 Job Summary The de...",https://www.indeed.com/jobs?q=Dental%20Practic...


In [22]:
# Persist the trimmed table. The index is written too (to_csv default), which
# is why the file is later read back with index_col=0.
cleaned_csv_path = 'cleaned job posting.csv'
df.to_csv(cleaned_csv_path)

In [33]:
# Read the cleaned postings back, using the file's first column as the index.
cleaned_csv_path = 'cleaned job posting.csv'
df = pd.read_csv(cleaned_csv_path, index_col=0)
df.head()

Unnamed: 0,Title,Company,Location,Date Posted,Summary,Job Link
0,Production Worker - Purelink - 1st shift,Kayem Foods,"Chelsea, MA 02150",PostedPosted 3 days ago,SUMMARY: Responsible for producing product in ...,https://www.indeed.com/jobs?q=Dental%20Practic...
1,Employee Relations Officer,Boston College,"Chestnut Hill, MA 02467",PostedPosted 2 days ago,"Boston College Introduction Founded in 1863, B...",https://www.indeed.com/jobs?q=Dental%20Practic...
2,"Compliance Officer, Assistant Vice President, ...",State Street,"Boston, MA 02111 (Central area)",PostedPosted 1 day ago,Who we are looking for We are seeking to recru...,https://www.indeed.com/jobs?q=Dental%20Practic...
3,Patient Experience Representative (Call center...,Boston Children's Hospital,"Brookline, MA",PostedPosted 6 days ago,"At Boston Children’s Hospital, the quality of ...",https://www.indeed.com/jobs?q=Dental%20Practic...
4,Dental Assistant,South Boston Community Health Center,"Boston, MA 02127 (South Boston area)",PostedToday,"Date Posted: April 25, 2024 Job Summary The de...",https://www.indeed.com/jobs?q=Dental%20Practic...


In [29]:
# Number of distinct job links; dropna=False counts a missing value as one
# distinct entry, matching len(<column>.unique()).
df['Job Link'].nunique(dropna=False)

4433