# Web Scraping - Indeed.com
General steps for Web Scraping
1. Check whether the website allows web scraping
2. Obtain the source code (HTML File) by using the website URL
3. Download the website content
4. Parse the content using the HTML tags and attributes of the elements of interest
5. Extract relevant data/features
6. Organize raw data in structured format (e.g., CSV)

### Install Firefox, Selenium, Gecko Driver, Beautiful Soup

In [None]:
#Install firefox
!apt-get update
!apt install firefox

#Install selenium
!pip install selenium

#Updating and installing firefox libraries
!apt-get update && apt-get install -y wget bzip2 libxtst6 libgtk-3-0 libx11-xcb-dev libdbus-glib-1-2 libxt6 libpci-dev && rm -rf /var/lib/apt/lists/*

#Installing Geck Driver
!wget https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
!tar -xvzf geckodriver*
!chmod +x geckodriver
!export PATH=$PATH:/path-to-extracted-file/.

#Instal beautifulsoup
!pip install beautifulsoup4

### Install undetected-chromedriver (UC)

In [1]:
%pip install selenium
%pip install beautifulsoup4
%pip install undetected-chromedriver

Collecting selenium
  Downloading selenium-4.16.0-py3-none-any.whl (10.0 MB)
     ---------------------------------------- 10.0/10.0 MB 8.2 MB/s eta 0:00:00
Collecting trio~=0.17
  Downloading trio-0.23.1-py3-none-any.whl (448 kB)
     -------------------------------------- 448.3/448.3 kB 7.0 MB/s eta 0:00:00
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.2.0-py3-none-any.whl (16 kB)
Collecting outcome
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting sniffio>=1.3.0
  Using cached sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ---------------------------------------- 58.3/58.3 kB 3.0 MB/s eta 0:00:00
Installing collected packages: sniffio, outcome, h11, exceptiongroup, wsproto, trio, trio-websocket, selen

### Import Dependencies

In [2]:
import selenium.webdriver as webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options as FirefoxOptions

import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By

import random
import time

import undetected_chromedriver as uc

### Define Position and Location

In [3]:
## Job position to search for ('+' separates the words in the query string)
position = 'data+scientist'
## Where to search: "City, State", a zip code, or "remote"
locations = 'united+states'

def get_url(position, location):
    """Build the Indeed search URL for a job position and a location.

    Both values are URL-encoded before being placed in the query string, so
    plain text such as "data scientist" or "New York, NY" yields a valid URL.
    '+' and '%' are left untouched, which means values that are already
    encoded (e.g. "data+scientist") pass through unchanged.

    Args:
        position: Job title or keywords to search for.
        location: City/state, zip code, or "remote".

    Returns:
        The search URL as a string.
    """
    from urllib.parse import quote_plus

    url_template = "https://www.indeed.com/jobs?q={}&l={}"
    # Without encoding, a space, '&' or '#' in either value would corrupt the
    # query string (e.g. "R&D" would be split into a second parameter).
    url = url_template.format(
        quote_plus(str(position), safe="+%"),
        quote_plus(str(location), safe="+%"),
    )
    return url

# Empty results table; the scraping cell adds one row per job posting
dataframe = pd.DataFrame(
    columns=[
        "Title", "Company", "Location", "Rating",
        "Date", "Salary", "Description", "Links",
    ]
)

# Search URL for the position/location chosen above, echoed as a sanity check
url = get_url(position, locations)
print(url)

https://www.indeed.com/jobs?q=data+scientist&l=united+states


### Set Path to Webdriver

In [5]:
# legacy: geckodriver paths (apparently left over from the Firefox-based setup)
driver_path = '/content/geckodriver'
firefox_driver_path = '/content/geckodriver'

# pool of user-agent strings; one is drawn at random for this session
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.1; rv:109.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/120.0',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/120.0 Mobile/15E148 Safari/605.1.15',
    'Mozilla/5.0 (iPad; CPU OS 14_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/120.0 Mobile/15E148 Safari/605.1.15',
]
random_user_agent = random.choice(user_agents)

# command-line switches passed to the Chrome driver
options = [
    '--headless',
    '--no-sandbox',
    f'--user-agent={random_user_agent}',
    '--disable-blink-features=AutomationControlled',
]
chrome_options = uc.ChromeOptions()
for option in options:
    chrome_options.add_argument(option)

# launch undetected-chromedriver with version_main=119 and the options above
driver = uc.Chrome(version_main=119, options=chrome_options)

### Scrape Job Postings

In [6]:
## Number of postings to scrape
postings = 10


def _first_text(soup, css_selector, default='NaN'):
    """Return the stripped text of the first element matching css_selector.

    Falls back to `default` when nothing matches, so one card with a missing
    field cannot abort the whole scrape.
    """
    node = soup.select_one(css_selector)
    if node is None:
        return default
    return node.get_text().strip()


jn = 0    # running count of postings processed
res = []  # raw HTML of every results page visited

# The implicit wait is a session-wide driver setting, so set it once up front.
driver.implicitly_wait(3)

# Results are requested in steps of 10 via the "start" query parameter.
for i in range(0, postings, 10):
    driver.get(url + "&start=" + str(i))
    res.append(driver.page_source)

    # Each job card on the results page carries the 'job_seen_beacon' class.
    jobs = driver.find_elements(By.CLASS_NAME, 'job_seen_beacon')

    # Rows are collected per page and appended in one concat; concatenating
    # per row copies the whole table every time.
    page_rows = []
    for job in jobs:
        result_html = job.get_attribute('innerHTML')
        soup = BeautifulSoup(result_html, 'html.parser')

        jn += 1

        # The first anchor inside the card is used as the posting link.
        liens = job.find_elements(By.TAG_NAME, "a")
        links = liens[0].get_attribute("href") if liens else 'NaN'

        title = _first_text(soup, '.jobTitle')
        print(title)

        company = _first_text(soup, '[data-testid="company-name"]')
        print(company)

        location = _first_text(soup, '[data-testid="text-location"]')
        print(location)

        salary = _first_text(soup, '.salary-snippet-container')
        rating = _first_text(soup, '.ratingNumber')
        date = _first_text(soup, '.date')
        description = _first_text(soup, '.job-snippet', default='')

        page_rows.append({'Title': title,
                          "Company": company,
                          'Location': location,
                          'Rating': rating,
                          'Date': date,
                          "Salary": salary,
                          "Description": description,
                          "Links": links})
        print("Job number {0:4d} added - {1:s}".format(jn, title))

    # Skip the concat when the page produced no cards.
    if page_rows:
        dataframe = pd.concat([dataframe, pd.DataFrame(page_rows)], ignore_index=True)

Data Scientist
ConnectiveRx
Pittsburgh, PA 15275
Job number    1 added - Data Scientist
Data Scientist
OpenRoad Lending
Fort Worth, TX 76137 (Far North area)
Job number    2 added - Data Scientist
Data Scientist (L5) - Member Product
Netflix
Remote
Job number    3 added - Data Scientist (L5) - Member Product
Data Scientist Specialist/Bioinformatics Specialist
3M
Maplewood, MN
Job number    4 added - Data Scientist Specialist/Bioinformatics Specialist
Jr. Data Scientist
ClimateAI
Remote
Job number    5 added - Jr. Data Scientist
E-commerce Data Scientist - Health and Wellness - California REMOTE
Stingray Direct
Remote in California
Job number    6 added - E-commerce Data Scientist - Health and Wellness - California REMOTE
2024 Business Intelligence Analyst & Data Scientist - New College Graduate Opportunity!
Applied Materials
Austin, TX 78724
Job number    7 added - 2024 Business Intelligence Analyst & Data Scientist - New College Graduate Opportunity!
Data Scientist
INTEL
Phoenix, AZ
J

In [7]:
# Dump the raw HTML captured for each results page (very long output)
print(res)
# Preview the first rows of the scraped table
dataframe.head()

['<html dir="ltr" lang="en" class="js-focus-visible" data-js-focus-visible=""><head>\n    <link rel="shortcut icon" href="/images/favicon.ico">\n    <title>Data Scientist Jobs, Employment in United States | Indeed.com</title>\n    <meta http-equiv="content-type" content="text/html; charset=utf-8">\n    <meta name="description" content="10,188 Data Scientist jobs available in United States on Indeed.com. Apply to Data Scientist, Business Intelligence Analyst, E-commerce Specialist and more!">\n    <meta name="referrer" content="origin-when-cross-origin">\n    <link rel="next" href="/jobs?q=data+scientist&amp;l=united+states&amp;jlid=dd616958bd9ddc12&amp;forceLocation=-1&amp;start=10">\n    <link rel="canonical" href="https://www.indeed.com/q-data-scientist-jobs.html">\n    <link rel="alternate" href="android-app://com.indeed.android.jobsearch/https/www.indeed.com/m/jobs?q=data+scientist&amp;l=united+states&amp;start=0">\n    <link rel="alternate" href="ios-app://https/www.indeed.com/m/j

Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links
0,Data Scientist,ConnectiveRx,"Pittsburgh, PA 15275",,PostedPosted 5 days ago,,Candidates must be able to demonstrate aptitud...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,Data Scientist,OpenRoad Lending,"Fort Worth, TX 76137 (Far North area)",,EmployerActive 7 days ago,"$80,000 - $180,000 a year",Excellent communication skills and the ability...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,Data Scientist (L5) - Member Product,Netflix,Remote,,PostedPosted 8 days ago,,"You’ll work with other data scientists, data a...",https://www.indeed.com/rc/clk?jk=9776597e1463c...
3,Data Scientist Specialist/Bioinformatics Speci...,3M,"Maplewood, MN",,PostedPosted 30+ days ago •Many applications i...,,Interface with data scientists from across the...,https://www.indeed.com/rc/clk?jk=b828ccfd3e115...
4,Jr. Data Scientist,ClimateAI,Remote,,PostedPosted 30+ days ago •Many applications i...,,Background in data analysis and machine learni...,https://www.indeed.com/rc/clk?jk=f420fd336c4b8...


### Scrape Full Job Descriptions

In [8]:
# Posting URLs from the 'Links' column, in table order
Links_list = list(dataframe['Links'])

In [9]:
# Visit every posting link and collect the full job description text.
descriptions = []
for link in Links_list:
    # Keep exactly one entry per posting, even when there is no usable link,
    # so the list still lines up with the dataframe rows.
    if not isinstance(link, str) or not link.startswith('http'):
        descriptions.append('')
        continue

    driver.get(link)
    driver.implicitly_wait(random.randint(3, 8))

    # find_elements returns an empty list instead of raising when the page
    # has no description block, so one bad page no longer aborts the loop
    # and discards everything collected so far.
    matches = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')
    jd = matches[0].text if matches else ''
    descriptions.append(jd)

    # Random pause between page loads.
    time.sleep(random.randint(5, 10))

dataframe['Descriptions'] = descriptions

### Save Results

In [10]:
# Write the table to "<YYYY-MM-DD>_<position>_<locations>.csv" without the index column
date = datetime.today().strftime('%Y-%m-%d')
dataframe.to_csv(f"{date}_{position}_{locations}.csv", index=False)

In [11]:
# Preview the first rows of the table, now including the 'Descriptions' column
dataframe.head()

Unnamed: 0,Title,Company,Location,Rating,Date,Salary,Description,Links,Descriptions
0,Data Scientist,ConnectiveRx,"Pittsburgh, PA 15275",,PostedPosted 5 days ago,,Candidates must be able to demonstrate aptitud...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,"ConnectiveRx is a leading, technology-enabled ..."
1,Data Scientist,OpenRoad Lending,"Fort Worth, TX 76137 (Far North area)",,EmployerActive 7 days ago,"$80,000 - $180,000 a year",Excellent communication skills and the ability...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,OpenRoad Lending is a leading provider of auto...
2,Data Scientist (L5) - Member Product,Netflix,Remote,,PostedPosted 8 days ago,,"You’ll work with other data scientists, data a...",https://www.indeed.com/rc/clk?jk=9776597e1463c...,"Remote, United States\nData Science and Engine..."
3,Data Scientist Specialist/Bioinformatics Speci...,3M,"Maplewood, MN",,PostedPosted 30+ days ago •Many applications i...,,Interface with data scientists from across the...,https://www.indeed.com/rc/clk?jk=b828ccfd3e115...,Job Description:\nData Scientist Specialist/Bi...
4,Jr. Data Scientist,ClimateAI,Remote,,PostedPosted 30+ days ago •Many applications i...,,Background in data analysis and machine learni...,https://www.indeed.com/rc/clk?jk=f420fd336c4b8...,Culture\nAt ClimateAi we are driven by a unite...
