### Web Scraping
This is a web scraper I built using Selenium and Beautiful Soup to scrape information from LinkedIn. From what I read online, LinkedIn's API is pretty restrictive in terms of the amount of information it allows you to obtain. Hence, I opted to scrape the information myself. Anyway, I wanted to practice building a scraper too :) 

I collected the following information:
- Job title
- Company name
- Country
- When the job opening was posted
- Number of applicants
- Job hyperlink
- Job description

Scraping LinkedIn has its challenges too. LinkedIn, like many other websites, uses *infinite scroll*: instead of clicking a button to navigate to the next page, you have to scroll to the bottom of the current page and wait for the next page to be *appended* to the existing page. This was a slight inconvenience, but it was overcome with the *scroll* function. What could not be overcome, however, was LinkedIn's anti-scraping feature, which allows a user to view only the first 1000 jobs of a search. That was alright for me, though, as I felt that 1000 results per search was sufficient.

#### Import libraries

In [1]:
import csv
from time import sleep
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup as soup
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.common.keys import Keys
# chromedriver.exe must be in the same folder

In [None]:
## ONLY RUN IF CHROMEDRIVER IS OUTDATED ##
# Optional cell: builds a Chrome driver from the path returned by
# webdriver_manager instead of relying on a chromedriver.exe in this folder.
# NOTE(review): ChromeDriverManager().install() presumably downloads a driver
# matching the installed Chrome and returns its path -- confirm against the
# webdriver_manager documentation.
# The resulting `driver` object is not passed to linkedin_scraper, which
# creates its own webdriver.Chrome() instance.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(ChromeDriverManager().install())

In [2]:
def scroll(browser, timeout):
    """Scroll an infinite-scroll page down until it stops growing.

    Each pass scrolls to the bottom of the document, waits ``timeout``
    seconds for new content, and makes a best-effort click on the button
    under ``#main-content`` (the "see more jobs" button) when one is
    present. The loop ends as soon as ``document.body.scrollHeight`` is the
    same before and after a pass.

    Args:
        browser: A Selenium WebDriver (any object exposing
            ``execute_script`` and ``find_element``).
        timeout: Seconds to sleep after each scroll and after each
            successful button click; increase on slow machines/connections.

    Returns:
        None.
    """
    see_more_xpath = '//*[@id="main-content"]/div/section/button'
    read_height = 'return document.body.scrollHeight'

    last_height = browser.execute_script(read_height)

    while True:
        # Jump to the bottom so the page appends the next batch of results.
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(timeout)

        try:
            # find_element('xpath', ...) works on Selenium 3 and 4 alike;
            # the find_element_by_* helpers were removed in Selenium 4.3.
            browser.find_element('xpath', see_more_xpath).click()
        except Exception:
            # Button missing or not clickable: that is the normal case while
            # plain infinite scroll is still loading results. `Exception`
            # (not a bare except) lets Ctrl-C / SystemExit stop the loop.
            pass
        else:
            # Give the content requested by the click time to load.
            sleep(timeout)

        new_height = browser.execute_script(read_height)
        if new_height == last_height:
            break

        last_height = new_height

# Scrape Linkedin | Create CSV File

In [3]:
_CSV_HEADER = [
    'Job Title',
    'Company Name',
    'Country',
    'When Posted',
    'No. of Applicants',
    'Hyperlink',
    'Job Description',
]


def _first_text(page, selectors, default=None):
    """Return the text of the first element matched by any of ``selectors``.

    ``selectors`` is a sequence of ``(tag, attrs)`` pairs tried in order.
    Commas in the text are replaced by '|' (the convention this scraper has
    always used for its CSV fields). When nothing matches, ``default`` is
    returned; if ``default`` is None the field is mandatory and LookupError
    is raised instead.
    """
    for tag, attrs in selectors:
        node = page.find(tag, attrs)
        if node is not None:
            return node.text.replace(',', '|')
    if default is None:
        raise LookupError('no element matched any of %r' % (selectors,))
    return default


def _parse_job(page, link):
    """Build one CSV row (list of 7 strings) from a parsed job page."""
    # Title and posting age are optional and fall back to 'nil'.
    job_title = _first_text(
        page, [('h1', {'class': 'topcard__title'})], default='nil')
    coy_name = _first_text(page, [
        ('a', {'data-tracking-control-name': 'public_jobs_topcard_org_name'}),
        ('span', {'class': 'topcard__flavor'}),
    ])
    country = _first_text(
        page, [('span', {'class': 'topcard__flavor topcard__flavor--bullet'})])
    when_posted = _first_text(
        page,
        [('span', {'class': 'topcard__flavor--metadata posted-time-ago__text'})],
        default='nil')
    num_applicants = _first_text(page, [
        ('span', {'class': 'topcard__flavor--metadata topcard__flavor--bullet num-applicants__caption'}),
        ('figcaption', {'class': 'num-applicants__caption'}),
    ])

    # The description is stored as raw HTML with commas stripped; when
    # neither container exists this is the literal string 'None'.
    description = page.find('div', {'class': 'show-more-less-html__markup'})
    if description is None:
        description = page.find(
            'div', {'class': 'description__text description__text--rich'})
    job_desc = str(description).replace(',', '')

    return [job_title, coy_name, country, when_posted, num_applicants,
            link, job_desc]


def linkedin_scraper(jobs):
    """Scrape LinkedIn job postings for each search term into a CSV file.

    For every term in ``jobs`` a Chrome window is opened on
    https://www.linkedin.com/jobs, the term is typed into the search bar,
    the result list is expanded with ``scroll``, and every linked job page is
    visited. One file named ``linkedin_<term>.csv`` is written per term with
    the columns: Job Title, Company Name, Country, When Posted,
    No. of Applicants, Hyperlink, Job Description.

    Per-job behaviour:
      * a job page that fails to load is skipped (no row written);
      * a missing title or posting age is written as 'nil';
      * any other failure writes a row whose seven fields are all 'error'.

    Rows are written with the ``csv`` module so quotes and newlines inside
    the description HTML cannot break the file; the browser is always closed
    and the file always flushed, even if scraping aborts midway.

    Args:
        jobs: Iterable of search-term strings.

    Returns:
        None. Results are written to disk; progress is printed to stdout.
    """
    url = 'https://www.linkedin.com/jobs'
    job_search_bar_xpath = '/html/body/main/section[1]/section/div[2]/section[2]/form/section[1]/input'
    show_more_button_xpath = '/html/body/main/section[1]/section[3]/div/section/button[1]'

    for search_this_job in jobs:
        browser = webdriver.Chrome()
        try:
            # Open LinkedIn and submit the search term.
            browser.get(url)
            sleep(1)

            # find_element('xpath', ...) works on Selenium 3 and 4 alike;
            # the find_element_by_* helpers were removed in Selenium 4.3.
            job_search_bar = browser.find_element('xpath', job_search_bar_xpath)
            job_search_bar.click()
            job_search_bar.send_keys(search_this_job)
            job_search_bar.send_keys(Keys.ENTER)
            sleep(1)

            # Load the whole result list. The second argument is the number
            # of seconds to wait per scroll; increase on a slow machine.
            scroll(browser, 3)

            # Collect the hyperlink of every result card.
            main_soup = soup(browser.page_source, 'lxml')
            cards = main_soup.find_all('a', {'class': 'result-card__full-card-link'})
            links = [card['href'] for card in cards]

            filename = 'linkedin_' + search_this_job + '.csv'
            # newline='' is required by the csv module to avoid doubled
            # line endings on Windows.
            with open(filename, 'w', encoding='utf-8', newline='') as f:
                writer = csv.writer(f)
                writer.writerow(_CSV_HEADER)

                for num, link in enumerate(links, start=1):
                    print(f'now at job number: {num} of {len(links)} ...')

                    try:
                        browser.get(link)
                    except Exception:
                        # Page could not be loaded: skip this job entirely.
                        continue

                    try:
                        # Best effort: expand the description if the
                        # "show more" button exists on this page.
                        try:
                            browser.find_element('xpath', show_more_button_xpath).click()
                        except Exception:
                            pass

                        # Parse AFTER the click so the expanded page is what
                        # gets scraped.
                        current_soup = soup(browser.page_source, 'lxml')
                        row = _parse_job(current_soup, link)
                    except Exception:
                        print('AN ERROR WITH THIS JOB OCCURRED, CONTINUING TO NEXT JOB')
                        row = ['error'] * len(_CSV_HEADER)

                    writer.writerow(row)
        finally:
            # Close the Chrome window/driver process for this search term.
            browser.quit()

#### Input job searches to scrape

In [4]:
# Search terms to scrape. linkedin_scraper writes one file per entry, named
# 'linkedin_<term>.csv', and prints a progress line for each job visited.
search_these_jobs = ['data science']
linkedin_scraper(search_these_jobs)

now at job number: 1 of 25 ...
now at job number: 2 of 25 ...
now at job number: 3 of 25 ...
now at job number: 4 of 25 ...
now at job number: 5 of 25 ...
now at job number: 6 of 25 ...
now at job number: 7 of 25 ...
now at job number: 8 of 25 ...
now at job number: 9 of 25 ...
now at job number: 10 of 25 ...
now at job number: 11 of 25 ...
now at job number: 12 of 25 ...
now at job number: 13 of 25 ...
now at job number: 14 of 25 ...
now at job number: 15 of 25 ...
now at job number: 16 of 25 ...
now at job number: 17 of 25 ...
now at job number: 18 of 25 ...
now at job number: 19 of 25 ...
now at job number: 20 of 25 ...
now at job number: 21 of 25 ...
now at job number: 22 of 25 ...
now at job number: 23 of 25 ...
now at job number: 24 of 25 ...
now at job number: 25 of 25 ...
