# LinkedIn Web Crawler

In [1]:
# Extra Dependencies
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.by import By
# from selenium.webdriver.common.action_chains import ActionChains
# # from urllib.parse import urljoin


# Import Dependencies
from selenium import webdriver
import time
from time import sleep
from bs4 import BeautifulSoup
import requests
import os
from config import my_directory

# Create the Selenium WebDriver object for Chrome; the later cells use `driver`
# to load the search page, scroll it, and read its rendered HTML.
driver = webdriver.Chrome()

## Change the `keywords` variable to search for the job you want.
`%20` is the URL-encoded form of a space.

In [2]:
##### Web scraper for an infinite-scrolling page #####
# Search keywords, already URL-encoded (%20 stands for a space).
keywords = 'data%20science'
url = f'https://www.linkedin.com/jobs/search/?currentJobId=3472306869&geoId=103644278&keywords={keywords}&location=United%20States&refresh=true'
driver.get(url)
# time.sleep(2)  # Allow 2 seconds for the web page to open
scroll_pause_time = 2  # Seconds to wait after each scroll so new results can load; tune for your machine
screen_height = driver.execute_script("return window.screen.height;")  # height of the screen, in pixels
i = 1  # number of screen heights to scroll to on the next step

while True:
    # Scroll down to i screen heights from the top of the page.
    driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
    i += 1
    time.sleep(scroll_pause_time)
    # Re-read the total scroll height after every scroll, because the page
    # grows as more results are loaded.
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # Stop once the next scroll target lies beyond the end of the page.
    if screen_height * i > scroll_height:
        break

##### Extract LinkedIn URLs #####
urls = []
soup = BeautifulSoup(driver.page_source, "html.parser")
for card in soup.find_all('div', class_='base-card'):
    # A card without an anchor (or an anchor without an href) is skipped
    # instead of raising and discarding everything collected so far.
    anchor = card.a
    href = anchor.get('href') if anchor is not None else None
    if href:
        urls.append(href)

In [3]:
# Number of job URLs collected into `urls`
len(urls)

173

In [4]:
# Preview only the first few URLs; displaying the whole list floods the notebook output.
urls[:5]

['https://www.linkedin.com/jobs/view/internship-data-analytics-at-bath-body-works-3803014922?refId=fCEr%2BP%2BqDX3zYSjvUR6OMg%3D%3D&trackingId=EH1mft77OTBOWim3MB1ktg%3D%3D&position=1&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/market-data-analyst-intern-at-e-l-f-beauty-3804185298?refId=fCEr%2BP%2BqDX3zYSjvUR6OMg%3D%3D&trackingId=pyazjplm9ySFadb484gu%2BQ%3D%3D&position=2&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/junior-data-scientist-at-team-remotely-inc-3817705793?refId=fCEr%2BP%2BqDX3zYSjvUR6OMg%3D%3D&trackingId=1yXQq4yfbzOh6GzIi57tfQ%3D%3D&position=3&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/internship-data-science-at-bath-body-works-3803019329?refId=fCEr%2BP%2BqDX3zYSjvUR6OMg%3D%3D&trackingId=kQGIkqM3%2F56bQfrqKOKFbA%3D%3D&position=4&pageNum=0&trk=public_jobs_jserp-result_search-card',
 'https://www.linkedin.com/jobs/view/data-scientist-intern

In [5]:
# True when at least one URL occurs more than once (a set keeps only unique entries)
len(set(urls)) < len(urls)

False

In [6]:
# Print every collected job URL that contains the substring "meta".
# A dedicated loop name is used so the scroll counter `i` is not overwritten.
for job_url in urls:
    if "meta" in job_url:
        print(job_url)

https://www.linkedin.com/jobs/view/data-scientist-small-business-group-at-meta-3816287761?refId=dUCpBILZIlCgy%2BAW4GhPnQ%3D%3D&trackingId=ndgcOi6oXNYUFzOu0Jz59Q%3D%3D&position=1&pageNum=6&trk=public_jobs_jserp-result_search-card


## Set the `my_directory` variable directly, or create a `config.py` file that defines `my_directory` so it can be imported

In [7]:
# my_directory = "C:/<example_Desktop>/<example_directory>"

# Make the configured base directory the current working directory.
os.chdir(my_directory)

# Folder that receives one text file per scraped job.
SCRAPED_JOBS_PATH = os.path.join(my_directory, "jobs")

# Whether the folder already existed before this cell ran.
SCRAPED_JOBS_PATH_EXIST = os.path.exists(SCRAPED_JOBS_PATH)

# exist_ok=True makes the cell safe to re-run and avoids the race between a
# separate "exists?" check and the directory creation.
os.makedirs(SCRAPED_JOBS_PATH, exist_ok=True)

## Loop through the `urls` list to create a job description file for each job

In [8]:
# Characters that are rejected in file names on at least one major operating system.
INVALID_FILE_NAME_CHARS = '<>:"/\\|?*'
# Seconds to wait for a job page before giving up on it.
REQUEST_TIMEOUT = 30


def _safe_file_name(position_title, company):
    """Return ``<title>-<company>.txt`` with spaces removed and unsafe characters replaced by ``_``."""
    raw_name = f"{position_title.replace(' ', '')}-{company.replace(' ', '')}"
    cleaned = "".join(
        "_" if (ch in INVALID_FILE_NAME_CHARS or ord(ch) < 32) else ch
        for ch in raw_name
    )
    return cleaned + ".txt"


# Download every job page and store its title, company, description and URL
# in one text file inside SCRAPED_JOBS_PATH. Jobs whose file already exists
# are left untouched.
for job_url in urls:
    try:
        page = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as error:
        # Report the failure and move on instead of aborting the whole run.
        print(f"Skipping {job_url}: request failed ({error})")
        continue
    finally:
        # Pause between requests whether or not the request succeeded.
        time.sleep(2)

    job_soup = BeautifulSoup(page.text, 'lxml')

    for info in job_soup.find_all('div', class_='top-card-layout__entity-info'):
        try:
            position_title = info.h1.text.strip()
            company = info.h4.div.span.a.text.strip()
        except AttributeError:
            # One of the expected tags is missing, so a lookup returned None.
            print(f"Skipping {job_url}: unexpected top-card layout")
            continue

        file_path = os.path.join(SCRAPED_JOBS_PATH, _safe_file_name(position_title, company))
        if os.path.exists(file_path):
            continue

        # Collect the description before opening the file so that a parsing
        # problem cannot leave a half-written file behind. Each section is
        # searched for its own markup div.
        description_parts = []
        for section in job_soup.find_all('section', class_='show-more-less-html'):
            markup = section.find('div', class_='show-more-less-html__markup')
            if markup is not None:
                description_parts.append(markup.text)

        try:
            with open(file_path, "w", encoding="utf-8") as job_file:
                job_file.write("Position title: " + position_title + "\n\n")
                job_file.write("Company: " + company + "\n\n")
                for part in description_parts:
                    job_file.write(part + "\n")
                job_file.write("Job URL: " + job_url)
        except OSError as error:
            # FileNotFoundError is a subclass of OSError, so it is covered here.
            print(f"Could not write {file_path}: {error}")

In [9]:
# quit() ends the whole WebDriver session (all windows and the driver process);
# close() would only close the current window.
driver.quit()