### Guides Used:
Initial tutorial: https://maoviola.medium.com/a-complete-guide-to-web-scraping-linkedin-job-postings-ad290fcaa97f

For login: https://www.geeksforgeeks.org/scrape-linkedin-using-selenium-and-beautiful-soup-in-python/

The scraper runs first by starting a webdriver instance. Then, it'll navigate to the LinkedIn login page and log in. The rest of the scrape is set up in the 2 functions below. Empty lists are initialized globally so they can be accessed by all cells of the notebook. The scraper loop function takes in a LinkedIn job search url, a starting page number, and an ending page number. It loads the starting url, navigates to the bottom of the page, and presses the button corresponding to the starting page number. Then, it calls the scrape function, which grabs a list of all jobs on the page and navigates through them one by one. It clicks on the job card to pull up the more detailed window with the job description, and scrapes the information from it. Then, it navigates to the next job in the list and repeats the process. Once it has scraped all jobs in the list, it returns to the scraper loop function, which navigates to the next page and calls the scrape function again. Once the loop detects it is on the same page as the given ending page number, it'll call the scraper one last time and end the loop. A list of the current datetime is then concatenated with the rest of the job lists into a pandas dataframe, which is exported into a csv file. The csv files are then re-imported into the project and concatenated into one giant pandas dataframe. This dataframe is given basic formatting and checked to verify that all the cells look correct. Then, it's given advanced formatting, where select columns are split using regex into multiple columns and other select columns are formatted with True/False values. Then, the description is analyzed using spaCy into keywords, namely key skills, which are then one-hot encoded. Each listing is then manually rated from 1 to 3 based on how relevant and promising a listing it is. The whole dataframe is then split into training and test sets. The training set is used to train a logistic regression model, verified with the test set. 
New data is then obtained through the scraper and passed through the cleaning and transformation pipeline before being sent through the regression model and categorized by relevance. A final function returns most recent and highest-rated listings.

In [None]:
#pip install selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np
import math

In [None]:
#start a Chrome browser session; the login and scraping cells below drive the page through this `wd` handle
wd = webdriver.Chrome()

#### Logging in

In [None]:
#open the LinkedIn sign-in page, type the credentials into the form, and submit it
login_url = "https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin"
print("logging in")
wd.get(login_url)
#fixed pause before looking up the form fields (presumably to let the page finish rendering)
time.sleep(2)
#replace the two placeholder strings below with real credentials before running this cell
user = wd.find_element(By.ID, "username")
user.send_keys("****YOUR USERNAME HERE****")
passw = wd.find_element(By.ID, "password")
passw.send_keys("****YOUR PASSWORD HERE****")
#click the form's submit button to log in
wd.find_element(By.XPATH, "//button[@type='submit']").click()

### Scraping the jobs
Only 25 jobs load per page, so we need to scroll to the bottom of the page to load all 25, scrape them, and move to the next page.

In [None]:
#one global list per scraped field, shared by every function and cell;
#the generator hands each name its own, separate empty list
(
    ids,
    date_posted,
    date_scraped,
    title,
    company,
    location,
    promoted_easyapply,
    emp_info,
    co_info,
    connections_alumni,
    link,
    descr,
    poster_name,
    poster_link,
    job_title_category,
) = ([] for _ in range(15))

def initialize(confirm=None):
    """Reset every global result list to a fresh empty list.

    Used to start over after testing, errors, or before a new scrape.

    Args:
        confirm: True clears without asking, False leaves the lists alone.
            When None (the default) the user is prompted, and only an
            explicit "y"/"yes" clears the lists.
    """
    global ids, date_posted, date_scraped, title, company, location, promoted_easyapply, emp_info, co_info, connections_alumni, link, descr, poster_name, poster_link, job_title_category

    if confirm is None:
        answ = input("Are you sure you want to empty all lists? Y/N")
        #wiping scraped data is destructive, so anything other than a clear
        #yes (empty input, typos, "n") keeps the lists
        confirm = answ.strip().lower() in ("y", "yes")

    if not confirm:
        print("NOT reinitializing lists")
        return

    print("reinitializing lists")
    #rebind (rather than .clear()) so lists that were aliased to each other
    #in an earlier cell become independent again
    ids = []
    date_posted = []
    date_scraped = []
    title = []
    company = []
    location = []
    promoted_easyapply = []
    emp_info = []
    co_info = []
    connections_alumni = []
    link = []
    descr = []
    poster_name = []
    poster_link = []
    job_title_category = []

In [None]:
def scraper_loop(start_url, start_pg=1, end_pg=5):
    """Scrape result pages start_pg through end_pg of a LinkedIn job search.

    Loads start_url in the shared `wd` browser, reads the total result count
    to cap end_pg at the last page that exists, then for every page clicks its
    pagination button, scrolls so the job cards load, and passes the cards to
    scrape(), which appends to the global result lists.

    Args:
        start_url: LinkedIn job search url to open first.
        start_pg: first page number to scrape (1-based).
        end_pg: last page number to scrape; lowered to the number of pages
            that the result count implies.

    Raises:
        ValueError: if no job count can be read from the results subtitle.
    """
    import re  #local import so this cell works without editing the import cell

    print("loading starting page")
    wd.get(start_url)
    time.sleep(5)

    subtitle = wd.find_element(By.CLASS_NAME, "jobs-search-results-list__subtitle").get_attribute("innerText") or ""
    #pull the number out of the text instead of slicing off a fixed-length
    #" results" suffix, which breaks on e.g. "1 result"
    count_match = re.search(r"\d[\d,]*", subtitle)
    if count_match is None:
        raise ValueError("could not read a job count from " + repr(subtitle))
    numJobs = int(count_match.group().replace(",", ""))
    print("found "+str(numJobs)+" jobs posted in the last 24 hours")

    #25 jobs are listed per page
    max_pgs = math.ceil(numJobs/25)
    end_pg = min(max_pgs, end_pg)

    #195 is the per-page time budget in seconds (presumably the sum of the
    #sleeps below and in scrape()); never report a negative estimate
    total_scrape_time = max(0, (end_pg - start_pg) + 1) * 195
    print("maximum scrape time: roughly "+str(total_scrape_time)+" seconds")

    for curr_pg in range(start_pg, end_pg + 1):
        print("loading page "+str(curr_pg)+" of "+str(end_pg))
        pages = wd.find_element(By.CLASS_NAME, "artdeco-pagination__pages.artdeco-pagination__pages--number").find_elements(By.CSS_SELECTOR, "button")

        #pages 1-9 are clicked by their own position; from page 10 on the
        #button at index 6 is used (presumably the paginator collapses to a
        #fixed window there - TODO confirm against the live page)
        button_index = 6 if curr_pg >= 10 else curr_pg - 1
        pages[button_index].click()
        time.sleep(5)

        print("scrolling to bottom of page")
        for _ in range(5):
            wd.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(2)

        jobs = wd.find_element(By.CLASS_NAME, "jobs-search-results-list").find_elements(By.CLASS_NAME, "jobs-search-results__list-item")
        print("found "+str(len(jobs))+" of maximum 25 jobs")

        print("scraping...")
        scrape(jobs)

    #done with the loop and the scrape!
    print("DONE!!")
        
    
def _scrape_insights():
    """Read the employment, company and connections insights of the open job.

    Returns a (emp_info, co_info, connections_alumni) tuple. Each insight is
    matched to its own icon type ("job", "company", "people"); a missing one
    is np.nan. If the insights cannot be read at all (sometimes li-icon is
    not found) all three are np.nan, to be filled in manually when cleaning.
    """
    try:
        first_by_icon = {}
        for insight in wd.find_elements(By.CLASS_NAME, "jobs-unified-top-card__job-insight"):
            icon = insight.find_element(By.CSS_SELECTOR, "li-icon").get_attribute("type")
            #keep the first insight of each icon type
            first_by_icon.setdefault(icon, insight)

        emp = first_by_icon["job"].get_attribute("innerText") if "job" in first_by_icon else np.nan
        co = first_by_icon["company"].get_attribute("innerText") if "company" in first_by_icon else np.nan
        if "people" in first_by_icon:
            ppl = first_by_icon["people"]
            people = [ppl.get_attribute("innerText"), [a.get_attribute("href") for a in ppl.find_elements(By.CLASS_NAME, "app-aware-link")]]
        else:
            people = np.nan
        return emp, co, people
    except Exception:
        #best effort: Exception (not a bare except) so Ctrl-C still stops the scrape
        return np.nan, np.nan, np.nan


def _scrape_hirer():
    """Return (name, profile link) of the job poster, or (np.nan, np.nan) if not shown."""
    try:
        hirer_box = wd.find_element(By.CLASS_NAME, "hirer-card__container")
        name = hirer_box.find_element(By.CLASS_NAME, "jobs-poster__name").get_attribute("innerText")
        profile = hirer_box.find_element(By.CLASS_NAME, "app-aware-link").get_attribute("href")
    except Exception:
        return np.nan, np.nan
    return name, profile


def scrape(jobs):
    """Scrape every job card in `jobs` and append one entry per field to the global lists.

    For each card: click it, read the card-level fields, click again and wait
    for the detail pane, then read the insights, link, description and poster.
    All fields of a job are collected first and appended together at the end,
    so a failure part-way through a job cannot leave the lists at different
    lengths.

    Args:
        jobs: job card web elements of the current results page.
    """
    num_jobs = str(len(jobs))
    for position, job in enumerate(jobs, start=1):
        print(str(position)+"/"+num_jobs)
        job.click()
        time.sleep(2)

        #fields read from the card itself
        job_id = job.find_element(By.CLASS_NAME, "job-card-container").get_attribute("data-job-id")
        job_title = job.find_element(By.CLASS_NAME, "job-card-list__title").get_attribute("innerText")
        job_company = job.find_element(By.CLASS_NAME, "job-card-container__primary-description").get_attribute("innerText")
        job_location = job.find_element(By.CLASS_NAME, "job-card-container__metadata-item").get_attribute("innerText")
        #easy apply/ promoted
        job_badges = [el.get_attribute("innerText") for el in job.find_elements(By.CLASS_NAME, "job-card-list__footer-wrapper")]

        #click again and give the detail pane time to load before reading it
        job.click()
        time.sleep(5)
        job_emp_info, job_co_info, job_connections = _scrape_insights()

        #job listing link
        job_link = job.find_element(By.CSS_SELECTOR, "a").get_attribute("href")

        #job description
        job_descr = wd.find_element(By.CLASS_NAME, "jobs-box__html-content").get_attribute("innerText")

        #hirer info
        hirer_name, hirer_link = _scrape_hirer()

        #everything was read successfully: record the job in all lists at once
        ids.append(job_id)
        title.append(job_title)
        company.append(job_company)
        location.append(job_location)
        promoted_easyapply.append(job_badges)
        emp_info.append(job_emp_info)
        co_info.append(job_co_info)
        connections_alumni.append(job_connections)
        link.append(job_link)
        descr.append(job_descr)
        poster_name.append(hirer_name)
        poster_link.append(hirer_link)


In [None]:
#reset the global result lists before a new scrape (asks for confirmation first)
initialize()

In [None]:
#data scientist 24-hr scraper
#`url` is the plain search; `new_url` carries the same filters plus start=900
#(presumably to resume at page 37, since 900 / 25 jobs per page = 36 pages already done)
url = "https://www.linkedin.com/jobs/search/?currentJobId=3705632803&f_TPR=r86400&geoId=90000084&keywords=Data%20Scientist&location=San%20Francisco%20Bay%20Area&refresh=true"
new_url = "https://www.linkedin.com/jobs/search/?currentJobId=3714619029&f_TPR=r86400&geoId=90000084&keywords=Data%20Scientist&location=San%20Francisco%20Bay%20Area&refresh=true&start=900"
#only `new_url` is used here; the end page of 100 is capped inside scraper_loop to the pages that exist
scraper_loop(new_url, 37, 100)

In [None]:
#data analyst 24-hr scraper
#start at page 1; the end page of 100 is capped inside scraper_loop to the pages that exist
url = "https://www.linkedin.com/jobs/search/?keywords=Data%20Analyst&location=San%20Francisco%20Bay%20Area&locationId=&geoId=90000084&f_TPR=r86400&position=1&pageNum=0"
scraper_loop(url, 1, 100)

In [None]:
#assemble the scraped lists into one dataframe, indexed by job id
curr_date = time.strftime("%m-%d-%Y")
#one copy of today's date per scraped job; "scraped on" reuses the same list
date_posted = [curr_date] * len(ids)
date_scraped = date_posted
job_title_category = ["data analyst"] * len(ids)
scraped_dict = {
    "date": date_posted,
    "title": title,
    "company": company,
    "location": location,
    "link": link,
    "full description": descr,
    "promoted/easy apply": promoted_easyapply,
    "employment info": emp_info,
    "company info": co_info,
    "recruiter name": poster_name,
    "recruiter profile link": poster_link,
    "connections and alumni": connections_alumni,
    "scraped on": date_scraped,
    "searched job title": job_title_category,
}
scraped_df = pd.DataFrame(scraped_dict, index=ids)
scraped_df.head()

In [None]:
#drop listings whose job id (the index) was scraped more than once, keeping the first copy.
#`drop_duplicates(subset=...)` expects column labels, so the index has to be de-duplicated
#with `index.duplicated`; the result is assigned so the export below sees it
scraped_df = scraped_df[~scraped_df.index.duplicated(keep="first")]
scraped_df

In [None]:
#write the dataframe to a csv named after the search and the scrape date
name = "".join(["listings_df_", "data_analyst_", str(date_scraped[0]), ".csv"])
scraped_df.to_csv("".join(["D:\\Scraped Datasets\\", name]))

# Data Cleaning, Transformation, and Manipulation Pipeline

In [None]:
#import data
#raw strings keep the Windows backslashes literal (`\S` and `\l` are invalid escape sequences otherwise)
listings_danalyst_09_05_2023 = pd.read_csv(r"D:\Scraped Datasets\listings_df_data_analyst_09-05-2023.csv")
listings_dscientist_09_05_2023 = pd.read_csv(r"D:\Scraped Datasets\listings_df_data_scientist_09-05-2023.csv")

In [None]:
#the per-search dataframes that get stacked into one table
base_listings = [listings_danalyst_09_05_2023, listings_dscientist_09_05_2023]

In [None]:
#combine
#stack the dataframes row-wise; ignore_index=True renumbers the rows from 0
listings = pd.concat(base_listings, ignore_index=True)
listings.head()

In [None]:
#fix column headers
#take the "Unnamed: 0" column out of the frame and put it back at the front as "job id"
listings.insert(0, "job id", listings.pop("Unnamed: 0"))
listings.head()

In [None]:
#separate location and physical location
#e.g. "San Francisco, CA (Hybrid)" -> location "San Francisco, CA", location type "Hybrid"
temp_loc = listings["location"]
#n=1: split only at the first "(" so the result never has more than two columns
temp_loc = temp_loc.str.split("(", n=1, expand=True)
if 1 not in temp_loc.columns:
    #no location had a parenthesised type, so the split produced a single column
    temp_loc[1] = pd.Series(index=temp_loc.index, dtype=object)
else:
    #drop the closing ")"
    temp_loc[1] = temp_loc[1].str[:-1]
#strip the space that sat between the place name and the "("
listings["location"] = temp_loc[0].str.strip()
listings.insert(5, "location type", temp_loc[1])
listings.head()

In [None]:
#split the combined "promoted/easy apply" text into two boolean columns
def promote_func(x):
    #True when the badge text mentions "Promoted"
    return "Promoted" in x


def ea_func(x):
    #True when the badge text mentions "Easy Apply"
    return "Easy Apply" in x


temp_promoted = listings["promoted/easy apply"].apply(promote_func)
temp_ea = listings["promoted/easy apply"].apply(ea_func)
listings.insert(8, "promoted", temp_promoted)
listings.insert(9, "easy apply", temp_ea)
#the combined column is no longer needed once both flags exist
del listings["promoted/easy apply"]
listings.head()

In [None]:
#display the raw "employment info" column
listings["employment info"]

In [None]:
#create pay, hours, and experience columns
#listings["employment info"][0]
#temp_emp_info = listings["employment info"].str.rsplit("·")

#lower bound: the first "$123,456"-style amount in the text (NaN when there is none)
pay_lower = listings["employment info"].str.extract(r'(\$[0-9]{1,3},[0-9]{3})')[0]
#upper bound: the amount that follows "- "; only the amount is captured, so no
#separator or whitespace is left in front of the "$"
pay_higher = listings["employment info"].str.extract(r'-\s(\$[0-9]{1,3},[0-9]{3})')[0]
#TODO: pay_type, temp_hours and temp_exp are not implemented yet; they stay commented out
#so the cell runs (a bare `pay_type =` is a SyntaxError)
#pay_type =
#temp_hours =
#temp_exp =
pay_higher