# Scripts for automatic job searching through job websites

## General functions

In [72]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException 
from bs4 import BeautifulSoup as bs
from datetime import datetime
from time import sleep
import pandas as pd
import requests
import json
import re
import os

def by_xpath(xpath, fill=None, enter=False, wait=0):
    '''Fills in an input field or clicks on an element located by XPath.

    Uses the module-level ``driver`` and waits up to 10 seconds for the
    element to become clickable.

    Parameters:
        xpath - [str] XPath expression locating the element
        fill  - [str or None] text typed into the element after clicking
                and clearing it; when None nothing is typed
        enter - [boolean] whether to send the RETURN key to the element
        wait  - [number] seconds to pause: after typing when ``fill`` is
                given, or before a plain click when neither ``fill`` nor
                ``enter`` is used

    Returns: None

    Raises:
        selenium.common.exceptions.TimeoutException if the element does
        not become clickable within 10 seconds.'''

    # Let a timeout propagate as-is. The previous try/finally ran the
    # interaction code even when the wait failed, which replaced the real
    # error with an UnboundLocalError on ``elem``.
    elem = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))  # finds the specified field

    if fill is not None:
        elem.click()
        elem.clear()
        elem.send_keys(fill)
        sleep(wait)
        if enter:
            elem.send_keys(Keys.RETURN)
    elif enter:
        elem.click()
        sleep(1)
        elem.send_keys(Keys.RETURN)
    else:
        sleep(wait)
        elem.click()
    return None

def if_element(xpath):
    '''Determines if element present in page.

    Pauses 3 seconds and then looks the element up through the
    module-level ``driver``.

    Parameters:
        xpath - [str] XPath expression locating the element

    Returns: [boolean] True if the element was found, False otherwise'''

    # Only ``sleep`` is imported from ``time`` in this file, so the former
    # ``time.sleep(3)`` raised NameError before the lookup even ran.
    sleep(3)
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # is no longer available in Selenium 4.
        driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False
    return True

def elem_siblings(url, tag, class_):
    '''Returns the number of child nodes of the first matching html block.

    Downloads ``url`` with ``requests`` and takes ``len()`` of the first
    ``tag`` element carrying the given class.

    Parameters:
        url    - [str] address of the page to download
        tag    - [str] tag name of the block, e.g. 'ul'
        class_ - [str] css class of the block

    Returns: [int] number of child nodes of the block, or 0 when the page
        contains no such block'''

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(requests.get(url).text, 'html.parser')
    block = soup.find(tag, class_=class_)
    if block is None:
        # find() returns None when nothing matches; len(None) would raise.
        return 0
    # NOTE(review): len() of a tag counts all direct children, which
    # presumably can include whitespace text nodes - confirm that this
    # equals the number of pages for the pagination list.
    return len(block)

def jobs_tofile(filename, keyword, location):
    '''Extracts job headlines and their hrefs from the current web page
    and appends them to a tab-separated file.

    The page is downloaded again with ``requests`` from
    ``driver.current_url``. Rows are appended without a header to
    ``<filename>_<DD-Mon>.csv``.

    Parameters:
        filename - [str] name of the file without extension
        keyword  - [str] search keyword stored next to every job title
        location - [str] search location stored next to every job title

    Returns: None'''

    current_date = datetime.now().strftime("%d-%b") # current date [Day - Month]
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs(requests.get(driver.current_url).text, 'html.parser') # soup object

    job_links = soup.find_all('a',
        class_=re.compile("listed-job-posting listed-job-posting--is-link*"))

    # Skip anchors without a headline or link instead of letting a single
    # malformed block abort the whole extraction.
    headlines = [(block.h3.get_text(), keyword, location, block['href'])
                 for block in job_links
                 if block.h3 is not None and block.has_attr('href')]

    df = pd.DataFrame(headlines, columns=['job_title', 'keyword', 'location', 'href'])
    df.to_csv(filename + '_' + current_date + '.csv', sep='\t',
              encoding='utf-8', mode='a', header=False, index=False)

def scan_pages(keyword, location, filename):
    '''Clicks through all result pages and extracts jobs from them.

    The page count is taken from the 'pagination__pages' list of the page
    currently open in the module-level ``driver``.

    Parameters:
        keyword  - [str] search keyword passed on to jobs_tofile
        location - [str] search location passed on to jobs_tofile
        filename - [str] name of the output file without extension

    Returns: [boolean] False when there was a single page (which is saved
        here), True otherwise

    Raises:
        selenium.common.exceptions.TimeoutException if a page link does
        not become clickable within 10 seconds.'''

    nr_of_pages = elem_siblings(driver.current_url, 'ul', class_='pagination__pages')

    if nr_of_pages == 1:
        jobs_tofile(filename, keyword, location)
        return False

    # NOTE(review): with more than one page only pages 2..n are written
    # here; the page that is open on entry is not saved by this function.
    for page in range(2, nr_of_pages + 1):
        # Let a timeout propagate. The previous try/finally clicked in the
        # ``finally`` clause, where ``elem`` is unbound after a failed wait.
        elem = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"{}")]'.format(page))))
        elem.click()
        jobs_tofile(filename, keyword, location)
    return True

def show_jobs(filename):
    '''Loads today's job file and prints it as a DataFrame.

        Parameters:
            filename : [str] name of the file without the date suffix
                       and extension
        Returns None'''
    column_names = ['job_title', 'keyword', 'location', 'href']

    # the file name carries the current date as [Day - Month]
    date_suffix = datetime.now().strftime("%d-%b")
    csv_path = filename + '_' + date_suffix + '.csv'

    # the file is tab separated and has no header row
    jobs = pd.read_csv(csv_path, sep='\t', header=None, names=column_names)
    print(jobs)

def search_jobs(url, keywords, locations, filename, date='week'):
    '''Finds jobs on linkedin and stores them into a .csv file.

        Every keyword is searched in every location using the
        module-level ``driver``, which is always closed at the end,
        also when a step fails.

        Parameters:
            url : [str] web page address with search options.
            keywords: [list] job titles.
            locations: [list] locations / regions / states.
            filename: [str] name of the file where data is printed.
            date : [str] "24"    : past 24 hours
                         "week"  : past week
                         "month" : past month
                         "any"   : anytime

        Returns None'''

    dates = {'24' : '0','week' : '1','month' : '2','any' : '3'}

    assert isinstance(keywords, list), "Error, keyword not 'list' type!"
    assert isinstance(locations, list), "Error, locations not 'list' type!"
    assert date in dates, 'Error, no such date! Use either:\n"24", "week", "month", "any"'

    try:
        driver.get(url) # navigates to given URL

        for i, location in enumerate(locations):
            for j, keyword in enumerate(keywords):

                # filling search fields
                sleep(2)
                by_xpath('//*[@id="keyword-box-input"]', fill=keyword)
                by_xpath('//*[@id="location-box-input"]', fill=location)
                by_xpath('//button[@type="submit"]')

                if i + j == 0:
                    # only on the very first search: the location has to be
                    # entered a second time (buggy page), then the filters
                    # are set up
                    by_xpath('//*[@id="location-box-input"]', fill=location) # buggy
                    by_xpath('//button[@type="submit"]')

                    # filters
                    by_xpath('//button[text()="Date Posted"]') # which period
                    by_xpath('//label[@for="TIME_POSTED-{}"]'.format(dates[date]))
                    by_xpath('//button[@type="submit"][contains(text(),"Apply")]')
                    by_xpath('//button[text()="Experience Level"]') # exp levels
                    by_xpath("//label[@for='EXPERIENCE-0']")
                    by_xpath("//label[@for='EXPERIENCE-1']")
                    by_xpath("/html/body/main/section[2]/div/div[2]/div[1]/ul/li[4]"
                             "/form/div/div/fieldset/div[2]/button[2]")
                    by_xpath('//select[@id="sort-options"]') # sort by date
                    by_xpath('//option[@value="DD"]')

                # scan_pages writes the page that is open on entry only when
                # the pagination count is exactly 1; in every other case it
                # starts at page 2. Save the first page here in those cases
                # so that each page is written exactly once for every search
                # (before, it was written twice or not at all depending on
                # the iteration).
                nr_of_pages = elem_siblings(driver.current_url, 'ul',
                                            class_='pagination__pages')
                if nr_of_pages != 1:
                    jobs_tofile(filename, keyword, location)
                scan_pages(keyword, location, filename)
    finally:
        # always release the browser, even if a step above raised
        driver.quit()

### LinkedIn

In [74]:
# The helper functions above read this module-level `driver`, so it has to
# be created before search_jobs is called; search_jobs closes it again
# with driver.quit() when it is done.
driver = webdriver.Firefox() # setting up the driver

# `date` is not passed, so the default 'week' (past week) filter is used.
# Results are appended to 'linkedin_<DD-Mon>.csv'.
search_jobs(url="https://ca.linkedin.com/jobs/",
            keywords=['Data Scientist', 'Machine Learning', 'Data Analyst'],
            locations=['British Columbia', 'Alberta'],
            filename='linkedin')

### Testing LinkedIn without logging in

In [27]:
# Manual walk-through of a single LinkedIn search ('Data Scientist' in
# 'British Columbia') without logging in, using the helpers defined above.

# setting up the driver
driver = webdriver.Firefox()

try:
    # The driver.get method will navigate to a page given by the URL.
    driver.get("https://ca.linkedin.com/jobs/")

    # search field
    by_xpath('//*[@id="keyword-box-input"]', fill='Data Scientist')
    by_xpath('//*[@id="location-box-input"]', fill='British Columbia')
    by_xpath('//button[@type="submit"]')

    # for some reason there's a bug and one field has to be filled in again
    by_xpath('//*[@id="location-box-input"]', fill='British Columbia')
    by_xpath('//button[@type="submit"]')

    # date posted
    by_xpath('//button[text()="Date Posted"]')
    by_xpath('//label[@for="TIME_POSTED-0"]')
    by_xpath('//button[@type="submit"][contains(text(),"Apply")]')

    # experience levels
    by_xpath('//button[text()="Experience Level"]')
    by_xpath("//label[@for='EXPERIENCE-0']")
    by_xpath("//label[@for='EXPERIENCE-1']")
    #by_xpath("//button[@type='submit'][contains(text(),'Apply')]", enter=True)
    by_xpath("/html/body/main/section[2]/div/div[2]/div[1]/ul/li[4]/form/div/div/fieldset/div[2]/button[2]")

    # sort by
    by_xpath('//select[@id="sort-options"]')
    by_xpath('//option[@value="DD"]')

    # read off job titles and hrefs from each page
    nr_of_pages = elem_siblings(driver.current_url, 'ul', class_='pagination__pages')
    # jobs_tofile takes (filename, keyword, location) and has no `head`
    # argument, so all three values are passed on every call
    jobs_tofile('linkedin', 'Data Scientist', 'British Columbia')
    for page in range(2, nr_of_pages + 1):
        # a failed wait raises TimeoutException here instead of reaching a
        # click on an element that was never found
        elem = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[contains(text(),"{}")]'.format(page))))
        elem.click()
        jobs_tofile('linkedin', 'Data Scientist', 'British Columbia')
finally:
    # close the browser even if one of the steps above failed
    driver.quit()

# show_jobs('linkedin')