In [None]:
#load packages for scraping
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
import time
import getpass
import sys
from tqdm import tqdm

#load packages for data manipulation and storage
import numpy as np
import pandas as pd
import re
from random import randint
from random import uniform

from sqlalchemy import create_engine

from collections import defaultdict

In [None]:
def save_data(fname, df, database_filename, if_exists='append'):
    """Write a DataFrame to a table in a SQLite database file.

    Parameters
    ----------
    fname : str
        Name of the destination table.
    df : pandas.DataFrame
        Data to write; the DataFrame index is not stored.
    database_filename : str
        Path of the SQLite file, appended to the ``sqlite:///`` URL prefix.
    if_exists : str, default 'append'
        Forwarded unchanged to ``DataFrame.to_sql``.
    """
    engine = create_engine('sqlite:///' + database_filename)
    try:
        df.to_sql(fname, engine, index=False, if_exists=if_exists)
    finally:
        # A fresh engine is built on every call, so release its connection
        # resources explicitly instead of leaving that to garbage collection.
        engine.dispose()
    
def load_data(fname, database_filename):
    """Read a whole table from a SQLite database file into a DataFrame.

    Parameters
    ----------
    fname : str
        Name of the table to read.
    database_filename : str
        Path of the SQLite file, appended to the ``sqlite:///`` URL prefix.

    Returns
    -------
    pandas.DataFrame
        The content of table ``fname``.
    """
    engine = create_engine('sqlite:///' + database_filename)
    try:
        return pd.read_sql_table(fname, con=engine)
    finally:
        # A fresh engine is built on every call, so release its connection
        # resources explicitly instead of leaving that to garbage collection.
        engine.dispose()

In [None]:
# Ask for the LinkedIn credentials and the number of job urls to collect.
# getpass keeps the password from being echoed into the notebook output.
session_key = input('username: ')
session_password = getpass.getpass('password: ')
# Kept as the raw string typed by the user; it is converted with int()
# at the point of use.
n = input('number of jobs: ')

#initialize starting url
start_url = 'https://www.linkedin.com'

# Start Firefox. Headless mode is currently switched off; uncomment the
# add_argument line below to run without a visible browser window.
options = Options()
# options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get(start_url)

# Type the credentials into the login form, pausing a random 1-3 seconds
# after each field.
username = driver.find_element(By.NAME, 'session_key')
username.send_keys(session_key)
time.sleep(uniform(1, 3))
password = driver.find_element(By.NAME, 'session_password')
password.send_keys(session_password)
time.sleep(uniform(1, 3))

driver.find_element(By.ID, 'login-submit').click()


In [None]:
# Open the jobs section through the element with id 'jobs-nav-item'.
driver.find_element(By.ID, 'jobs-nav-item').click()
position_name = 'data analyst'
#fill in position name and search
time.sleep(uniform(1, 3))
search = driver.find_element(By.XPATH, "//input[@placeholder='Search jobs']")
search.send_keys(position_name)
time.sleep(uniform(1, 3))
# The submit button is matched on two classes at once. That is a CSS
# selector, not a single class name, so it is located as one.
driver.find_element(
    By.CSS_SELECTOR,
    '.jobs-search-box__submit-button.button-secondary-large-inverse',
).click()

In [None]:
# Click every element whose id contains '-trigger'. A failing click is
# tolerated: pause briefly and move on to the next element.
trigger = driver.find_elements(By.XPATH, "//*[contains(@id,'-trigger')]")
for t in trigger:
    try:
        t.click()
    except Exception:
        # Best effort only. Exception (not a bare except) so that a
        # keyboard interrupt still stops the cell.
        time.sleep(randint(1, 3))
time.sleep(randint(1, 4))
# NOTE(review): the button is picked purely by position (sixth button whose
# class contains 'dropdown'); presumably this is the "classic view" switch.
# Confirm against the live page - this raises IndexError if fewer exist.
classic_view = driver.find_elements(By.XPATH, "//button[contains(@class,'dropdown')]")[5]
time.sleep(randint(1, 2))
classic_view.click()

In [None]:
# Collected job urls, kept under the single key 'urls' so the dict converts
# directly into a one-column DataFrame.
job_url = defaultdict(list)

# Keys of the urls already collected, used to skip duplicated job entries.
url_test = set()

# Number of urls requested by the user.
target_count = int(n)

# Wait (up to 10 s) for the first job result link. A timeout raises here,
# before the loop below runs, so an empty result can never replace a
# previously saved 'urls_DA' table.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH,
            "//a[@data-control-name='A_jobssearch_job_result_click']")))

# Collect job urls page by page until enough are gathered or there is no
# next page.
while len(job_url['urls']) < target_count:
    # Scroll down in steps so the page loads all of its results.
    # window.scroll takes the target (x, y) position, so x stays 0 and only
    # the vertical offset grows. Each step is followed by a random pause.
    for y_offset, max_pause in ((1080, 3), (2160, 5), (3240, 4), (4320, 5)):
        driver.execute_script(f"window.scroll(0, {y_offset});")
        time.sleep(randint(1, max_pause))

    #get page source and parse by BeautifulSoup
    source = driver.page_source
    bsObj = BeautifulSoup(source, 'lxml')

    for url in bsObj.find_all('a', {'data-control-name': 'A_jobssearch_job_result_click'}):
        href = url.attrs.get('href')
        if not href:
            continue
        # De-duplicate on the url without its query string. A fixed-length
        # prefix would cut long job ids short and merge distinct jobs.
        key = href.split('?', 1)[0]
        if key not in url_test:
            url_test.add(key)
            job_url['urls'].append(href)

    #monitor the number of urls collected
    print(f"Number of urls collected: {len(job_url['urls'])}")
    # Persist after every page so progress survives an interrupted run.
    url_df = pd.DataFrame.from_dict(job_url)
    save_data('urls_DA', url_df, 'LinkedinJob', if_exists='replace')

    #navigate to the next page
    try:
        driver.find_element(By.XPATH, "//button[@class = 'next']").click()
        time.sleep(randint(1, 5))
    except Exception:
        # No usable "next" button: treat this as the last page.
        break
print(f"Number of urls collected: {len(job_url['urls'])}")

In [None]:
# Read the saved job urls back from the 'urls_DA' table and keep them as a
# plain Python list.
url_table = load_data('urls_DA', 'LinkedinJob')
urls = url_table['urls'].values.tolist()
print(f'Number of urls: {len(urls)}')

In [None]:
# df_dict accumulates scraped values column by column (missing keys start as
# empty lists); error_list starts out as an empty list.
df_dict, error_list = defaultdict(list), list()

In [None]:
# Index into `urls` at which scraping starts; earlier entries are skipped.
START_INDEX = 526
# Rows are written to the database every time this many have accumulated.
BATCH_SIZE = 10

SEE_MORE_XPATH = "//button[contains(@class, 'artdeco-button')]"
DETAILS_DIV = {'class': 'jobs-description-details'}


def _expand_and_parse():
    """Click every 'see more' button on the current page, then parse it.

    Returns the BeautifulSoup tree of the page source after a short random
    pause. Any Selenium error is propagated to the caller.
    """
    for button in driver.find_elements(By.XPATH, SEE_MORE_XPATH):
        if button.text.lower() == 'see more':
            button.click()
    time.sleep(randint(1, 5))
    return BeautifulSoup(driver.page_source, 'lxml')


def _extract_title(soup):
    """Text of the first <h1>."""
    return soup.find('h1').get_text()


def _extract_company_location(soup):
    """(company, location) taken from the lines of the first <h3>."""
    info = re.sub(' {2,}', '', soup.find('h3').get_text()).strip('\n').split('\n')
    return info[1], info[-1]


def _extract_description(soup):
    """Stripped text of the element with id 'job-details'."""
    return soup.find('div', {'id': 'job-details'}).get_text().strip()


def _extract_detail_paragraph(soup, css_class):
    """Text of the <p> with ``css_class`` inside the details div.

    Returns NaN (without raising) when the paragraph itself is absent.
    """
    tag = soup.find('div', DETAILS_DIV).find('p', attrs={'class': css_class})
    return np.nan if tag is None else tag.get_text()


def _extract_detail_list(soup, position):
    """'/'-joined lines of the <ul> at ``position`` inside the details div."""
    lines = soup.find('div', DETAILS_DIV).find_all('ul')[position].get_text().strip().split('\n')
    return '/'.join(lines)


def _extract_skills(soup):
    """'/'-joined non-empty lines of the first 'jobs-box__group' div.

    'No match' markers are removed and the first three lines are skipped.
    """
    lines = soup.find('div', {'class': 'jobs-box__group'}).get_text().strip().replace('No match', '').split('\n')[3:]
    return '/'.join(line for line in lines if line != '')


# (columns filled, label used in the failure message, extractor)
_FIELD_SPECS = [
    (('title',), 'title', _extract_title),
    (('company', 'location'), 'company or location', _extract_company_location),
    (('description',), 'description', _extract_description),
    (('seniority',), 'seniority',
     lambda soup: _extract_detail_paragraph(soup, 'jobs-box__body js-formatted-exp-body')),
    (('employment_type',), 'employent type',
     lambda soup: _extract_detail_paragraph(soup, 'jobs-box__body js-formatted-employment-status-body')),
    (('industry',), 'industry', lambda soup: _extract_detail_list(soup, 0)),
    (('functions',), 'function', lambda soup: _extract_detail_list(soup, 1)),
    (('skills',), 'skills', _extract_skills),
]


def _parse_job(soup, url):
    """Build one row (column -> value) from a parsed job page.

    A field that cannot be extracted is stored as NaN and reported with a
    printed message, so every row always carries every column.
    """
    row = {}
    for columns, label, extractor in _FIELD_SPECS:
        try:
            values = extractor(soup)
            if len(columns) == 1:
                values = (values,)
        except Exception:
            values = (np.nan,) * len(columns)
            print(f'Cannot get {label} {url}')
        row.update(zip(columns, values))
    return row


for url in tqdm(urls[START_INDEX:]):
    try:
        driver.get(start_url + url)
        time.sleep(randint(5, 8))
        try:
            bsObj = _expand_and_parse()
        except Exception:
            # Expanding the page can fail transiently; retry once.
            print('Try again')
            bsObj = _expand_and_parse()
    except Exception:
        # The page could not be loaded or parsed. Record the url and skip
        # it, otherwise the previous job's page would be parsed again and
        # stored as this job's row.
        print(f'Error: {url}')
        error_list.append(url)
        continue

    for column, value in _parse_job(bsObj, url).items():
        df_dict[column].append(value)

    time.sleep(randint(1, 5))
    if len(df_dict['title']) % BATCH_SIZE == 0:
        # Raw text is stored as scraped; no cleaning is applied here.
        df = pd.DataFrame.from_dict(df_dict)
        save_data('listingDataAnalyst', df, 'LinkedinJob')
        # Start a fresh batch so saved rows are not appended a second time.
        df_dict = defaultdict(list)

In [None]:
# Write the rows still held in df_dict (those not yet saved as a full
# batch) to the 'listingDataAnalyst' table.
df = pd.DataFrame.from_dict(df_dict)
if not df.empty:
    save_data('listingDataAnalyst', df, 'LinkedinJob')
    # Reset the accumulator: the table is written in append mode, so
    # re-running this cell must not insert the same rows again.
    df_dict = defaultdict(list)

In [None]:
# Navigate the browser straight to one specific job posting.
job_page_url = 'https://www.linkedin.com/jobs/view/989513065/?eBP=JOB_SEARCH_ORGANIC&refId=9671756e-77b9-4c9e-833d-7859f564371c&trk=d_flagship3_search_srp_jobs'
driver.get(job_page_url)

In [None]:
# Shut the browser down. quit() ends the whole WebDriver session and the
# driver process, whereas close() only closes the current window.
driver.quit()