## Scraping Job Posting Data from Indeed 
Source:
https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac

strategy:

* Get all the job posting links
* Click each link and parse text from the job posting page 
* Store the parsed text data

The questions to address
* optimizing query strategy

Researching job analysis with machine learning

https://www.jobspikr.com/blog/analysis-of-machine-learning-job-listings-data-reveals-the-key-skills/

 Further, we find that data science jobs can be grouped into three main personas: Core data scientists, researchers, and big data specialists. https://www.glassdoor.com/research/data-scientist-personas/
 


### 1. Get all the job posting links

In [227]:
# all import statements

from bs4 import BeautifulSoup
import pandas as pd

from selenium import webdriver
import re
from collections import defaultdict

# to write from list to csv file
import csv

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from nltk.tokenize import sent_tokenize, word_tokenize

# to get string punctuation constant
import string
from re import sub

In [3]:
 """
    Given the url of a page, this function returns the soup object.
    
    Parameters:
        url: the link to get soup object for
    
    Returns:
        soup: soup object
    """

def get_soup(url):
    driver = webdriver.Chrome("C:/Users/lili/Documents/icode/scraping/chromedriver")
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    driver.close()
    return soup
#soup = get_soup("https://il.indeed.com/jobs?q=data+scientist&l=tel+aviv&start=10")


In [4]:
   """
    Grab all non-sponsored job posting links from a Indeed search result page using the given soup object
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
    
    Returns:
        urls: a python list of job posting urls
    
    """
def grab_job_links(soup):
    urls = []
    for link in soup.find_all('h2', {'class': 'jobtitle'}):
        partial_url = link.a.get('href')
        url = 'https://il.indeed.com' + partial_url
        urls.append(url)
    return urls

#urls = grab_job_links(soup)


In [5]:
 """
    Grab number of result pages, from a Indeed search result page using the given soup object
    
    Parameters:
        soup: the soup object corresponding to a search result page
                e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20
    
    Returns:
        num_pages: integer number of result pages
    
    """
def grab_num_pages(soup):
    page_in_search = soup.find(name='div', attrs={'id':"searchCount"}).get_text()
    p = re.compile('\d+')
    num_pages = p.findall(page_in_search)
    num_pages = int(num_pages[1])
    return num_pages


In [6]:
"""
    Get all the job posting URLs resulted from a specific search.
    
    Parameters:
        query: job title to query
        num_pages: number of pages needed
        location: city to search in
    
    Returns:
        urls: a list of job posting URL's (when num_pages valid)
        max_pages: maximum number of pages allowed ((when num_pages invalid))
    """
query = ["data+scientist"]
location = ["Tel+Aviv"]

def get_urls(query, num_pages, location):
    # We always need the first page
    base_url = 'https://il.indeed.com/jobs?q={}&l={}'.format(query, location)
    soup = get_soup(base_url)
    urls = grab_job_links(soup)
    num_listings = grab_num_pages(soup)
    print("num_listings",num_listings)
    num_pages_calc = int(num_listings/10)
    print("num_pages_calc",num_pages_calc)

    # starting page 2
    for i in range(2, num_pages_calc+1):
            num = (i-1) * 10
            base_url = 'https://il.indeed.com/jobs?q={}&l={}&start={}'.format(query, location, num)
            try:
                soup = get_soup(base_url)
                # We always combine the results back to the list
                urls += grab_job_links(soup)
            except:
                continue
    return urls 
    
# finished stage 1 we have urls a list  with job posting links

In [7]:
# just a check that every thing works so far

urls = get_urls(query, 5, location)
print(len(urls))
print(urls)

num_listings 181
num_pages_calc 18
178
['https://il.indeed.com/rc/clk?jk=e3e47792c609fde3&fccid=c869809b954e0ce1&vjs=3', 'https://il.indeed.com/rc/clk?jk=eda154783a2eb5de&fccid=3ccff773fd1c9a46&vjs=3', 'https://il.indeed.com/rc/clk?jk=ba46a8ae2e4ba9f8&fccid=cad845b9fe80e774&vjs=3', 'https://il.indeed.com/rc/clk?jk=5e289a3addfaf33d&fccid=734cb5a01ee60f80&vjs=3', 'https://il.indeed.com/company/BigaBid/jobs/Data-Scientist-29ca8238428aa6ba?fccid=620c28d86cfedd30&vjs=3', 'https://il.indeed.com/rc/clk?jk=b84afe72af7b3aca&fccid=502732c3841cad8b&vjs=3', 'https://il.indeed.com/rc/clk?jk=98d309ccd697954a&fccid=734cb5a01ee60f80&vjs=3', 'https://il.indeed.com/company/BIYOND,-AWS-PARTNER/jobs/Data-Scientist-34600ddffe510397?fccid=5a712464d4a73a07&vjs=3', 'https://il.indeed.com/rc/clk?jk=2d6732ccf0f8278a&fccid=1c6aeebf4483d9bf&vjs=3', 'https://il.indeed.com/rc/clk?jk=46beaaf8f0caa9b1&fccid=5f54c81c18e58104&vjs=3', 'https://il.indeed.com/company/BigaBid/jobs/Data-Scientist-29ca8238428aa6ba?fccid=620c

### 2. Click each link and parse text from the job posting page

In [8]:
"""
    Get the text portion including both title and job description of the job posting from a given url
    
    Parameters:
        url: The job posting link
        
    Returns:
        title: the job title (if "data scientist" is in the title)
        posting: the job posting content    
    """

def get_posting1(url):
    soup = get_soup(url)
    title = soup.find(name='h3').getText().lower()
    posting = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"}).get_text()
    return title, posting.lower()
    

In [9]:
'''
    get title + discribtion for every job posting url
    
    parameters: 
            url list 'urls'
            
    returns: 
            list with tuples job title and description
'''

posting_list =[]
for i in range(0, len(urls)):
    posting = get_posting1(urls[i])
    posting_list.append(posting)
  

In [10]:
# check len posting list 

print(len(posting_list))
print(posting_list)


178
[('data scientist', 'data scientisttaboola-תל אביב -יפו, מחוז תל אביבapply nowapply on company sitesave this jobtaboola, the world’s largest content recommendation platform, is hiring a data scientist.\n\nin this job you will:\n\nbe responsible for the entire algorithmic life-cycle in the company: data analytics, prototyping of new ideas, implementing algorithms in a production environment and then monitoring and maintaining them\nturn algorithm prototypes into shippable products that will have a significant and immediate impact on the company’s revenue\nwork on a daily basis with some of the hottest trends in today’s job market: machine/deep learning, big data analytics and cloud computing\napply your scientific knowledge and creativity to analyze large volumes of diverse data and develop algorithms to solve complex problems\ninfluence directly on the way billions of people discover the internet\nwork on projects such as internet personalization, content feed, real time bidding, v

In [16]:
# save posting list to csv
# so will not have to run all process again


df = pd.DataFrame(posting_list)
df.head()
df.to_csv('posting_list.csv')


### 3. Get list of unique postings by keyword

In [12]:
'''
    gets company name from description
    
    parameters: 
            listing
            
    returns: 
            company name
'''


def pick_company(position):
    pos_com = re.compile('^[^-]+')
    sentence_l = pos_com.findall(position[1])
    company = sentence_l[0][len(position[0]):]
    
    return company
pick_company(posting_list[0])

'taboola'

In [64]:
'''
   filters listing list for listings with specific keywords
    
    parameters: 
            keywords and listings list
            
    returns: 
            sublist of listings which contain keywords
'''


key_words = [('data','scientist'), ('data', 'analyst')]

def filter_listings(key_word, posting_list):
    listx = []
    for k in posting_list:
        if key_word[0] in k[0] and key_word[1] in k[0]:
            if k in listx:
                continue
            else:
                listx.append(k)
    return listx

data_sci_des =filter_listings(('data','scientist'), posting_list)
print(len(data_sci_des))

72


In [67]:
'''
   generate list of non-identical duplicates
    
    parameters: 
            list of postings
            
    returns: 
            list with duplicates only
'''
def list_duplicates(data_sci_des):
    print(len(data_sci_des)," lengthlist")
    duplicates = []
    for posting in range (len(data_sci_des)):
        looping = posting + 1
        while looping < len(data_sci_des):
           
            if fuzz.token_sort_ratio(data_sci_des[posting], data_sci_des[looping]) >= 70:
                if fuzz.token_sort_ratio(data_sci_des[posting], data_sci_des[looping]) == 100:
                    duplicates.append(data_sci_des[posting])
                else:
                    if data_sci_des[posting][0] == data_sci_des[looping][0]:
                       
                        if fuzz.partial_ratio(pick_company(data_sci_des[posting]), pick_company(data_sci_des[looping]))>80:
                            print(pick_company(data_sci_des[posting]), pick_company(data_sci_des[looping]))
                            print(fuzz.partial_ratio(pick_company(data_sci_des[posting]), pick_company(data_sci_des[looping])))
                            print("yeees")
                            duplicates.append(data_sci_des[posting])
                        else:
                            print(fuzz.partial_ratio(pick_company(data_sci_des[posting]), pick_company(data_sci_des[looping])))
                            print("not ehough")
                            print(0)
                         
                        
                    
                break
            else:
                looping += 1
       
        
    return duplicates
        
            
          

In [66]:
'''
   generate list of postings without non-identical duplicates
    
    parameters: 
            list of postings
            
    returns: 
            list of postings without non identical duplicates.
'''

def delete_duplicates(listing_by_keyword):
    dups = list_duplicates(listing_by_keyword)
    for i in dups:
        listing_by_keyword.remove(i)
    return listing_by_keyword

zzz = delete_duplicates(data_sci_des)
print(len(zzz))

72  lengthlist
lemonade inc. lemonade insurance company
85
yeees
25
not ehough
0
intuit intuit
100
yeees
69


In [265]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

my_text = ''
for zz in zzz:
    my_text += zz[1]
#print(my_text)

tokens = word_tokenize(my_text)
print(len(tokens))
#print(tokens)
print('number of unique words')
print(len(set(tokens)))

filtered_words = [word.lower() for word in tokens if word not in stopwords.words('english')]
print('no stopwords')
print(len(filtered_words))
words_freq = {}

for word in filtered_words:
    if word in words_freq.keys():
        words_freq[word] += 1
        continue
    else:
        words_freq[word] =1
print(words_freq)


23987
number of unique words
3412
no stopwords
17630
{'data': 400, 'scientisttaboola-תל': 1, 'אביב': 63, '-יפו': 62, ',': 1521, 'מחוז': 83, 'תל': 74, 'אביבapply': 57, 'nowapply': 68, 'company': 94, 'sitesave': 51, 'jobtaboola': 1, 'world': 40, '’': 85, 'largest': 5, 'content': 5, 'recommendation': 6, 'platform': 14, 'hiring': 3, 'scientist': 91, '.': 676, 'job': 156, ':': 192, 'responsible': 10, 'entire': 9, 'algorithmic': 9, 'life-cycle': 1, 'analytics': 25, 'prototyping': 2, 'new': 58, 'ideas': 12, 'implementing': 6, 'algorithms': 70, 'production': 19, 'environment': 28, 'monitoring': 4, 'maintaining': 4, 'turn': 2, 'algorithm': 21, 'prototypes': 6, 'shippable': 1, 'products': 31, 'significant': 5, 'immediate': 1, 'impact': 18, 'revenue': 1, 'work': 84, 'daily': 3, 'basis': 4, 'hottest': 2, 'trends': 6, 'today': 2, 'market': 3, 'machine/deep': 1, 'learning': 160, 'big': 35, 'cloud': 21, 'computing': 9, 'apply': 19, 'scientific': 8, 'knowledge': 42, 'creativity': 8, 'analyze': 18, 'la

In [267]:
print(len(set(filtered_words)))

3300


### 4. analyze listing content

In [242]:
job_scope =['in this job you will:','requirements']
prog_in =['python','java','tensorflow', 'sql', 'spark', 'kafka', 'r', 'cassandra', 'elasticsearch', 'bigquery', 'google cloud', 'docker', 'scala', 'c++','matlab']
education = ['m.sc.', 'phd.','bsc/msc','master’s', 'computer science', 'cs', 'ee', 'mathematics', 'engineering', 'physics','related','degree', 'bachelor’s','statistics']
experience = ['years', 'experience']
optional = ['nice to have:', '– an advantage']
skills =['communication', 'accuracy','visualization', 'machine learning', 'deep learning']
junior = ['student']

In [171]:
'''
   generate list of composite sentenses in posting
    
    parameters: 
            one posting description string
            
    returns: 
            list of sentences in posting
'''

# analyse sentenses in listing

def break_sentences(listing):

    length_listing = len(listing)
    sentence_l = []
    sentence = re.compile('[^\n]+\n')
    end_sentence = re.compile('[\n](.+)\-.+ago')
    # the last part of the listing does not have line break in the end therefore needs special regex
    
    sentence_l = sentence.findall(listing)
    end_sentence_l = end_sentence.findall(listing)
    if len(sentence_l)> 0:   
        sentence_l.append(end_sentence_l[0])
   
   
    len_sentences = 0
    for i in sentence_l:
        len_sentences += len(i)
    print("listing", length_listing, "sentences", len_sentences)
    if len_sentences/length_listing < 0.9:
        sentence_l = sent_tokenize(listing)
        print("tokenize")
        for i in sentence_l:
            len_sentences += len(i)
        print(len_sentences)
    return sentence_l




In [240]:
'''
   filters the relevant sentence by theme
    
    parameters: 
            theme( eg. education qualification, experience, etc.) and posting
            
    returns: 
            list of relevant sentences
'''
# problem this kind of cleaning breaks relevant expresions such as "computer science"

def thematic_ana(theme, listing):
  
    temp_words = []
    selected = []
    for i in listing:
        i = i.replace('/',' ')
        words = i.split()
        table = str.maketrans('', '', sub('\+', '',string.punctuation))
        stripped = [w.translate(table) for w in words]
        for ii in stripped:
            if ii in theme:
                selected.append(stripped)
                break
            else:
                continue
    return selected


In [241]:
index = 0
for zz in zzz:
    print(index)
    print(zz)
    print('')
    sent_bag = break_sentences(zz[1])
    prog_lang = thematic_ana(prog_in,sent_bag)
    print(prog_lang)
    print()
    academic = thematic_ana(education,sent_bag)
    print(academic)
    print()
    exp = thematic_ana(experience,sent_bag)
    print(exp)
    print()
    index += 1

0
('data scientist', 'data scientisttaboola-תל אביב -יפו, מחוז תל אביבapply nowapply on company sitesave this jobtaboola, the world’s largest content recommendation platform, is hiring a data scientist.\n\nin this job you will:\n\nbe responsible for the entire algorithmic life-cycle in the company: data analytics, prototyping of new ideas, implementing algorithms in a production environment and then monitoring and maintaining them\nturn algorithm prototypes into shippable products that will have a significant and immediate impact on the company’s revenue\nwork on a daily basis with some of the hottest trends in today’s job market: machine/deep learning, big data analytics and cloud computing\napply your scientific knowledge and creativity to analyze large volumes of diverse data and develop algorithms to solve complex problems\ninfluence directly on the way billions of people discover the internet\nwork on projects such as internet personalization, content feed, real time bidding, vide

In [195]:
print(zzz[3])
print()
bagg = break_sentences(zzz[3][1])
temp_words = []
selected = []
for i in bagg:
    print(i)
    temp_words = i.split()
    for word in temp_words:
        word2 = word.replace('/',' ')
        if word2.strip(',') in prog_in:
            print(word2)
            selected.append(i)
            break
        else:
            print("+++++++++++++++++", word2)
            continue

('data scientist student-video indexer team', 'data scientist student-video indexer teammicrosoft-תל אביב -יפו, מחוז תל אביבapply nowapply on company sitesave this jobתל אביב -יפו, מחוז תל אביבvideo indexer data science team has an opening for a student!\nat vi ds team, we research and produce algorithms and models that extract complex insights from all kinds of videos.\nwe are looking for candidates that:enjoy working with dataare hands on with machine learning algorithmsare both creative and thorough when solving problemsare quick and eager to learn new fields, approaches and methodsare able to communicate with product, r&d as well as with customersare truly dedicated to their team\nresponsibilitiesconduct algorithmic r&d in various ml areas (vision, speech, nlp, etc’)work intensively with different types of datawork closely with the engineering team to ensure successful implementation of all algorithms as part of the live product\nqualificationsm.sc. or ph.d. student in cs/ee/physic

In [84]:
# take posting list and count relevant vs. irrelevant
############## how important it is to count relevant vs irrelevant

data_scientist_jobs = defaultdict(list)
#data_analyst_jobs = defaultdict(list)
data_sci_des = []
data_eng_des = []
data_ana_des = []
datasci_related = [] 
non_data_sc = []


for k in posting_list:
    if 'data' in k[0] and (('scientist' in k[0]) or ('science' in k[0])):
        data_sci_des.append(k) 
        if k[0] in data_scientist_jobs.keys():
            data_scientist_jobs[k[0]] += 1
        else:
            data_scientist_jobs[k[0]] = 1
    elif 'data' in k[0] and 'engineer' in k[0]:
        data_eng_des.append(k) 
        if k[0] in data_scientist_jobs.keys():
            data_scientist_jobs[k[0]] += 1
        else:
            data_scientist_jobs[k[0]] = 1
    elif 'data' in k[0] and 'analyst' in k[0]:
        data_ana_des.append(k) 
        if k[0] in data_scientist_jobs.keys():
            data_scientist_jobs[k[0]] += 1
        else:
            data_scientist_jobs[k[0]] = 1
    elif 'data scientist' in k[1]:
        datasci_related.append(k) 
        
    else:
        if k in  non_data_sc:
            continue
        else:
            non_data_sc.append(k) 
        print(k)
        


('applied scientist intern - alexa shopping', 'applied scientist intern - alexa shoppingamazon dev center (tel aviv)-תל אביב -יפו, מחוז תל אביבapply nowapply on company sitesave this jobתל אביב -יפו, מחוז תל אביב\nsecond year msc in computer science, or related field.\nsubmitted a research paper to a top conference.\nstrong cs foundations (data structures and algorithms).\nknowledge of programming languages such as c/c++, java, or python.\ngood writing and verbal english skills.\n\nyou: alexa, i am looking for a summer internship position where i could learn and innovate in ai, and impact the life of millions of people worldwide. it would be great if i could publish about it as well. what do you suggest?\nalexa: the alexa shopping team is looking for interns to help me become the best personal shopping assistant. do you want to hear more?\nyou: yes, please!\nalexa: as an intern in the alexa shopping research team, you will work with top researchers and engineers to explore and devise n

In [85]:
print('data science posting', len(data_sci_des))
print('data engineer posting', len(data_eng_des))
print('data analyst posting', len(data_ana_des))
print('datasci_related', len(datasci_related))
print('non data science', len(non_data_sc))


data science posting 96
data engineer posting 3
data analyst posting 4
datasci_related 45
non data science 11


In [86]:
for h in range(len(non_data_sc)):
    print(h)
    print(non_data_sc[h])
    print()

0
('applied scientist intern - alexa shopping', 'applied scientist intern - alexa shoppingamazon dev center (tel aviv)-תל אביב -יפו, מחוז תל אביבapply nowapply on company sitesave this jobתל אביב -יפו, מחוז תל אביב\nsecond year msc in computer science, or related field.\nsubmitted a research paper to a top conference.\nstrong cs foundations (data structures and algorithms).\nknowledge of programming languages such as c/c++, java, or python.\ngood writing and verbal english skills.\n\nyou: alexa, i am looking for a summer internship position where i could learn and innovate in ai, and impact the life of millions of people worldwide. it would be great if i could publish about it as well. what do you suggest?\nalexa: the alexa shopping team is looking for interns to help me become the best personal shopping assistant. do you want to hear more?\nyou: yes, please!\nalexa: as an intern in the alexa shopping research team, you will work with top researchers and engineers to explore and devise

In [82]:
print(sentence_l)

['associate scientist/scientistcompugen usa-חולון, מחוז תל אביבapply nowapply on company sitesave this jobleading experimental target validation activities according to project wp and timelines. executing experiments relating to target validation activities (research tools generation and qc, expression studies, abs characterization, cell-based assays) from experimental design to final completion including documentation, reporting and presenting\n', 'experience and knowledge in protein purification, qc and analytics (column chromatography, hplc-sec, ms data interpretation, and sds-page)\n', 'highly experienced in cell culture technologies\n', '– over expression systems (lentivirus, retrovirus, transfections, etc.) including single clone generation and characterization, cell sorting\n', '– knock-down/knock out technologies (si\\shrna, crispr)\n', 'molecular biology methods\n', 'biochemical technologies\n', '– western blot, ip, elisa\n', 'cell based in vitro assays\n', '– experience in fa

In [202]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
#scraping code:
for city in city_set:
    for start in range(0, max_results_per_city, 10):
        page = requests.get("https://il.indeed.com/jobs?q=data+scientist&l=tel+aviv&start=" + str(start))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
        for div in soup.find_all(name="div", attrs={"class":"row"}): 
            #specifying row num for index of job posting in dataframe
            #num = (len(sample_df) + 1)
            # think this shouldnt be +1
            num = (len(sample_df))
            #creating an empty list to hold the data for each posting
            job_post = [] 
            #append city name
            job_post.append(city) 
            #grabbing job title
            for a in div.find_all(name="a", attrs={"data-tn-element":"jobTitle"}):
   
                job_post.append(a["title"]) 
            #grabbing company name
            company = div.find_all(name="span", attrs={"class":"company"})
            if len(company) > 0:
                for b in company:
                    job_post.append(b.text.strip()) 
            else:
                sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
                for span in sec_try:
                    job_post.append(span.text) 
            print("this is row 1!!!!")
            print(job_post)
            #grabbing location name
          #  for div in soup.find_all(name="div", attrs={"class":"companyInfoWrapper"}): 
        
                
            c = soup.findAll("span", attrs={'class': "location"})
            for span in c:
                temp1 = 0
                temp1 = span.text
                job_post.append(span.text) 
                print()
                if len(temp1) > 0:
                    break
           
            #grabbing summary text
            d = soup.findAll("span", attrs={"class": "summary"})
            for span in d:
                temp2 = 0
                temp2 = span.text
                job_post.append(span.text.strip())
                if len(temp2) > 0:
                    break
           
            
            print("this is row 2!!!!")
            print(job_post)
            #appending list of job post info to dataframe at index num
            sample_df.iloc[num,:] = job_post
#saving sample_df as a local csv file — define your own local path to save contents 
#sample_df.to_csv("C:\Users\lili\Documents\icode\scraping\ver1.csv", encoding="utf-8")
print(sample_df.head())

# source
* [Web Scraping Job Postings from Indeed](https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b)
* [Scraping Job Posting Data from Indeed using Selenium and BeautifulSoup](https://towardsdatascience.com/scraping-job-posting-data-from-indeed-using-selenium-and-beautifulsoup-dfc86230baac)
