# JobParser

The general strategy is:
    1. Parse the job links on the top 15 pages of Indeed's data science search results and collect the full content of each job posting.
    2. Process the job contents to either extract information or build a dynamic job board for my own use.

In [90]:
from bs4 import BeautifulSoup # For HTML parsing
import urllib.request as ub
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
from nltk.corpus import stopwords # Filter out stopwords, such as 'the', 'or', 'and'
import pandas as pd # For converting results to a dataframe and bar chart plots
import time 
from tqdm import tqdm
%matplotlib inline

Website parsing function

In [10]:
def text_extractor(website):
    '''
    Download a web page and return the distinct lower-cased terms in its text.

    Parameters
    ----------
    website : str
        URL of the page to fetch.

    Returns
    -------
    list of str or None
        The unique terms found in the page text, in arbitrary order (only the
        presence of a term matters, not its count). ``None`` is returned when
        the page cannot be downloaded, so callers must be ready to skip it.
    '''
    try:
        # The context manager closes the connection even if read() fails.
        with ub.urlopen(website) as response:
            site = response.read()
    except Exception:
        # Best effort: the posting may be gone or the connection may fail.
        # Catching Exception (not a bare except) still lets KeyboardInterrupt
        # and SystemExit stop a long scraping run.
        return None

    soupObj = BeautifulSoup(site, "lxml")  # Parse the downloaded HTML

    # Drop <script> and <style> elements so their contents are not counted as terms.
    for element in soupObj(["script", "style"]):
        element.extract()

    text = soupObj.get_text()

    # Keep only letters plus '.', '+' and '3' (so "d3.js" and "c++" survive);
    # every other character becomes a separator.
    text = re.sub("[^a-zA-Z.+3]", " ", text)

    # split() without arguments already collapses runs of whitespace and line
    # breaks, so no separate per-line stripping pass is needed.
    return list(set(text.lower().split()))

In [11]:
# Smoke test: run the extractor on a single hard-coded Indeed job posting.
# NOTE(review): text_extractor returns None when the page cannot be fetched,
# so `sample` may be None if this posting is no longer reachable.
sample = text_extractor('http://www.indeed.com/viewjob?jk=5505e59f8e5a32a4&q=%22data+scientist%22&tk=19ftfgsmj19ti0l3&from=web&advn=1855944161169178&sjdu=QwrRXKrqZ3CNX5W-O9jEvWC1RT2wMYkGnZrqGdrncbKqQ7uwTLXzT1_ME9WQ4M-7om7mrHAlvyJT8cA_14IV5w&pub=pub-indeed')
# sample[:20] # Just show the first 20 words

Grab all the job links on Indeed

In [13]:
# URL of the first results page (start=0) of the Indeed "data science" full-time search.
# NOTE(review): not referenced by the other visible cells; pageGenerator builds
# the same URL itself for page index 0.
webStart = "http://www.indeed.com/jobs?q=data+science&jt=fulltime&fromage=1&start=0&pp="

In [41]:
def pageGenerator(n):
    """
    Build the URLs of the first ``n`` result pages of the Indeed search.

    Consecutive pages differ only in the ``start`` query parameter, which
    advances by 10 per page (0, 10, 20, ...). An ``n`` of zero or less
    yields an empty list.
    """
    front = "http://www.indeed.com/jobs?q=data+science&jt=fulltime&fromage=1&start="
    end = "&pp="
    return [front + str(pageIndex * 10) + end for pageIndex in range(n)]

def completeLink(link):
    """Turn a site-relative path into an absolute Indeed URL by prefixing the host."""
    host = "http://www.indeed.com"
    return host + link

Get all links to job pages on one Indeed results page

In [74]:
def jobLinkCollector(pageAddr):
    """
    Return the absolute job-posting URLs listed on one search results page.

    Parameters
    ----------
    pageAddr : str
        URL of the results page to scan.

    Returns
    -------
    list of str
        One absolute URL per job card that exposes a title link. Cards with
        no title anchor or no ``href`` are skipped rather than aborting the
        whole page.

    Raises
    ------
    Whatever ``urlopen`` raises when the page cannot be downloaded; download
    errors are deliberately not swallowed here.
    """
    # The context manager closes the connection once the body has been read.
    with ub.urlopen(pageAddr) as response:
        site = response.read()
    soupObj = BeautifulSoup(site, "lxml")

    # Job cards are the <div> elements tagged data-tn-component="organicJob".
    jobs = soupObj.find_all("div", attrs={"data-tn-component": "organicJob"})

    jobLink = []
    for job in jobs:
        # find() returns None when a card has no title anchor; guard against
        # that so one malformed card does not raise AttributeError.
        titleAnchor = job.find("a", attrs={"data-tn-element": "jobTitle"})
        href = titleAnchor.get("href") if titleAnchor is not None else None
        if not href:
            continue
        jobLink.append(completeLink(href))

    return jobLink

Collect the job information

In [79]:
def allJobLinks(n):
    """
    Gather the job-posting links from the first ``n`` search result pages.

    Each page URL produced by pageGenerator is handed to jobLinkCollector and
    the per-page results are concatenated in page order.
    """
    collected = []
    for pageUrl in pageGenerator(n):
        collected += jobLinkCollector(pageUrl)
    return collected

In [83]:
# Collect the job links from the first 3 result pages.
# NOTE(review): the notebook introduction mentions 15 pages; this run uses 3.
allLinks = allJobLinks(3)
# Accumulator for the per-job term lists; filled by the scraping loop in a later cell.
infoList = []

In [91]:
# Download and tokenize every collected job posting, one request at a time.
# text_extractor returns None for pages that fail to load, so infoList can
# contain None entries; filter them out before processing.
# NOTE(review): infoList is created in an earlier cell, so re-running only this
# cell appends a second copy of the results.
for link in tqdm(allLinks):
    infoList.append(text_extractor(link))
    time.sleep(1)  # Pause between requests to avoid overwhelming the server

100%|██████████| 30/30 [00:55<00:00,  1.96s/it]
