In [6]:
# Idea: treat each HTML tag (e.g. <p> or </p>) as a single character for the NN to learn.
# Idea: replace each entire "<ul> ... </ul>" list with a single "^"; maybe the RNN can learn that behavior.
# Later, in the generated text, replace each "^" with a list.

# Indeed Webscraper
This web scraper is specific to [indeed.com](https://www.indeed.com); if more data is needed, we'll move on to monster.com and other sites afterwards.

Inputs: job titles to scrape on indeed
Outputs: data that's ready to be trained/tested on

### Plan:
1. Get entire relevant job info and save first. We'll parse later.
    * Get "Machine Learning Engineer", "Data Scientist", and "Business Analyst".
    * Header
        * Job title
        * Company
        * Location (City, State, Country)
        * Salary
    * Body
        * Entire body, raw HTML
2. Parse info (gotta think)
    * Need to subdivide
    * Bullet points are written as `-`, HTML lists, \*, or sometimes just line breaks.
3. Use a Markov chain generator on the title dataset

In [12]:
import requests as r
import math
import re
import os
from collections import Counter
from bs4 import BeautifulSoup as bs
import json
from datetime import datetime
import utils

In [6]:
# Indeed endpoints and query-string fragments that are concatenated into
# request URLs.
_INDEED_PAGE = "https://www.indeed.com/{}"

URL = _INDEED_PAGE.format("jobs?q=")           # search results, query appended
VIEW_JOB = _INDEED_PAGE.format("viewjob?jk=")  # single posting, job key appended
SEARCH_LOC = "&l=" + "united+states"  # can change this
QUERY_PREFIX = "&start="              # result offset is appended to this

In [58]:
# Dedupe registry read and updated by get_posts: the (title, company) pairs
# that have already been collected.
# TODO: Change this back to a plain `seen = set()` later.
# `posts` (pairs rebuilt from previously saved JSON) is only created by a cell
# further down, so it does not exist on a fresh kernel; start empty in that case.
try:
    seen = set(posts)  # copy, so scraping never mutates `posts` itself
except NameError:
    seen = set()

def get_posts(query_job, num_posts, max_empty_pages=5, timeout=30):
    """Scrape Indeed job postings for one search query.

    Walks the search result pages for ``query_job``, opens every listed
    posting and collects its header fields plus the raw description HTML.
    Postings whose ``(title, company)`` pair is already in the module-level
    ``seen`` set are skipped (counted as "duds"); new pairs are added to
    ``seen`` as a side effect.

    Each returned dict has the form::

        {"title": str,
         "company": str,
         "location": str,
         "salary": str,
         "description": str (raw html)}

    Every value is passed through ``str``, so a field that could not be found
    is stored as the literal string ``"None"``.

    :param query_job: title of job to search (with "+"s instead of spaces)
    :param num_posts: the number of job posts to scrape
    :param max_empty_pages: stop after this many consecutive result pages that
        contain no parsable posting (empty result list, blocked page, changed
        markup) instead of requesting pages forever
    :param timeout: seconds to wait for each HTTP response
    :return: list of dictionaries containing job info; may hold fewer than
        ``num_posts`` entries when the results run out or only duplicates remain
    """

    jobs = []
    page_index = 0   # result pages are addressed as start=0, 10, 20, ...
    num_scraped = 0
    dud_counter = 0  # consecutive postings that were already in `seen`
    empty_pages = 0  # consecutive result pages without any parsable posting

    while num_scraped < num_posts:
        # get search results
        page = r.get(
            URL + query_job + SEARCH_LOC + QUERY_PREFIX + str(page_index * 10),
            timeout=timeout,
        )
        soup = bs(page.text, "lxml")
        page_index += 1
        page_had_posts = False

        for item in soup.find_all("div", {"class": "unifiedRow"}):
            # after a long run of duplicates, jump further ahead in the results
            # NOTE(review): this fires once per listed row, not once per page
            if dud_counter > 30:
                page_index += 20

            # break if you got enough posts (or nothing but duplicates is left)
            if num_scraped >= num_posts or dud_counter > 100:
                return jobs

            if num_scraped % 500 == 0 and num_scraped > 0 and dud_counter == 0:
                print("Processed ", num_scraped, " of ", query_job)
            if dud_counter == 50:
                print("Num duds for", query_job, ": 50")

            # enter subpage
            job_id = item.get("data-jk")
            if not job_id:
                continue  # row without a job key cannot be opened
            post_raw = r.get(VIEW_JOB + job_id, timeout=timeout)
            post = bs(post_raw.text, "lxml")
            head = post.find("div", class_="jobsearch-DesktopStickyContainer")

            # extract data; skip postings whose header or title is missing
            title_tag = head.find("h3") if head is not None else None
            if title_tag is None:
                continue
            page_had_posts = True

            title = str(title_tag.text)
            company_tag = head.find("a")
            # Stringify before the duplicate check so the key has the same
            # shape as keys rebuilt from saved JSON, where a missing company
            # is the string "None".
            company = str(company_tag.text if company_tag is not None else None)

            # skip if it's the same company/job posted
            key = (title, company)
            if key in seen:
                dud_counter += 1
                continue
            seen.add(key)

            # location is the last <div> inside the company-rating container
            location = None
            rating = head.find("div", class_="jobsearch-InlineCompanyRating")
            if rating is not None:
                rating_parts = rating.find_all("div")
                if rating_parts:
                    location = rating_parts[-1].text

            salary_tag = head.find("div", class_="jobsearch-JobMetadataHeader-item")
            salary = salary_tag.text if salary_tag is not None else None
            description = post.find("div", class_="jobsearch-jobDescriptionText")

            jobs.append(
                {
                    "title": title,
                    "company": company,
                    "location": str(location),
                    "salary": str(salary),
                    "description": str(description),
                }
            )
            # reset dud counter and increase num_scraped
            dud_counter = 0
            num_scraped += 1

        # A page that produced nothing parsable makes no progress towards any
        # exit condition above, so bound how many of those are tolerated.
        if page_had_posts:
            empty_pages = 0
        else:
            empty_pages += 1
            if empty_pages >= max_empty_pages:
                print("No parsable results for", query_job, "- stopping early")
                return jobs
    return jobs

In [59]:
# job titles to mine on (the commented-out queries were presumably scraped in
# an earlier run)
titles = [
#     "machine+learning+engineer",
#           "data+scientist",
#           "big+data",
#           "data+engineer",
          "data+analyst",
          "business+analyst",
          "ai+research+scientist"]

for title in titles:
    # Keep the scraped list under its own name: `posts` is the set of
    # (title, company) pairs that seeds `seen`, and rebinding it to a list of
    # dicts would break that cell on a re-run.
    scraped_posts = get_posts(title, 2000)
    # one file per query, suffixed with the current UTC month-day
    name = title + "+" + datetime.utcnow().strftime('%m-%d') + ".json"
    utils.save_json(scraped_posts, name)
    print("processed: " + title)
print("done")

Processed  500  of  data+analyst
Num duds for data+analyst : 50
Num duds for data+analyst : 50
Num duds for data+analyst : 50
Num duds for data+analyst : 50
Num duds for data+analyst : 50
Num duds for data+analyst : 50
processed: data+analyst
Processed  500  of  business+analyst
Num duds for business+analyst : 50
processed: business+analyst
Processed  500  of  ai+research+scientist
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
Num duds for ai+research+scientist : 50
processed: ai+research+scientist
done


2499

In [51]:
# # Save to json, so I don't have to repeat mining
# utils.save_json(ml_posts, ml_eng + "+5000.json")

In [51]:
# Reload the previously saved postings (files with the 11-09 suffix) so the
# scrape does not have to be repeated.
data_scientist, data_engineer, big_data, ml_eng = map(
    utils.load_json,
    (
        "data+scientist+11-09.json",
        "data+engineer+11-09.json",
        "big+data+11-09.json",
        "machine+learning+engineer+11-09.json",
    ),
)

In [52]:
# Every (title, company) pair found in the reloaded datasets.
posts = {
    (entry['title'], entry['company'])
    for dataset in (data_scientist, data_engineer, big_data, ml_eng)
    for entry in dataset
}

In [60]:
def split(jobs):
    """ Split the job descriptions into respective datasets

    For every post the wrapping ``<div ...>`` / ``</div>`` pair of the
    description is removed, then:

    * each ``<p>`` paragraph is classified as a title, a bullet, or body text;
    * if the description contains ``<ul>`` lists, every ``<li>`` item is added
      to the bullets; otherwise the whole description, with all tags stripped,
      is added to the body.

    Posts without a description (missing, empty, or the literal string
    ``"None"``) are skipped.

    :param jobs: the output from get_posts
    :return body: list of body text strings
            bullets: list of individual bullet points from each post
            titles: list of paragraph/list titles
    """

    body = []
    bullets = []
    titles = []
    for post in jobs:
        description = post.get("description")
        # a description that was not found is stored as str(None) == "None"
        if not description or description == "None":
            continue

        # first remove the wrapping div tags, whatever their attributes are
        text = re.sub(r"^\s*<div[^>]*>", "", description, count=1)
        text = re.sub(r"</div>\s*$", "", text, count=1)

        # get titles and paragraphs; DOTALL so that elements spanning several
        # lines are matched too
        for paragraph in re.findall(r"<p>\s*(.*?)\s*</p>", text, flags=re.DOTALL):
            if "<br/>" in paragraph:
                # first line is treated as the title, the rest as body text
                title_and_body = paragraph.split("<br/>")
                titles.append(title_and_body[0])
                body.extend(title_and_body[1:])
                continue
            if "·" in paragraph:
                # drop the bullet character only when it really is leading
                if paragraph.startswith("·"):
                    paragraph = paragraph[1:]
                bullets.append(paragraph)
                continue
            if len(paragraph) < 60:
                # short paragraphs ending in ":" or a closing bold tag are titles
                if ":" in paragraph:
                    if paragraph[-1] == ":" or paragraph[-4:] == "</b>":
                        titles.append(paragraph)
                        continue
                if "<b>" in paragraph:
                    titles.extend(
                        re.findall(r"<b>\s*(.*?)\s*</b>", paragraph, flags=re.DOTALL)
                    )
                    continue
            body.append(paragraph)

        # get all bullet lists if any
        if "<ul>" in text:
            for item in re.findall(r"<ul>\s*(.*?)\s*</ul>", text, flags=re.DOTALL):
                bullets.extend(
                    re.findall(r"<li>\s*(.*?)\s*</li>", item, flags=re.DOTALL)
                )
        else:
            # no lists: keep the whole description as body text, tags removed
            body.append(re.sub('<[^<]+?>', '', text))

    return body, bullets, titles

In [61]:
# Split the scraped postings into body / bullet / title corpora.
# NOTE(review): `ml_posts` is not assigned in any visible cell (it only appears
# in a commented-out line), so this fails on a fresh kernel — presumably the
# reloaded `ml_eng` postings are meant here; confirm before re-running.
body_list, bullet_list, title_list = split(ml_posts)

In [42]:
# # get distribution of paragraph lengths (for generator later)
# body_lens = [len(s) for s in body_list if len(s) > 1]
# # print(max(body_lens))
# longest_s = ""
# for b in body_list:
#     if len(b) > len(longest_s):
#         longest_s = b
# print(longest_s)

The goal of a research engineer at scale is to bring techniques in the fields of computer vision, deep learning and deep reinforcement learning, or natural language processing into a production environment to improve scale.ai ’s products and customer experience. Our research engineers take advantage of our unique access to massive datasets to deliver improvements to our customers.
<br/>We are building a large hybrid human-machine system in service of ML pipelines for dozens of industry-leading customers. We currently complete millions of tasks a month, and will grow to complete billions of tasks monthly. As a Research Engineer, you will:
<br/>Take state of the art models developed internally and from the community, use them in production to solve problems for our customers and taskers.
<br/>Take models currently in production, identify areas for improvement, improve them using retraining and hyperparemeter searches, then deploy without regressing on core model characteristics
<br/>Work

In [62]:
# Write each of the three corpora to its own text file.
for corpus, filename in (
    (body_list, '5000_body_list.txt'),
    (bullet_list, '5000_bullet_list.txt'),
    (title_list, '5000_title_list.txt'),
):
    utils.save_list_as_corpus(corpus, filename)