# Imports


In [51]:
import pandas as pd
import requests
import json
import html
from datetime import date
import csv
from time import sleep
import re

In [3]:
filename ="../greenhouse_companies.csv"
 
# opening the file using "with"
# statement

with open(filename, 'r') as data:
  companies={row[0]:row[1] for row in csv.reader(data)}


In [4]:
def get_companies(filename: str ='companies.csv') -> dict:
    with open(filename, 'r') as data:
        companies = {row[0]:row[1] for row in csv.reader(data)}
    companies_clean = companies
    companies_bad = {}
    tokens = set(companies.keys())
    for token in tokens:
        if not requests.get(f'https://boards-api.greenhouse.io/v1/boards/{token}/jobs'):
            print(token)
            companies_clean.pop(token)
            companies_bad[token] = companies.get(token)


In [5]:
tokens = set(companies.keys())

## Check for bad job tokens

In [6]:
# No output == good
# Output == company api not found
'''
for company in tokens:
    if not requests.get(f'https://boards-api.greenhouse.io/v1/boards/{company}/jobs'):
      print(company)
      
      '''

"\nfor company in tokens:\n    if not requests.get(f'https://boards-api.greenhouse.io/v1/boards/{company}/jobs'):\n      print(company)\n      \n      "

## Job criteria
Finds job titles that match criteria.    
Roles is for job role (engineer, developer, etc), level is for expereince level keywords ('junior', 'associate',etc), exclude is for tersm you don't want ('senior', etc.). Should be lower case.

In [7]:
roles = {'developer','engineer', 'frontend', 'software', 'apprentice', 'apprenticeship', 'front-end', 'backend', 'back-end', 'jr.', 'jr' }
# Includes 'software' for titles that are just 'software enginer', etc.
levels = {'junior', 'entry-level', 'grad', 'graduate', 'apprentice', 'apprenticeship', 'software', 'entry', 'intern', 'i', '1', 'associate'}
# Optional, but helps exclude higher level postiions
exclude = {'senior', 'principal' , 'sr.', 'ii', 'iii' }

# Main Code
Finds jobs and outputs results as csv or excel file

In [8]:
# Return a list of elligble jobs from a set of companies
def find_jobs(companies : dict, roles : set , levels : set , exclude : set = {}) -> list:
  
  tokens = set(companies.keys())
  results = []

  for company in tokens:
    res=requests.get(f'https://boards-api.greenhouse.io/v1/boards/{company}/jobs')
    if res:
      jobs=json.loads(res.text).get('jobs')

      if jobs:
        for job in jobs:
            title=set(job.get('title').lower().split())
            if title.intersection(roles) and title.intersection(levels) and not title.intersection(exclude) :
              job['token'] = company
              job['company'] = companies.get(company)
              results.append(job)

  return results

  

In [65]:
def get_location(locations: list[str]) -> list:
    output = []
    for location in locations:
        if re.search('remote|anywhere|everywhere',location.strip().lower()):
            continue
        res = requests.get(f'https://geocode.maps.co/search?q={location}')
        try:
            data = res.json()[0]
        except:
            continue
        coordinates = [data.get('lon'),data.get('lat')]
        output.append({'name': location, 'point' : {'type' : 'Point', 'coordinates' : coordinates}})
        sleep(0.55)
    return output
    

In [52]:
res = requests.get("https://geocode.maps.co/search?q=columbus")
print(res.json()[0])

{'place_id': 287393305, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'powered_by': 'Map Maker: https://maps.co', 'osm_type': 'relation', 'osm_id': 182706, 'boundingbox': ['39.8086936', '40.1573082', '-83.2101797', '-82.7713119'], 'lat': '39.9622601', 'lon': '-83.0007065', 'display_name': 'Columbus, Franklin County, Ohio, United States', 'class': 'boundary', 'type': 'administrative', 'importance': 0.7294399106820546}


In [64]:
print(re.search('remote|anywhere|everywhere','everywhere canada'))

<re.Match object; span=(0, 10), match='everywhere'>


In [66]:
# Get job details for eligible jobs and return a dataframe with the job data
def get_details(results: list ) -> pd.DataFrame: 
  data = []
  for job in results:
    url = f'https://boards-api.greenhouse.io/v1/boards/{job.get("token")}/jobs/{job.get("id")}'
    res = requests.get(url)

    if res:
      job_detail = json.loads(res.text)
      job_info = {'title': job_detail.get('title'), 'company' : job.get('company'), 'link': job_detail.get('absolute_url'), 
                  'description' : html.unescape(job_detail.get('content')), 'date' : job_detail.get('updated_at'), 
                  'remote' : None, 'greenhouse_id' : job.get('id'), 'greenhouse_api_url' : url }
      
      if job_detail.get('offices'):
        job_info['location'] = [office['location'] for office in job_detail.get('offices')]
        
        if None in job_info['location']:
            job_info['location'] = [office['name'] for office in job_detail.get('offices')]

        job_info['geoJSON'] = get_location(job_info['location'])
      data.append(job_info)

  df = pd.DataFrame()
  df = df.from_records(data)

  return df

In [67]:
# Driver code

results = find_jobs(companies, roles, levels, exclude)

data = get_details(results)

df = pd.DataFrame()
df = df.from_records(data)


In [69]:
df.location.value_counts()

[San Francisco, California, United States]                                             34
[Mountain View, California, United States]                                             26
[Remote]                                                                               18
[Remote Canada, Remote US]                                                             10
[San Mateo, CA, USA]                                                                    9
                                                                                       ..
[Los Angeles, California, United States]                                                1
[New York, NY, United States]                                                           1
[Denver, Colorado, United States, Remote, San Francisco, California, United States]     1
[Sydney, New South Wales, Australia]                                                    1
[Remote Canada]                                                                         1
Name: loca

In [71]:
df[['location']]

Unnamed: 0,location
0,"[San Mateo, CA, USA]"
1,"[San Mateo, CA, USA]"
2,"[San Mateo, CA, USA]"
3,"[San Mateo, CA, USA]"
4,"[Paris, France]"
...,...
236,[Remote]
237,[Remote]
238,[Remote]
239,[Remote]


## Job links

In [None]:
f'Found {len(df.link)} jobs!'

'Found 226 jobs!'

In [None]:
# Job links
df.link

0      https://www.tempus.com/careers/job/?gh_jid=656...
1      https://www.tempus.com/careers/job/?gh_jid=436...
2      https://www.tempus.com/careers/job/?gh_jid=656...
3      https://www.tempus.com/careers/job/?gh_jid=666...
4      https://www.tempus.com/careers/job/?gh_jid=666...
                             ...                        
221    https://boards.greenhouse.io/liveperson/jobs/4...
222    https://boards.greenhouse.io/imprint/jobs/4722...
223             https://ample.com/jobs?gh_jid=4000768005
224             https://ample.com/jobs?gh_jid=4224193005
225             https://ample.com/jobs?gh_jid=4169834005
Name: link, Length: 226, dtype: object