In [1]:
import argparse
import json
import os
import re
import requests
import sys
import time
import unicodecsv as csv

from flask import Flask
from flask_pymongo import PyMongo
from geopy.geocoders import Nominatim
from lxml import html, etree

# mongoDB setup
app = Flask(__name__)
uri = 'mongodb+srv://gil:bert@cluster0-8wvjx.mongodb.net/project2?retryWrites=true'
mongo = PyMongo(app, uri=uri)

# define main function to retrive job listings based on keyword and place. single varibles or lists can be passed.
def parse(keyword, place):
    headers = {	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate, sdch, br',
                'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
                'referer': 'https://www.glassdoor.com/',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive'
        }

    location_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.01',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
        'referer': 'https://www.glassdoor.com/',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive'
    }
    
    data = {"term": place, "maxLocationsToReturn": 50}
    location_url = "https://www.glassdoor.co.in/findPopularLocationAjax.htm?"
    try:
#        # Getting location id for search location
        print(f"Retrieving job listings for {keyword} in {place}.")
        location_response = requests.post(location_url, headers=location_headers, data=data).json()
        place_id = location_response[0]['locationId']
        job_listing_url = 'https://www.glassdoor.com/Job/jobs.htm'
        # Form data to get job results
        data = {
            'clickSource': 'searchBtn',
            'sc.keyword': keyword,
            'locT': 'C',
            'locId': place_id,
            'jobType': ''
        }

        job_listings = []
        if place_id:
            response = requests.post(job_listing_url, headers=headers, data=data)
            # extracting data from
            # https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=andr&sc.keyword=android+developer&locT=C&locId=1146821&jobType=
            parser = html.fromstring(response.text)
            # Making absolute url 
            base_url = "https://www.glassdoor.com"
            parser.make_links_absolute(base_url)

            XPATH_ALL_JOB = '//li[@class="jl"]'
            XPATH_NAME = './/a/text()'
            XPATH_JOB_URL = './/a/@href'
            XPATH_LOC = './/span[@class="subtle loc"]/text()'
            XPATH_COMPANY = './/div[@class="flexbox empLoc"]/div/text()'
            XPATH_SALARY = './/span[@class="green small"]/text()'

            listings = parser.xpath(XPATH_ALL_JOB)
            for job in listings:
                raw_job_name = job.xpath(XPATH_NAME)
                raw_job_url = job.xpath(XPATH_JOB_URL)
                raw_lob_loc = job.xpath(XPATH_LOC)
                raw_company = job.xpath(XPATH_COMPANY)
                raw_salary = job.xpath(XPATH_SALARY)

                # Cleaning data
                job_name = ''.join(raw_job_name).strip('–') if raw_job_name else None
                job_location = ''.join(raw_lob_loc) if raw_lob_loc else None
                raw_state = re.findall(",\s?(.*)\s?", job_location)
                state = ''.join(raw_state).strip()
                raw_city = job_location.replace(state, '')
                city = raw_city.replace(',', '').strip()
                company = ''.join(raw_company).replace('–','')
                salary = ''.join(raw_salary).strip()
                job_url = raw_job_url[0] if raw_job_url else None

                jobs = {
                    "Name": job_name,
                    "Company": company,
                    "State": state,
                    "City": city,
                    "Salary": salary,
                    "Location": job_location,
                    "Url": job_url
                }
                job_listings.append(jobs)
            return job_listings
        else:
            print("location id not available")

    except:
        print("Failed to load locations")

    if __name__ == "__main__":

#     ''' eg-:python 1934_glassdoor.py "Android developer", "new york" '''

        argparser = argparse.ArgumentParser()
        argparser.add_argument('keyword', help='job name', type=str)
        argparser.add_argument('place', help='job location', type=str)
        args = argparser.parse_args()
        keyword = args.keyword
        place = args.place
        print("Fetching job details")
        scraped_data = parse(keyword, place)
        print("Writing data to output file")

        with open('%s-%s-job-results.csv' % (keyword, place), 'wb')as csvfile:
            fieldnames = ['Name', 'Company', 'State', 'City', 'Salary', 'Location', 'Url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL)
            writer.writeheader()
            if scraped_data:
                for data in scraped_data:
                    writer.writerow(data)
            else:
                print("Your search for %s, in %s does not match any jobs"%(keyword,place))

In [2]:
# declare lists to pass to the parse function
search_job = ['Data Scientist', 'Data Engineer', 'Data Analyst', 'Statistician', 'Data Analytics Manager']
search_city = ['Ann Arbor', 'Atlanta', 'Austin', 'Boston', 'Charlotte', 'Chicago', 'Cincinnati', 'Columbia', 'Dallas', 'Denver', 'Houston', 'Jacksonville', 'Los Angeles', 'Louisville', 'Miami', 'Minneapolis', 'Nashville', 'New York', 'Newport News', 'Phoenix', 'San Diego', 'San Francisco', 'Seattle', 'Virginia Beach']

# test variables to save time
# search_job = ['Data Scientist', 'Data Engineer']
# search_city = ['San Diego', 'Los Angeles']

In [26]:
# multi-nested loops to build a dictionary with job as the primary key, then city as the sub keys for job listings.
# the innermost loop will check listings and filter by job
data = {}

for job in search_job:
    # new dictionary for city
    city_dict = {}
    
    for city in search_city:
        # list to save unique job listings
        filter_list = []
        # load to dictionary with city as the key
#         city_dict[(f'{city}')] = parse(job, city)
        city_dict[city] = parse(job, city)

        for listing in city_dict[city]:
            if job in listing['Name']:
#                 print(listing['Name'])
                for key in listing:
                    if listing[key] == '':
                        listing[key] = '?'
                filter_list.append(listing)
        city_dict[(f'{city}')] = filter_list
    # load to dictionary with job as the key
#     result_dict[(f'{job}')] = city_dict
    data[job] = city_dict

Retrieving job listings for Data Scientist in Ann Arbor.
Retrieving job listings for Data Scientist in Atlanta.
Retrieving job listings for Data Scientist in Austin.
Retrieving job listings for Data Scientist in Boston.
Retrieving job listings for Data Scientist in Charlotte.
Retrieving job listings for Data Scientist in Chicago.
Retrieving job listings for Data Scientist in Cincinnati.
Retrieving job listings for Data Scientist in Columbia.
Retrieving job listings for Data Scientist in Dallas.
Retrieving job listings for Data Scientist in Denver.
Retrieving job listings for Data Scientist in Houston.
Retrieving job listings for Data Scientist in Jacksonville.
Retrieving job listings for Data Scientist in Los Angeles.
Retrieving job listings for Data Scientist in Louisville.
Retrieving job listings for Data Scientist in Miami.
Retrieving job listings for Data Scientist in Minneapolis.
Retrieving job listings for Data Scientist in Nashville.
Retrieving job listings for Data Scientist in

In [27]:
data

{'Data Scientist': {'Ann Arbor': [{'Name': 'Data Scientist',
    'Company': ' AVL Instrumentation & Test Systems  ',
    'State': 'MI',
    'City': 'Plymouth',
    'Salary': '$108k-$147k',
    'Location': 'Plymouth, MI',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=622086&s=149&guid=0000016af18f45e3b5e2bfdf7db245bf&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_a0fe42ed&cb=1558830860520&jobListingId=3214814911'},
   {'Name': 'Data Scientist (Marketing Science)',
    'Company': " Domino's  ",
    'State': 'MI',
    'City': 'Ann Arbor',
    'Salary': '$80k-$90k',
    'Location': 'Ann Arbor, MI',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=359453&s=149&guid=0000016af18f45e3b5e2bfdf7db245bf&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_8e3dde43&cb=1558830860521&jobListingId=3203079211'},
   {'Name': 'Data Scientist',
    'Company': ' Daimler  ',
    'State': 'MI',
    'City': 'Farmington Hills

In [28]:
# count check for job listings
for job in search_job:
    print(job)
    for city in search_city:
        print(city)
        count = 1
        for listing in data[job][city]:
            count+=1
        print(f' Total Listings: {count}')

Data Scientist
Ann Arbor
 Total Listings: 17
Atlanta
 Total Listings: 21
Austin
 Total Listings: 20
Boston
 Total Listings: 23
Charlotte
 Total Listings: 18
Chicago
 Total Listings: 15
Cincinnati
 Total Listings: 21
Columbia
 Total Listings: 1
Dallas
 Total Listings: 23
Denver
 Total Listings: 20
Houston
 Total Listings: 20
Jacksonville
 Total Listings: 22
Los Angeles
 Total Listings: 15
Louisville
 Total Listings: 8
Miami
 Total Listings: 14
Minneapolis
 Total Listings: 21
Nashville
 Total Listings: 15
New York
 Total Listings: 16
Newport News
 Total Listings: 7
Phoenix
 Total Listings: 20
San Diego
 Total Listings: 15
San Francisco
 Total Listings: 25
Seattle
 Total Listings: 18
Virginia Beach
 Total Listings: 12
Data Engineer
Ann Arbor
 Total Listings: 6
Atlanta
 Total Listings: 11
Austin
 Total Listings: 14
Boston
 Total Listings: 11
Charlotte
 Total Listings: 13
Chicago
 Total Listings: 14
Cincinnati
 Total Listings: 6
Columbia
 Total Listings: 1
Dallas
 Total Listings: 7
Denver
 

In [29]:
# data checks
len(data['Data Engineer']['Ann Arbor'])

5

In [30]:
len(data['Statistician']['Columbia'])

2

In [32]:
data['Data Scientist']['New York'][14]

{'Name': 'Data Scientist - Actimize division',
 'Company': ' NICE Actimize  ',
 'State': 'NJ',
 'City': 'Hoboken',
 'Salary': '$93k-$127k',
 'Location': 'Hoboken, NJ',
 'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=128&ao=632918&s=58&guid=0000016af18f94b8aafc7c23bbac7256&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_991b013b&cb=1558830880435&jobListingId=3204399642'}

In [33]:
data['Data Scientist']['Miami'][9]['Url']

'https://www.glassdoor.com/partner/jobListing.htm?pos=117&ao=447365&s=58&guid=0000016af18f879eb97e187c1e2b0440&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_cea83dd9&cb=1558830876997&jobListingId=3193763309'

In [34]:
count = 0
for listing in data['Data Scientist']['Miami']:
    count+=1
    print(count)
    print(listing)

1
{'Name': 'Data Scientist', 'Company': '?', 'State': 'FL', 'City': 'Pembroke Pines', 'Salary': '?', 'Location': 'Pembroke Pines, FL', 'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=512709&s=149&guid=0000016af18f879e9bc467203171444b&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&ea=1&cs=1_f8fbbc32&cb=1558830876980&jobListingId=2855332444'}
2
{'Name': 'Data Scientist', 'Company': '?', 'State': 'FL', 'City': 'Miami', 'Salary': '?', 'Location': 'Miami, FL', 'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=104&ao=85058&s=58&guid=0000016af18f879eb97e187c1e2b0440&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_e05c926a&cb=1558830876984&jobListingId=3221382657'}
3
{'Name': 'Data Engineer / Data Scientist / Data Analyst - Remote', 'Company': '?', 'State': 'FL', 'City': 'Miami', 'Salary': '?', 'Location': 'Miami, FL', 'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=108&ao=85058&s=58&guid=0000016af18f879eb97e187c1e2b

In [35]:
# save to json
with open('glassdoor_data.json', 'w') as fp:
    json.dump(data, fp)

In [None]:
data['Data Architect']

In [36]:
# build a geojson from the city list and save it to geojson.json
feature_list = []
geolocator = Nominatim(country_bias='US', user_agent="job_coordinates")

for city in search_city:
    get_coords = geolocator.geocode(city)
    lat = get_coords.latitude
    lon = get_coords.longitude

    feature_list.append({'type': 'Feature',
                         'geometry': {
                             'type': 'Point',
                             'coordinates': [lon, lat]
                         },
                         'properties': {
                             'title': city.title(),
                             "icon": "monument"
                         }})

geojson = {
    'type': 'FeatureCollection',
    'features': feature_list
}

In [37]:
geojson

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-83.7312291, 42.2681569]},
   'properties': {'title': 'Ann Arbor', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-84.3901849, 33.7490987]},
   'properties': {'title': 'Atlanta', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-97.7436995, 30.2711286]},
   'properties': {'title': 'Austin', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-71.0582912, 42.3602534]},
   'properties': {'title': 'Boston', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-80.8431268, 35.2270869]},
   'properties': {'title': 'Charlotte', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-87.6244212, 41.8755616]},
   'properties': {'title': 'Chicago', 'icon': 'monument'}},
  {'type

In [38]:
lat

36.8529841

In [39]:
# save to geojson
with open('geojson.json','w') as f:
    json.dump(geojson, f)

In [40]:
# upload data to mongoDB
mongo.db.job_listings.update_one({}, {'$set': data}, upsert=True)
mongo.db.geojson.update_one({}, {'$set': geojson}, upsert=True)

<pymongo.results.UpdateResult at 0x1edf6a3b688>

In [41]:
# download data from mongoDB and load to variable
# data = {}
# geojson = {}
data = mongo.db.job_listings.find_one()
geojson = mongo.db.geojson.find_one()

In [42]:
data

{'_id': ObjectId('5ce84975a7a4282b050acc17'),
 'Data Analyst': {'Ann Arbor': [{'Name': 'Data Analyst',
    'Company': '?',
    'State': 'MI',
    'City': 'Ann Arbor',
    'Salary': '?',
    'Location': 'Ann Arbor, MI',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=66506&s=58&guid=0000016af19039a3b6bb80e30563654c&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&ea=1&cs=1_d922da98&cb=1558830922520&jobListingId=3236353110'},
   {'Name': 'Data Analyst',
    'Company': '?',
    'State': 'MI',
    'City': 'Ann Arbor',
    'Salary': '?',
    'Location': 'Ann Arbor, MI',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=155686&s=58&guid=0000016af19039a3b6bb80e30563654c&src=GD_JOB_AD&t=SR&extid=1&exst=OL&ist=&ast=OL&vt=w&slr=true&ea=1&cs=1_c2d8d967&cb=1558830922522&jobListingId=3194851208'},
   {'Name': 'Data Analyst',
    'Company': '?',
    'State': 'MI',
    'City': 'Ann Arbor',
    'Salary': '?',
    'Location': 'Ann Arbor, MI',
  

In [43]:
geojson

{'_id': ObjectId('5ce84975a7a4282b050acc31'),
 'features': [{'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-83.7312291, 42.2681569]},
   'properties': {'title': 'Ann Arbor', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-84.3901849, 33.7490987]},
   'properties': {'title': 'Atlanta', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-97.7436995, 30.2711286]},
   'properties': {'title': 'Austin', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-71.0582912, 42.3602534]},
   'properties': {'title': 'Boston', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-80.8431268, 35.2270869]},
   'properties': {'title': 'Charlotte', 'icon': 'monument'}},
  {'type': 'Feature',
   'geometry': {'type': 'Point', 'coordinates': [-87.6244212, 41.8755616]},
   'properties': {'title': 'Chicago', 'icon': 'monum