In [1]:
from lxml import html, etree
import argparse
import json
import os
import requests
import re
import sys
import time
import unicodecsv as csv

def parse(keyword, place):

    headers = {	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'accept-encoding': 'gzip, deflate, sdch, br',
                'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
                'referer': 'https://www.glassdoor.com/',
                'upgrade-insecure-requests': '1',
                'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive'
        }

    location_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.01',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'en-GB,en-US;q=0.8,en;q=0.6',
        'referer': 'https://www.glassdoor.com/',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/51.0.2704.79 Chrome/51.0.2704.79 Safari/537.36',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive'
    }
    data = {"term": place, "maxLocationsToReturn": 20}

    location_url = "https://www.glassdoor.co.in/findPopularLocationAjax.htm?"
    try:
        time.sleep(7)

        # Getting location id for search location
        print("Fetching location details")
        location_response = requests.post(location_url, headers=location_headers, data=data).json()
        place_id = location_response[0]['locationId']
        job_litsting_url = 'https://www.glassdoor.com/Job/jobs.htm'
        # Form data to get job results
        data = {
            'clickSource': 'searchBtn',
            'sc.keyword': keyword,
            'locT': 'C',
            'locId': place_id,
            'jobType': ''
        }

        job_listings = []
        if place_id:
            response = requests.post(job_litsting_url, headers=headers, data=data)
            # extracting data from
            # https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=true&clickSource=searchBtn&typedKeyword=andr&sc.keyword=android+developer&locT=C&locId=1146821&jobType=
            parser = html.fromstring(response.text)
            # Making absolute url 
            base_url = "https://www.glassdoor.com"
            parser.make_links_absolute(base_url)

            XPATH_ALL_JOB = '//li[@class="jl"]'
            XPATH_NAME = './/a/text()'
            XPATH_JOB_URL = './/a/@href'
            XPATH_LOC = './/span[@class="subtle loc"]/text()'
            XPATH_COMPANY = './/div[@class="flexbox empLoc"]/div/text()'
            XPATH_SALARY = './/span[@class="green small"]/text()'

            listings = parser.xpath(XPATH_ALL_JOB)
            for job in listings:
                raw_job_name = job.xpath(XPATH_NAME)
                raw_job_url = job.xpath(XPATH_JOB_URL)
                raw_lob_loc = job.xpath(XPATH_LOC)
                raw_company = job.xpath(XPATH_COMPANY)
                raw_salary = job.xpath(XPATH_SALARY)

                # Cleaning data
                job_name = ''.join(raw_job_name).strip('–') if raw_job_name else None
                job_location = ''.join(raw_lob_loc) if raw_lob_loc else None
                raw_state = re.findall(",\s?(.*)\s?", job_location)
                state = ''.join(raw_state).strip()
                raw_city = job_location.replace(state, '')
                city = raw_city.replace(',', '').strip()
                company = ''.join(raw_company).replace('–','')
                salary = ''.join(raw_salary).strip()
                job_url = raw_job_url[0] if raw_job_url else None

                jobs = {
                    "Name": job_name,
                    "Company": company,
                    "State": state,
                    "City": city,
                    "Salary": salary,
                    "Location": job_location,
                    "Url": job_url
                }
                job_listings.append(jobs)
            return job_listings
        else:
            print("location id not available")

    except:
        print("Failed to load locations")

    if __name__ == "__main__":

#     ''' eg-:python 1934_glassdoor.py "Android developer", "new york" '''

        argparser = argparse.ArgumentParser()
        argparser.add_argument('keyword', help='job name', type=str)
        argparser.add_argument('place', help='job location', type=str)
        args = argparser.parse_args()
        keyword = args.keyword
        place = args.place
        print("Fetching job details")
        scraped_data = parse(keyword, place)
        print("Writing data to output file")

        with open('%s-%s-job-results.csv' % (keyword, place), 'wb')as csvfile:
            fieldnames = ['Name', 'Company', 'State',
                          'City', 'Salary', 'Location', 'Url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL)
            writer.writeheader()
            if scraped_data:
                for data in scraped_data:
                    writer.writerow(data)
            else:
                print("Your search for %s, in %s does not match any jobs"%(keyword,place))

In [2]:
# test variables to save time
search_job = ['data scientist', 'data engineer']
search_city = ['san diego', 'los angeles']

# search_job = ['data scientist', 'data engineer', 'data analyst']
# search_city = ['san diego', 'los angeles', 'san francisco', 'denver', 'austin', 'new york']
result_dict = {}

for job in search_job:
    # new dictionary for city
    city_dict = {}
    
    for city in search_city:
        # load to dictionary with city as the key
        city_dict[(f'{city}')] = parse(job, city)
        
    # load to dictionary with job as the key
    result_dict[(f'{job}')] = city_dict

Fetching location details
Fetching location details
Fetching location details
Fetching location details


In [3]:
len(result_dict)

2

In [4]:
result_dict.keys()

dict_keys(['data scientist', 'data engineer'])

In [5]:
result_dict

{'data scientist': {'san diego': [{'Name': 'Data Scientist',
    'Company': ' Dexcom  ',
    'State': 'CA',
    'City': 'San Diego',
    'Salary': '$95k-$134k',
    'Location': 'San Diego, CA',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=296910&s=149&guid=0000016ad69398078266b94020925512&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_cba900ae&cb=1558378158749&jobListingId=3129909498'},
   {'Name': 'Senior Data Scientist',
    'Company': ' Tealium  ',
    'State': 'CA',
    'City': 'San Diego',
    'Salary': '$143k-$191k',
    'Location': 'San Diego, CA',
    'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=642403&s=149&guid=0000016ad69398078266b94020925512&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_8f67c27b&cb=1558378158751&jobListingId=3232692044'},
   {'Name': 'Machine Learning Engineer - Data Analysis',
    'Company': ' Samsung Semiconductor, Inc.  ',
    'State': 'CA',
    'City': 'San Dieg

In [6]:
result_dict['data scientist'].keys()

dict_keys(['san diego', 'los angeles'])

In [7]:
result_dict['data engineer'].keys()

dict_keys(['san diego', 'los angeles'])

In [8]:
result_dict['data analyst'].keys()

KeyError: 'data analyst'

In [9]:
result_dict['data scientist']['san diego']

[{'Name': 'Data Scientist',
  'Company': ' Dexcom  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '$95k-$134k',
  'Location': 'San Diego, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=296910&s=149&guid=0000016ad69398078266b94020925512&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_cba900ae&cb=1558378158749&jobListingId=3129909498'},
 {'Name': 'Senior Data Scientist',
  'Company': ' Tealium  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '$143k-$191k',
  'Location': 'San Diego, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=642403&s=149&guid=0000016ad69398078266b94020925512&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_8f67c27b&cb=1558378158751&jobListingId=3232692044'},
 {'Name': 'Machine Learning Engineer - Data Analysis',
  'Company': ' Samsung Semiconductor, Inc.  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '$99k-$134k',
  'Location': 'San Diego, CA',
  'Url

In [10]:
result_dict['data scientist']['los angeles']

[{'Name': 'Data Scientist Senior',
  'Company': ' Capital Group  ',
  'State': 'CA',
  'City': 'Los Angeles',
  'Salary': '$115k-$155k',
  'Location': 'Los Angeles, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=360325&s=149&guid=0000016ad693b93da338f235b017c77e&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_5bd46239&cb=1558378167211&jobListingId=3136656764'},
 {'Name': 'Data Scientist',
  'Company': ' Thrive Market  ',
  'State': 'CA',
  'City': 'Los Angeles',
  'Salary': '$106k-$146k',
  'Location': 'Los Angeles, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=556693&s=149&guid=0000016ad693b93da338f235b017c77e&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_cfe87e4d&cb=1558378167213&jobListingId=3221727527'},
 {'Name': 'Data Scientist',
  'Company': ' Rubicon Project  ',
  'State': 'CA',
  'City': 'Los Angeles',
  'Salary': '$116k-$157k',
  'Location': 'Los Angeles, CA',
  'Url': 'https://

In [11]:
result_dict['data scientist']['san francisco']

KeyError: 'san francisco'

In [12]:
result_dict['data scientist']['denver']

KeyError: 'denver'

In [13]:
result_dict['data scientist']['austin']

KeyError: 'austin'

In [14]:
result_dict['data scientist']['new york']

KeyError: 'new york'

In [15]:
result_dict['data engineer']['san diego']

[{'Name': 'Data Engineer',
  'Company': ' LeadCrunch  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '$125k-$155k',
  'Location': 'San Diego, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=101&ao=535387&s=149&guid=0000016ad693d9d09ad14d33216866a9&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_268dba92&cb=1558378175734&jobListingId=2996249774'},
 {'Name': 'Machine Learning Engineer - Data Analysis',
  'Company': ' Samsung Semiconductor, Inc.  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '$99k-$134k',
  'Location': 'San Diego, CA',
  'Url': 'https://www.glassdoor.com/partner/jobListing.htm?pos=102&ao=630701&s=149&guid=0000016ad693d9d09ad14d33216866a9&src=GD_JOB_AD&t=SRFJ&extid=4&exst=OL&ist=&ast=OL&vt=w&slr=true&cs=1_0f324041&cb=1558378175736&jobListingId=3223997059'},
 {'Name': 'Front End Engineer',
  'Company': ' foresee medical, inc.  ',
  'State': 'CA',
  'City': 'San Diego',
  'Salary': '',
  'Location': 'San Diego, CA',
  

In [16]:
result_dict['data analyst']['new york']

KeyError: 'data analyst'

In [17]:
from flask import Flask
from flask_pymongo import PyMongo

app = Flask(__name__)
uri = 'mongodb+srv://gil:bert@cluster0-8wvjx.mongodb.net/project2?retryWrites=true'
mongo = PyMongo(app, uri=uri)

mongo.db.housing.update({}, result_dict, upsert=True)

  


{'n': 1,
 'nModified': 0,
 'upserted': ObjectId('5ce2f713a7a4282b0589ce41'),
 'opTime': {'ts': Timestamp(1558378259, 2), 't': 2},
 'electionId': ObjectId('7fffffff0000000000000002'),
 'ok': 1.0,
 'operationTime': Timestamp(1558378259, 2),
 '$clusterTime': {'clusterTime': Timestamp(1558378259, 2),
  'signature': {'hash': b'\xfb|\xb7\xef\x93\xbd3>U\xcc\xc4mc.\xd0j\xf9\\\xf6b',
   'keyId': 6681343450274594817}},
 'updatedExisting': False}

In [None]:
# save to json
import json

with open('data.json', 'w') as fp:
    json.dump(result_dict, fp)

In [None]:
# retrieve data from mongoDB and load to variable
from flask import Flask
from flask_pymongo import PyMongo

app = Flask(__name__)
uri = 'mongodb+srv://gil:bert@cluster0-8wvjx.mongodb.net/project2?retryWrites=true'
mongo = PyMongo(app, uri=uri)

data = mongo.db.housing.find_one()

In [None]:
data