https://bulletbyte.weebly.com/tech/how-to-scrape-a-companys-glassdoor-reviews-using-python

https://www.glassdoor.com/Reviews/Salesforce-Reviews-E11159.htm

In [1]:
#import the libraries
import os
import time

import numpy as np
import pandas as pd
import math

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [2]:
#create a function to scrape any Glassdoor company review page
#the code still worked when I ran it on 7 Sep 2021, but the HTML of Glassdoor web pages changes all the time
#please inspect the web page and update the HTML tags/classes below if any of the lists comes back empty

def review_scraper(url):
    """Scrape one Glassdoor review page into a DataFrame.

    The page at *url* is downloaded with a browser-like User-Agent and
    parsed with BeautifulSoup. Each review field is collected page-wide by
    its tag/class (or ``data-test`` attribute) and the resulting lists are
    zipped together positionally, one row per review.

    Args:
        url: Address of a single Glassdoor review page.

    Returns:
        pandas.DataFrame with the columns ``Summary``, ``Date``,
        ``JobTitle``, ``AuthorLocation``, ``OverallRating``, ``Pros`` and
        ``Cons``. ``Date`` and ``JobTitle`` are stripped of surrounding
        whitespace; ``JobTitle`` is ``''`` when the page gives no title.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
        ValueError: If a rating element does not contain a number.

    Warns:
        UserWarning: If the per-field lists differ in length. Rows are then
            truncated to the shortest list and may be misaligned (for
            example when some reviews on the page omit the location).
    """
    import warnings

    #scraping the web page content
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)
    #the response is closed as soon as the page has been parsed
    with urlopen(req) as page:
        soup = BeautifulSoup(page, "html.parser")

    def texts(name, attrs):
        #text of every <name> element matching attrs, in document order
        return [tag.text for tag in soup.find_all(name, attrs)]

    def is_location_span(tag):
        #only spans whose class is exactly "middle"; a plain class search
        #for "middle" would also return the date/job-title spans, because
        #they carry "middle" as one of their two classes
        return tag.name == 'span' and tag.get('class') == ['middle']

    #get the Summary
    Summary = texts('h2', {'class': 'mb-xxsm mt-0 css-93svrw el6ke055'})

    #get the Posted Date and Job Title (one combined "date - title" string)
    Date_n_JobTitle = texts(
        'span', {'class': 'middle common__EiReviewDetailsStyle__newGrey'})

    #split at the first ' -' only, so a title that itself contains ' -'
    #stays whole and a missing title yields '' instead of an IndexError
    Date = []
    JobTitle = []
    for combined in Date_n_JobTitle:
        posted, _, title = combined.partition(' -')
        Date.append(posted.strip())
        JobTitle.append(title.strip())

    #get Author Location
    AuthorLocation = [tag.text for tag in soup.find_all(is_location_span)]

    #get Overall Rating
    OverallRating = [
        float(rating)
        for rating in texts('span', {'class': 'ratingNumber mr-xsm'})
    ]

    #get Pros
    Pros = texts('span', {'data-test': 'pros'})

    #get Cons
    Cons = texts('span', {'data-test': 'cons'})

    columns = ['Summary', 'Date', 'JobTitle', 'AuthorLocation',
               'OverallRating', 'Pros', 'Cons']
    fields = [Summary, Date, JobTitle, AuthorLocation,
              OverallRating, Pros, Cons]

    #zip() stops at the shortest list, so make a length mismatch visible
    lengths = {name: len(values) for name, values in zip(columns, fields)}
    if len(set(lengths.values())) > 1:
        warnings.warn(
            "review_scraper: fields have different lengths {} for {}; rows "
            "are truncated to the shortest list and may be misaligned"
            .format(lengths, url))

    #putting everything together
    Reviews = pd.DataFrame(list(zip(*fields)), columns=columns)

    return Reviews

<h2 class="mb-xxsm mt-0 css-93svrw el6ke055"><a href="/Reviews/Employee-Review-Salesforce-RVW51057878.htm" class="reviewLink">Amazing!</a></h2>
<span class="middle common__EiReviewDetailsStyle__newGrey">Nov 30, 2020 - Account Executive- Core Team</span>
<span class="middle">in <span>San Francisco, CA</span></span>
<span class="ratingNumber mr-xsm">3.0</span>
<span data-test="pros">- Benefits are top notch
- Perks in the tower and holiday party are impressive
- Sales tactics and strategies are great for growth even as an experienced rep
- You’ll meet very talented sales rep with a wide variance of styles
-ESPP
- generous maternity/paternity leave. Although this will affect your likelihood of be promoted</span>

In [None]:
from urllib.error import URLError

#paste/replace the url to the first page of the company's Glassdoor review in between the ""
#(a trailing ".htm" and any "?filter..." part are handled below)
input_url="https://www.glassdoor.com/Reviews/Salesforce-Reviews-E11159.htm?filter.iso3Language=eng"

#split the pasted url into its stem (without ".htm") and the filters it carries,
#so page suffixes and sort options can be inserted in the right place
base_url, _, extra_query = input_url.partition('?')
if base_url.endswith('.htm'):
    base_url = base_url[:-len('.htm')]
query = "?sort.sortType=RD&sort.ascending=false"
if extra_query:
    query = query + "&" + extra_query

def page_url(page_number):
    """Return the url of the given 1-based review page, using the sort options above."""
    #page 1 has no suffix; later pages are "<stem>_P<n>.htm"
    suffix = "" if page_number == 1 else "_P" + str(page_number)
    return base_url + suffix + ".htm" + query

#scraping the first page content
hdr = {'User-Agent': 'Mozilla/5.0'}
req = Request(page_url(1), headers=hdr)
with urlopen(req) as page:
    soup = BeautifulSoup(page, "html.parser")

#check the total number of reviews
footer = soup.find('div', {'data-test':'pagination-footer-text'})
if footer is None:
    #the footer tag changed or the page was blocked; fall back to the manual limit below
    countReviews = None
    countPages = None
    print("pagination footer not found - page count unknown")
else:
    countReviews = int(footer.text.split(' Reviews')[0].split('of ')[1].replace(',',''))
    #calculate the max number of pages (assuming 10 reviews a page)
    countPages = math.ceil(countReviews/10)
    print("reviews:", countReviews, "pages:", countPages)

#upper limit on the number of pages to scrape; set it to 3 for a quick test run
pageLimit = 1770
#never request more pages than the site reports
if countPages is None:
    maxPage = pageLimit + 1
else:
    maxPage = min(pageLimit, countPages) + 1

#scraping multiple pages of company glassdoor review
#each page's frame is collected in a list and concatenated once at the end
#(DataFrame.append no longer exists in pandas 2.x and copies all rows on every call)
frames = [review_scraper(page_url(1))]
for x in range(2,maxPage):
    #pause between requests
    time.sleep(3)
    try:
        frames.append(review_scraper(page_url(x)))
    except URLError as err:
        #keep what has been scraped so far instead of losing the whole run
        print("stopped at page", x, "-", err)
        break
output = pd.concat(frames, ignore_index=True)

#display the output
display(output)


In [None]:
#write the scraped reviews to final.csv (relative path, so it lands in the current working directory)
output.to_csv('final.csv')