# Glassdoor web scraper for job reviews
This code was developed by group 01 as a part of the NLP course at IE University 2023. It was inspired by the reference code that can be found in: https://bulletbyte.weebly.com/tech/how-to-scrape-a-companys-glassdoor-reviews-using-python

## 1. Import libraries and functions
First, the libraries will be imported and configured if needed.

In [7]:
## Import libraries

# Import os and time
import os
import time

# Data science related
import numpy as np
import pandas as pd
import math

# String related
import re

# Web scraping related
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

from http.client import IncompleteRead

Now, the functions which will be used throughout the code will be defined. They are:

* review_scrap: it takes a url and it outputs a pandas dataframe with the reviews scraped
* get_reviews: it takes in the main url of the company, and outputs a pandas dataframe with the different things extracted
* get_xxxx: different functions that will be used inside list comprehensions





In [8]:
def review_scrap(url):

    """
    This function will take in an url, download that page, and output a pandas dataframe where each line is a review,
    and each column is a feature of the review. Missing values will be filled in with -

    Arguments:
     - url: the full url of the review page to scrape

    Output columns: review_id, summary, date, job_title, overall_rating, pros, cons

    Any error raised by urlopen while downloading the page is not caught here and propagates to the caller.
    """

    ## Scrape the web page url content

    # State a browser-like user agent and build the request
    hdr = {'User-Agent': 'Mozilla/5.0'}
    req = Request(url, headers=hdr)

    # Get page; the with-block closes the HTTP response even if reading fails
    with urlopen(req) as response:
        page = response.read()

    # Produce soup from BeautifulSoup
    soup = BeautifulSoup(page, "html.parser")

    # Get reviews inside soup: the <li> elements with this class attribute
    reviews_in_page = soup.find_all('li', {'class': 'noBorder empReview cf pb-0 mb-0'})

    ## Get information

    # Output column -> helper that extracts it from one review tag.
    # The author location (get_location) is not part of the output dataframe,
    # so it is not extracted here.
    extractors = {
        'review_id': get_review_id,
        'summary': get_summary,
        'date': get_date,
        'job_title': get_job_title,
        'overall_rating': get_rating,
        'pros': get_pros,
        'cons': get_cons,
    }

    # Dict order is preserved, so the dataframe columns follow the order above
    return pd.DataFrame({column: [extract(tag) for tag in reviews_in_page]
                         for column, extract in extractors.items()})

In [9]:
def get_reviews(url_main, max_pag=30):

    """
    This function takes in two arguments:
     - url_main: it's the url of the first page of reviews, without the .htm
     - max_pag: the maximum number of pages to get reviews from (default 30)

    The pages requested are url_main + "_P2.htm" up to url_main + "_P<max_pag + 1>.htm".
    NOTE(review): the un-suffixed first page is never requested - confirm this is intended.

    It outputs a pandas dataframe with the reviews of all the pages, re-indexed from 0.
    If no page is requested (max_pag <= 0), an empty dataframe is returned.
    """

    # Page numbering starts at 2, so max_pag pages end at page max_pag + 1
    last_page = max_pag + 1

    # Collect one dataframe per page and concatenate once at the end,
    # instead of re-copying the accumulated dataframe on every iteration
    page_frames = []
    for page_num in range(2, last_page + 1):
        page_frames.append(review_scrap(url_main + "_P" + str(page_num) + ".htm"))
        time.sleep(0.5)  # wait half a second between requests

    # pd.concat raises on an empty list, so handle the no-pages case explicitly
    if not page_frames:
        return pd.DataFrame()

    return pd.concat(page_frames, ignore_index=True)

In [10]:
"""
These functions are coded to be used in list comprehensions. Each one extracts a single field from a review tag and,
if the lookup raises an error (for example because .find did not find the element), returns a - instead
"""

def get_review_id(current_tag):
    """
    Return the 'id' attribute of a review tag, or '-' if it cannot be read.
    """
    try:
        return current_tag['id']
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_summary(current_tag):
    """
    Return the text of the <a class="reviewLink"> element inside a review tag,
    or '-' if it cannot be read.
    """
    try:
        return current_tag.find('a', {'class':'reviewLink'}).text
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_date(current_tag):
    """
    Return the review date, or '-' if it cannot be read.

    The date is taken as the part before the first ' - ' in the text of the
    first child of the <span class="authorInfo"> element.
    """
    try:
        author_info = current_tag.find('span', {'class': 'authorInfo'})
        return author_info.contents[0].text.split(' - ')[0]
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_job_title(current_tag):
    """
    Return the reviewer's job title, or '-' if it cannot be read.

    The job title is taken as the second ' - '-separated part of the text of
    the first child of the <span class="authorInfo"> element.
    """
    try:
        author_info = current_tag.find('span', {'class': 'authorInfo'})
        return author_info.contents[0].text.split(' - ')[1]
    except Exception:
        # Best-effort lookup: any ordinary error (including a text without
        # ' - ') means the field is missing. Unlike a bare except,
        # KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_location(current_tag):
    """
    Return the text of the <span class="authorLocation"> element inside a
    review tag, or '-' if it cannot be read.
    """
    try:
        return current_tag.find('span', {'class':'authorLocation'}).text
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_rating(current_tag):
    """
    Return the text of the <span class="ratingNumber mr-xsm"> element inside a
    review tag, or '-' if it cannot be read. The rating is returned as text,
    not converted to a number.
    """
    try:
        return current_tag.find('span', {'class':'ratingNumber mr-xsm'}).text
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_pros(current_tag):
    """
    Return the text of the <span data-test="pros"> element inside a review
    tag, or '-' if it cannot be read.
    """
    try:
        return current_tag.find('span', {'data-test':'pros'}).text
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

def get_cons(current_tag):
    """
    Return the text of the <span data-test="cons"> element inside a review
    tag, or '-' if it cannot be read.
    """
    try:
        return current_tag.find('span', {'data-test':'cons'}).text
    except Exception:
        # Best-effort lookup: any ordinary error means the field is missing.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return '-'

## 2. Get company reviews
Now, this script will be used to get the reviews for 5 companies from each industry.

The companies chosen are:
   - Consulting: Bain, McKinsey, BCG, Oliver Wyman, Deloitte
   - Tech: Google, Microsoft, Meta, Amazon, Netflix
   - Investment banking: Goldman Sachs, JP Morgan, Morgan Stanley, Credit Suisse, Citi
   - Unicorns: Revolut, Canva, Instacart, Stripe, ByteDance.
    
These companies were chosen because they are generally regarded as having very good working conditions (unicorns and tech) or bad working conditions (consulting and investment banking).

In [11]:
# Table of the companies to scrape: one (Industry, Company, Glassdoor URL) record per company.
# The url is the first review page without the ".htm" suffix.
company_records = [
    ('Consulting', 'Bain & Company', 'https://www.glassdoor.com/Reviews/Bain-and-Company-Reviews-E3752'),
    ('Consulting', 'McKinsey & Company', 'https://www.glassdoor.com/Reviews/McKinsey-and-Company-Reviews-E2893'),
    ('Consulting', 'Boston Consulting Group', 'https://www.glassdoor.com/Reviews/Boston-Consulting-Group-Reviews-E3879'),
    ('Consulting', 'Oliver Wyman', 'https://www.glassdoor.com/Reviews/Oliver-Wyman-Reviews-E40206'),
    ('Consulting', 'Deloitte', 'https://www.glassdoor.com/Reviews/Deloitte-Reviews-E2763'),
    ('Tech', 'Google', 'https://www.glassdoor.com/Reviews/Google-Reviews-E9079'),
    ('Tech', 'Microsoft', 'https://www.glassdoor.com/Reviews/Microsoft-Reviews-E1651'),
    ('Tech', 'META', 'https://www.glassdoor.com/Reviews/Meta-Reviews-E40772'),
    ('Tech', 'Amazon', 'https://www.glassdoor.com/Reviews/Amazon-Reviews-E6036'),
    ('Tech', 'Netflix', 'https://www.glassdoor.com/Reviews/Netflix-Reviews-E11891'),
    ('Investment Banking', 'Goldman Sachs', 'https://www.glassdoor.com/Reviews/Goldman-Sachs-Reviews-E2800'),
    ('Investment Banking', 'JP Morgan', 'https://www.glassdoor.com/Reviews/J-P-Morgan-Reviews-E145'),
    ('Investment Banking', 'Morgan Stanley', 'https://www.glassdoor.com/Reviews/Morgan-Stanley-Reviews-E2282'),
    ('Investment Banking', 'Credit Suisse', 'https://www.glassdoor.com/Reviews/Credit-Suisse-Reviews-E3141'),
    ('Investment Banking', 'Citi', 'https://www.glassdoor.com/Reviews/Citi-Reviews-E8843'),
    ('Unicorns', 'Revolut', 'https://www.glassdoor.com/Reviews/Revolut-Reviews-E1176471'),
    ('Unicorns', 'Canva', 'https://www.glassdoor.com/Reviews/Canva-Reviews-E1013251'),
    ('Unicorns', 'Instacart', 'https://www.glassdoor.com/Reviews/Instacart-Reviews-E714486'),
    ('Unicorns', 'Stripe', 'https://www.glassdoor.com/Reviews/Stripe-Reviews-E671932'),
    ('Unicorns', 'ByteDance', 'https://www.glassdoor.com/Reviews/ByteDance-Reviews-E1624196'),
]

comp_dict = pd.DataFrame(company_records, columns=['Industry', 'Company', 'Glassdoor URL'])

In [12]:
# One dataframe of reviews per company, in the order of comp_dict
reviews_list = []

# Scrape every company and tag its reviews with the industry and company name
company_columns = zip(comp_dict["Industry"], comp_dict["Company"], comp_dict["Glassdoor URL"])
for industry, company, glassdoor_url in company_columns:
    company_reviews = get_reviews(glassdoor_url)
    reviews_list.append(company_reviews.assign(Industry=industry, Company=company))

# Stack the per-company dataframes into a single one with a fresh 0..n-1 index
final_df = pd.concat(reviews_list, ignore_index=True)

Now, having the reviews, the head of the dataset will be displayed, and the number of reviews will also be shown.

In [13]:
# Show dataset head: preview the first rows of the combined reviews dataframe
final_df.head()

Unnamed: 0,review_id,summary,date,job_title,overall_rating,pros,cons,Industry,Company
0,empReview_72841993,Good place to start your career,-,-,5.0,Fun culture for the ACs,"Long hours, don’t see your work through due to...",Consulting,Bain & Company
1,empReview_72790395,Good support. Fast growth opportunities,-,-,5.0,Great learning and teaming. Experience also hi...,"Bc experience is so dependent on case/client, ...",Consulting,Bain & Company
2,empReview_72724930,Love it!,-,-,5.0,Great place! I feel supported and have opportu...,Not many. WLB can be tricky,Consulting,Bain & Company
3,empReview_72671934,Great firm!,-,-,5.0,Great all around experience at the firm. Stron...,"As in all professional services jobs, work lif...",Consulting,Bain & Company
4,empReview_72465515,Great,-,-,5.0,The Best place to work,Longer hours at times on some cases,Consulting,Bain & Company


In [17]:
# Number of reviews collected per company.
# Every scraped row has an Industry value, so counting that column counts the reviews.
company_grouped = final_df.groupby("Company")
company_counts = company_grouped["Industry"].count()
company_counts

Company
Amazon                     300
Bain & Company             300
Boston Consulting Group    300
ByteDance                  300
Canva                      300
Citi                       300
Credit Suisse              300
Deloitte                   300
Goldman Sachs              300
Google                     300
Instacart                  300
JP Morgan                  300
META                       300
McKinsey & Company         300
Microsoft                  300
Morgan Stanley             300
Netflix                    300
Oliver Wyman               300
Revolut                    300
Stripe                     300
Name: Industry, dtype: int64

In [18]:
# Add the character length of each text field to final_df as <field>_length
for text_field in ('pros', 'cons', 'summary'):
    final_df[text_field + '_length'] = final_df[text_field].str.len()

# Average text length per company, one column per text field
length_columns = ['pros_length', 'cons_length', 'summary_length']
df_grouped = final_df.groupby('Company')[length_columns].mean()

df_grouped

Unnamed: 0_level_0,pros_length,cons_length,summary_length
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amazon,163.536667,223.656667,24.66
Bain & Company,101.323333,72.726667,22.733333
Boston Consulting Group,77.85,92.826667,21.536667
ByteDance,70.7,92.903333,21.63
Canva,169.016667,143.073333,24.686667
Citi,59.91,108.873333,18.91
Credit Suisse,61.36,93.463333,19.983333
Deloitte,83.526667,93.93,20.183333
Goldman Sachs,63.613333,74.77,18.26
Google,71.353333,93.553333,19.946667


Finally, the results will be saved to a csv file.

In [16]:
# Save to csv: write the combined reviews to final_reviews.csv in the working directory,
# without the dataframe index as a column.
# NOTE(review): if the cells are run top to bottom, final_df also contains the
# pros_length / cons_length / summary_length columns added earlier - confirm that is wanted.
final_df.to_csv("final_reviews.csv", index=False)