# **Voice of Employees**

We are scraping **Glassdoor Reviews** for both the **L'Occitane and Erborian** brands using `urllib.request` and the Beautiful Soup library:

Link: https://bulletbyte.weebly.com/tech/how-to-scrape-a-companys-glassdoor-reviews-using-python


## **Introduction: Libraries and Functions**

In [None]:
#import the libraries
import os
import time
import re
import string

import numpy as np
import pandas as pd
import math

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [None]:
# Mount Google Drive at /content/drive so later cells can write files under it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#create a function to scrape any Glassdoor company review page
#the code still worked when I ran it on 7 Sep 2021, but the html content of Glassdoor webpages changes all the time
#please inspect the webpage and update the html tags/classes below if any of the lists comes back empty

def review_scraper(url):
  """Scrape one Glassdoor review page into a DataFrame.

  Args:
    url: full URL of a single Glassdoor review page.

  Returns:
    A DataFrame with the columns Summary, Date, JobTitle, AuthorLocation,
    OverallRating, Pros and Cons, one row per review found on the page.

  Notes:
    Every field is collected page-wide by tag/class and the lists are then
    zipped together, so rows are truncated to the shortest list. When a
    review lacks one of the fields the remaining rows can be misaligned;
    a warning is emitted in that case.
  """
  #scraping the web page content
  hdr = {"User-Agent":"Mozilla/5.0 Gecko/20100101 Firefox/33.0 GoogleChrome/10.0"}
  req = Request(url,headers=hdr)
  #the response is fully read while parsing, so it can be closed right after
  with urlopen(req) as page:
    soup = BeautifulSoup(page, "html.parser")

  def texts(tag, attrs):
    #text of every element on the page matching the tag/attribute filter
    return [node.text for node in soup.find_all(tag, attrs)]

  #get the Summary (Hugo: Corrected)
  Summary = texts('h2', {'class':'mb-xxsm mt-0 css-93svrw el6ke055'})

  #get the Posted Date and Job Title (a single "<date> -<job title>" string)
  Date_n_JobTitle = texts('span', {'class':'authorJobTitle middle common__EiReviewDetailsStyle__newGrey'})

  #split it into the Posted Date and the Job Title
  Date = []
  JobTitle = []
  for entry in Date_n_JobTitle:
    parts = entry.split(' -')
    Date.append(parts[0])
    #an entry without the ' -' separator has no job title; keep the row instead of crashing
    JobTitle.append(parts[1] if len(parts) > 1 else '')

  #get Author Location
  AuthorLocation = texts('span', {'class':'authorLocation'})

  #get Overall Rating
  OverallRating = [float(value) for value in texts('span', {'class':'ratingNumber mr-xsm'})]

  #get Pros
  Pros = texts('span', {'data-test':'pros'})

  #get Cons
  Cons = texts('span', {'data-test':'cons'})

  #zip() stops at the shortest list: make a length mismatch visible instead of silent
  counts = {'Summary': len(Summary), 'Date': len(Date), 'JobTitle': len(JobTitle),
            'AuthorLocation': len(AuthorLocation), 'OverallRating': len(OverallRating),
            'Pros': len(Pros), 'Cons': len(Cons)}
  if len(set(counts.values())) > 1:
    import warnings
    warnings.warn("review_scraper: field counts differ for %s (%s); rows are truncated "
                  "to the shortest list and may be misaligned" % (url, counts))

  #putting everything together
  Reviews = pd.DataFrame(list(zip(Summary, Date, JobTitle, AuthorLocation, OverallRating, Pros, Cons)),
                    columns = ['Summary', 'Date', 'JobTitle', 'AuthorLocation', 'OverallRating', 'Pros', 'Cons'])

  return Reviews

In [None]:
# Define Microsoft Translation function 

# Microsoft Translation
import requests, uuid, json

def microsoft_translate(text):
  """Translate `text` to English with the Microsoft Translator REST API (v3.0).

  Args:
    text: value to translate; it is converted with str() before sending.

  Returns:
    The text of the first translation in the service response.

  Raises:
    requests.HTTPError: if the service answers with an error status
      (for example an invalid key or an exhausted quota).
    requests.Timeout: if the service does not answer within the timeout.
  """
  # Add your subscription key and endpoint
  # NOTE: keep the real key out of shared notebooks / version control
  subscription_key = "XXXXXXXXXXXXXXXXX"
  endpoint = "https://api.cognitive.microsofttranslator.com"

  # Add your location, also known as region. The default is global.
  # This is required if using a Cognitive Services resource.
  location = "westeurope"

  path = '/translate'
  constructed_url = endpoint + path

  params = {
      'api-version': '3.0',
      'to': 'en'
  }

  headers = {
      'Ocp-Apim-Subscription-Key': subscription_key,
      'Ocp-Apim-Subscription-Region': location,
      'Content-type': 'application/json',
      'X-ClientTraceId': str(uuid.uuid4())
  }

  # You can pass more than one object in body.
  body = [{
      'text': str(text)
  }]

  # a timeout keeps a stalled connection from blocking a long translation loop forever
  http_response = requests.post(constructed_url, params=params, headers=headers, json=body, timeout=30)
  # fail with the HTTP error itself; an error payload is a dict, so indexing it
  # with [0] below would otherwise raise an unrelated KeyError
  http_response.raise_for_status()
  payload = http_response.json()

  return payload[0]['translations'][0]['text']

## **Scraping L'Occitane Reviews**

In [None]:
#paste/replace the url to the first page of the company's Glassdoor review in between the ""
input_url="https://www.glassdoor.sg/Reviews/L-Occitane-Reviews-E33235"

#suffix shared by every page url (sort and language-filter parameters)
query = ".htm?sort.sortType=RD&sort.ascending=false&filter.iso3Language=eng"

#scraping the first page content
hdr = {"User-Agent":"Mozilla/5.0 Gecko/20100101 Firefox/33.0 GoogleChrome/10.0"}
req = Request(input_url+query,headers=hdr)
#the response is fully read while parsing, so it can be closed right after
with urlopen(req) as page:
  soup = BeautifulSoup(page, "html.parser")

#check the total number of reviews (parsed from the "... of N Reviews" footer text)
footer = soup.find('div', {'data-test':'pagination-footer-text'})
if footer is None:
  raise ValueError("pagination footer not found: the Glassdoor page layout may have changed")
countReviews = footer.text
countReviews = float(countReviews.split(' Reviews')[0].split('of ')[1].replace(',',''))

#calculate the max number of pages (assuming 10 reviews a page)
countPages = math.ceil(countReviews/10)

#range(2, maxPage) below excludes maxPage itself, so countPages + 1 scrapes every page
#set maxPage to a smaller number (e.g. 4 for the first 3 pages) to save time
maxPage = countPages + 1

#scraping multiple pages of company glassdoor review
#the frames are collected first and concatenated once: DataFrame.append no longer
#exists in pandas 2.x and re-copies the whole frame on every call
frames = [review_scraper(input_url+query)]
for x in range(2,maxPage):
  url = input_url+"_P"+str(x)+query
  frames.append(review_scraper(url))
output = pd.concat(frames, ignore_index=True)

#display the output
display(output)

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons
0,Great place to learn!,"May 19, 2022",Skincare Consultant,"Pittsburgh, PA",5.0,Spent a lot of time training and preparing me ...,"Management can be very sales motivated, can be..."
1,Loved it here,"May 11, 2022",Cashier/Sales Associate,"Boston, MA",5.0,My manager was amazing. She worked hard and ca...,The pay was just minimum wage
2,Good company to stay,"May 6, 2022",Beauty Advisor,Melbourne,4.0,"Flexible shift, good for part-time. good discount","less hour, not good for promotion"
3,Please do not apply for a job here,"May 4, 2022",Boutique Manager,Parramatta,1.0,Previously worked with an amazing supportive t...,"- Upper management do not care about you, your..."
4,Don't Work Here,"May 4, 2022",Manager,"Sheffield, England",1.0,They make good hand cream....that's it.,Management is horrific. They care only for pr...
...,...,...,...,...,...,...,...
435,Good company to work for,"Jul 25, 2010",Assistant Store Manager,"Pleasanton, CA",4.0,- Generous discount and quarterly free product...,"- No commission, but you still have sales goal..."
436,sales associate,"Jul 22, 2010",Sales Associate,"London, England",5.0,"beautiful product, great training, wonderful p...","low payroll, have to work hard to keep store l..."
437,Completely unsatisfactory,"Oct 27, 2009",Part Time Sales Associate,"Calgary, AB",1.0,L'Occitane is fun for those that like competit...,L'Occitane does not praise their best employee...
438,"If you have no soul, drop off your resume at t...","Apr 17, 2009",Sales Associate,"New York, NY",1.0,"We got great employee discounts, and the produ...",Where to begin!! Head Office held no regard fo...


In [None]:
# Tag every scraped row with the brand it belongs to, then preview the first rows
output['Company'] = "L'Occitane"
output.head()

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons,Company
0,Great place to learn!,"May 19, 2022",Skincare Consultant,"Pittsburgh, PA",5.0,Spent a lot of time training and preparing me ...,"Management can be very sales motivated, can be...",L'Occitane
1,Loved it here,"May 11, 2022",Cashier/Sales Associate,"Boston, MA",5.0,My manager was amazing. She worked hard and ca...,The pay was just minimum wage,L'Occitane
2,Good company to stay,"May 6, 2022",Beauty Advisor,Melbourne,4.0,"Flexible shift, good for part-time. good discount","less hour, not good for promotion",L'Occitane
3,Please do not apply for a job here,"May 4, 2022",Boutique Manager,Parramatta,1.0,Previously worked with an amazing supportive t...,"- Upper management do not care about you, your...",L'Occitane
4,Don't Work Here,"May 4, 2022",Manager,"Sheffield, England",1.0,They make good hand cream....that's it.,Management is horrific. They care only for pr...,L'Occitane


In [None]:
# Print the column names, non-null counts and dtypes of the scraped reviews
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Summary         440 non-null    object 
 1   Date            440 non-null    object 
 2   JobTitle        440 non-null    object 
 3   AuthorLocation  440 non-null    object 
 4   OverallRating   440 non-null    float64
 5   Pros            440 non-null    object 
 6   Cons            440 non-null    object 
 7   Company         440 non-null    object 
dtypes: float64(1), object(7)
memory usage: 27.6+ KB


In [None]:
# Clean the text

# Create a function to clean the review text (it also strips tweet artifacts such as @mentions, '#' and 'RT')

def cleanTxt(text):
  """Normalize a piece of text for sentiment analysis.

  The steps run in a fixed order: strip @mentions, '#', 'RT' markers and
  http(s) links, lowercase, drop bracketed/parenthesized spans and any word
  containing a digit, collapse whitespace runs (including newlines) to one
  space, then remove quotes, '&amp;' and all remaining punctuation.

  Args:
    text: the string to clean.

  Returns:
    The cleaned, lowercased string. Leading/trailing spaces are not
    stripped, and removals done after the whitespace collapse can leave
    double spaces behind.
  """
  # Every pattern is a raw string so regex escapes such as \[ or \w are not
  # treated as (invalid) Python string escapes.
  text = re.sub(r'@[A-Za-z0-9_]+', '', text) # removing @mentions
  text = re.sub(r'#', '', text) # removing the '#' symbol
  text = re.sub(r'RT[\s]+', '', text) # removing RT (must run before lowercasing)
  text = re.sub(r'https?:\/\/\S+', '', text) # removing hyperlinks
  text = text.lower() # make text lowercase
  text = re.sub(r'\[.*?\]', '', text) # removing text within brackets
  text = re.sub(r'\(.*?\)', '', text) # removing text within parentheses
  text = re.sub(r'\w*\d\w*', '', text) # removing any word that contains a digit
  text = re.sub(r'\s+', ' ', text) # collapse whitespace runs, newlines included, to one space
  text = re.sub(r'"+', '', text) # removing any quotes
  text = re.sub(r'&amp;', '', text) # removing &amp;
  text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # get rid of all ASCII punctuation
  text = re.sub(r'httptco', '', text) # getting rid of `httptco`
  text = re.sub(r'[^\w\s]', '', text) # remove any other (non-ASCII) punctuation

  return text

# Run cleanTxt over each free-text review column
for column in ('Summary', 'Pros', 'Cons'):
  output[column] = output[column].apply(cleanTxt)

# Preview the first rows after cleaning
output.head()

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons,Company
0,great place to learn,"May 19, 2022",Skincare Consultant,"Pittsburgh, PA",5.0,spent a lot of time training and preparing me ...,management can be very sales motivated can be ...,L'Occitane
1,loved it here,"May 11, 2022",Cashier/Sales Associate,"Boston, MA",5.0,my manager was amazing she worked hard and car...,the pay was just minimum wage,L'Occitane
2,good company to stay,"May 6, 2022",Beauty Advisor,Melbourne,4.0,flexible shift good for parttime good discount,less hour not good for promotion,L'Occitane
3,please do not apply for a job here,"May 4, 2022",Boutique Manager,Parramatta,1.0,previously worked with an amazing supportive t...,upper management do not care about you your m...,L'Occitane
4,dont work here,"May 4, 2022",Manager,"Sheffield, England",1.0,they make good hand creamthats it,management is horrific they care only for prof...,L'Occitane


In [None]:
# Loop for Translation
# Each cell is read and written with .at on the frame itself: the chained form
# output[col][i] = ... assigns through an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to update `output`.
for i in output.index:
  print(i)  # progress indicator, one line per translated review
  for col in ('Summary', 'Pros', 'Cons'):
    output.at[i, col] = microsoft_translate(output.at[i, col])

0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [None]:
# Second cleaning pass over the same three text columns
for text_col in ['Summary', 'Pros', 'Cons']:
  cleaned = output[text_col].apply(cleanTxt)
  output[text_col] = cleaned

In [None]:
# Fetch the VADER lexicon, then build the sentiment analyzer used by the scoring cell below
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...




In [None]:
# Score every review summary; each entry of `sentiment` is the dict returned by polarity_scores
sentiment = output['Summary'].apply(lambda x: analyzer.polarity_scores(str(x)))
# Expand the score dicts into one column per key and attach them to the reviews.
# The axis is passed by keyword: giving it positionally to pd.concat is deprecated.
scores = pd.DataFrame(sentiment.tolist(), index=sentiment.index)
output = pd.concat([output, scores], axis=1)

  


In [None]:
# Map the short score names to descriptive column names; keys that match no
# existing column (e.g. 'Subjectivity', 'Polarity') are simply ignored by rename
column_names = {
    'Subjectivity': 'subjectivity',
    'Polarity': 'polarity',
    'neg': 'negative',
    'neu': 'neutral',
    'pos': 'positive',
}
output.rename(columns=column_names, inplace=True)
output.head()

Unnamed: 0,Summary,Date,JobTitle,AuthorLocation,OverallRating,Pros,Cons,Company,negative,neutral,positive,compound
0,great place to learn,"May 19, 2022",Skincare Consultant,"Pittsburgh, PA",5.0,spent a lot of time training and preparing me ...,management can be very sales motivated can be ...,L'Occitane,0.0,0.423,0.577,0.6249
1,loved it here,"May 11, 2022",Cashier/Sales Associate,"Boston, MA",5.0,my manager was amazing she worked hard and car...,the pay was just minimum wage,L'Occitane,0.0,0.339,0.661,0.5994
2,good company to stay,"May 6, 2022",Beauty Advisor,Melbourne,4.0,flexible shift good for parttime good discount,less hour not good for promotion,L'Occitane,0.0,0.508,0.492,0.4404
3,please do not apply for a job here,"May 4, 2022",Boutique Manager,Parramatta,1.0,previously worked with an amazing supportive t...,upper management do not care about you your m...,L'Occitane,0.0,0.723,0.277,0.3182
4,dont work here,"May 4, 2022",Manager,"Sheffield, England",1.0,they make good hand creamthats it,management is horrific they care only for prof...,L'Occitane,0.0,1.0,0.0,0.0


In [None]:
# Save the scored reviews to Google Drive; no index argument is passed, so pandas' default index handling applies
output.to_csv('/content/drive/MyDrive/Kedge Thesis: Voice of Stakeholders/3. Voice of Employees/loccitane_reviews.csv')

In [None]:
# Preview the cleaned Summary column
output['Summary']

0                                   great place to learn
1                                          loved it here
2                                   good company to stay
3                     please do not apply for a job here
4                                         dont work here
                             ...                        
435                             good company to work for
436                                      sales associate
437                            completely unsatisfactory
438    if you have no soul drop off your resume at th...
439    it exactly what you think it is a job nothing ...
Name: Summary, Length: 440, dtype: object