## Scrape reviews from Indeed

In [1]:
import requests
import numpy as np
from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# Start from an empty frame with one column per scraped field
column_names = ['job', 'info', 'rating', 'review_title', 'review_content']
df = pd.DataFrame({name: [] for name in column_names})

In [3]:
# Scrape every review page and collect one record per review.
#
# Records are gathered in a plain list and converted to a DataFrame once at
# the end: DataFrame.append copied the whole frame on every call (quadratic in
# the number of reviews) and was removed in pandas 2.0. Rebuilding `df` from
# the list also makes this cell safe to re-run without duplicating rows.
#
# NOTE(review): the describe() output recorded in this notebook shows one
# review occurring 130 times - presumably a featured review repeated on every
# page. Consider de-duplicating before analysis - TODO confirm.
REVIEW_COLUMNS = ['job', 'info', 'rating', 'review_title', 'review_content']
records = []
for i in range(0, 2600, 20):  # value of the `start` query parameter, in steps of 20
    url = 'https://ca.indeed.com/cmp/Scotiabank/reviews?start={}&lang=en'.format(i)
    # Without a timeout a stalled connection would hang the cell indefinitely.
    r = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = bs(r.text, 'html.parser')
    result = soup.find('div', class_="cmp-ReviewsList")
    elements = result.find_all('div', class_='css-lqffld-Box eu4oa1w0')
    for element in elements:
        job = element.find('a', class_='css-91nj6e-Link emf9s7v0')
        # Reviews without a job-title link are recorded as "Anonymous".
        job = "Anonymous" if job is None else job.text
        info = element.find('div', class_='css-1ikgorc-Text e1wnkr790').text
        rating = element.find('button', class_='css-1hmmasr-Text e1wnkr790').text
        review_title = element.find('h2', class_='css-1i3kt4y-Heading e1tiznh50').text
        review_content = element.find('div', class_='css-ebcgx4-Box eu4oa1w0').text
        records.append({
            'job': job,
            'info': info,
            'rating': rating,
            'review_title': review_title,
            'review_content': review_content,
        })

# Build the frame in one step; `columns` fixes the column order and keeps the
# five columns present even if no review was collected.
df = pd.DataFrame(records, columns=REVIEW_COLUMNS)

In [4]:
# Show the rows whose job is recorded as "Anonymous"
is_anonymous = df['job'].eq("Anonymous")
df.loc[is_anonymous]

Unnamed: 0,job,info,rating,review_title,review_content
1883,Anonymous,Anonymous (Former Employee) - Anonymous - 10 J...,2.0,Cheap parking,SDA is an ok place to work but their values ar...


In [5]:
# Dimensions of the scraped data as (rows, columns)
df.shape

(2730, 5)

In [6]:
# Summary statistics (count, unique, top, freq) for the scraped columns
df.describe()

Unnamed: 0,job,info,rating,review_title,review_content
count,2730,2730,2730.0,2730,2730
unique,1338,2598,5.0,2192,2597
top,Financial Advisor,"Financial Advisor (Former Employee) - Toronto,...",4.0,A Great Place to Start Your Career in Financia...,If you are just out of school or finishing a d...
freq,299,130,1006.0,130,130


In [7]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2730 entries, 0 to 2729
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   job             2730 non-null   object
 1   info            2730 non-null   object
 2   rating          2730 non-null   object
 3   review_title    2730 non-null   object
 4   review_content  2730 non-null   object
dtypes: object(5)
memory usage: 106.8+ KB


## Reorganize columns

In [18]:
# Split the info column into its " - "-separated fields.
#
# In the outputs recorded in this notebook the text reads
# "<job> (<status>) - <location> - <date>". Splitting on a bare '-' also cut
# through hyphens inside job titles and place names, which pushed the location
# and date of those rows into the wrong columns and left stray spaces around
# every field. Splitting from the right on ' - ' at most twice keeps the last
# two fields (location, date) aligned for every row.
temp_df = df['info'].str.rsplit(' - ', n=2, expand=True)
# Pad to the positional columns 0-4 that the original split produced, so code
# written against that layout keeps working.
temp_df = temp_df.reindex(columns=range(5))
temp_df.head()

Unnamed: 0,0,1,2,3,4
0,Financial Advisor (Former Employee),"Toronto, ON",2 November 2020,,
1,Agent de perception (Former Employee),"Montréal, QC",23 June 2021,,
2,Customer Service Representative (Former Employ...,"Calgary, AB",12 June 2021,,
3,Scotiabank Senior Financial Advisor (Current E...,"Sackville, NB",9 June 2021,,
4,Financial Advisor (Former Employee),"Trenton, ON",7 June 2021,,


In [19]:
# Keep only the location (column 1) and date (column 2) fields.
# Selecting the wanted columns, instead of dropping 0, 3 and 4 by label, does
# not depend on how many columns the split happened to produce for this data.
temp_df = temp_df[[1, 2]]
temp_df.head()

Unnamed: 0,1,2
0,"Toronto, ON",2 November 2020
1,"Montréal, QC",23 June 2021
2,"Calgary, AB",12 June 2021
3,"Sackville, NB",9 June 2021
4,"Trenton, ON",7 June 2021


In [20]:
# Give the positional columns descriptive names
column_labels = {1: 'location', 2: 'date'}
temp_df = temp_df.rename(columns=column_labels)
temp_df.head()

Unnamed: 0,location,date
0,"Toronto, ON",2 November 2020
1,"Montréal, QC",23 June 2021
2,"Calgary, AB",12 June 2021
3,"Sackville, NB",9 June 2021
4,"Trenton, ON",7 June 2021


In [21]:
# Place the location/date columns side by side with the scraped data
frames = [temp_df, df]
final_df = pd.concat(frames, axis='columns')

In [22]:
# Preview the first rows of the combined frame
final_df.head()

Unnamed: 0,location,date,job,info,rating,review_title,review_content
0,"Toronto, ON",2 November 2020,Financial Advisor,"Financial Advisor (Former Employee) - Toronto,...",4.0,A Great Place to Start Your Career in Financia...,If you are just out of school or finishing a d...
1,"Montréal, QC",23 June 2021,Agent de perception,Agent de perception (Former Employee) - Montré...,1.0,Poor management,"Poor management and leadership, one team leade..."
2,"Calgary, AB",12 June 2021,Customer Service Representative,Customer Service Representative (Former Employ...,1.0,This corporation is the devil - do not work here,I worked as CSR and it was the worst job I've ...
3,"Sackville, NB",9 June 2021,Scotiabank Senior Financial Advisor,Scotiabank Senior Financial Advisor (Current E...,1.0,"Poor management, bad support teams no work lif...",Thave been working for 15 years including 4 co...
4,"Trenton, ON",7 June 2021,Financial Advisor,"Financial Advisor (Former Employee) - Trenton,...",1.0,Toxic envirnoment,The culture in this bank was toxic for myself ...


In [23]:
# Remove the 'info' column from the combined frame
final_df = final_df.drop('info', axis='columns')

In [25]:
# Convert the rating column to a numeric dtype
final_df = final_df.assign(rating=pd.to_numeric(final_df['rating']))

In [26]:
# Preview the first rows of final_df
final_df.head()

Unnamed: 0,location,date,job,rating,review_title,review_content
0,"Toronto, ON",2 November 2020,Financial Advisor,4.0,A Great Place to Start Your Career in Financia...,If you are just out of school or finishing a d...
1,"Montréal, QC",23 June 2021,Agent de perception,1.0,Poor management,"Poor management and leadership, one team leade..."
2,"Calgary, AB",12 June 2021,Customer Service Representative,1.0,This corporation is the devil - do not work here,I worked as CSR and it was the worst job I've ...
3,"Sackville, NB",9 June 2021,Scotiabank Senior Financial Advisor,1.0,"Poor management, bad support teams no work lif...",Thave been working for 15 years including 4 co...
4,"Trenton, ON",7 June 2021,Financial Advisor,1.0,Toxic envirnoment,The culture in this bank was toxic for myself ...


In [27]:
# Dimensions of final_df as (rows, columns)
final_df.shape

(2730, 6)

In [29]:
# Save final_df as a CSV file; the path is relative to the working directory
output_path = 'Scotiabank-Indeed-Reviews.csv'
final_df.to_csv(output_path)