In this notebook we'll fetch the urls of the companies that are referenced on https://trustpilot.com/

We'll use selenium because the content is dynamically rendered

We'll then scrape the reviews using scrapy by feeding it the scraped urls

In [1]:
%config Completer.use_jedi=False

In [2]:
import json
import os
import pprint
import threading
import time

from bs4 import BeautifulSoup
import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from tqdm.notebook import trange, tqdm

pp = pprint.PrettyPrinter()

TY - If you don't see an XML builder in the list below, you will run into issues. You have to `conda install lxml`. If that still fails, uninstall lxml and BeautifulSoup (bs4) and re-install them in this order: lxml first, then bs4.

In [3]:
import bs4
bs4.builder.builder_registry.builders

[bs4.builder._lxml.LXMLTreeBuilder,
 bs4.builder._lxml.LXMLTreeBuilderForXML,
 bs4.builder._htmlparser.HTMLParserTreeBuilder]

We first start by fetching sub-categories urls:

In [33]:
base_url = "https://trustpilot.com"
# data maps each category name to {sub-category name: relative sub-category uri}
data = {}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
response = requests.get(base_url + '/categories', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'lxml')

for category in soup.find_all('div', {'class': 'subCategory___BRUDy'}):
    header = category.find('h3', {'class': 'subCategoryHeader___36ykD'})
    sub_categories = category.find('div', {'class': 'subCategoryList___r67Qj'})
    if header is None or sub_categories is None:
        # Incomplete category block: report it instead of failing with an AttributeError.
        print("Skipping a category block without header or sub-category list.")
        continue

    name = header.text.strip()
    data[name] = {}
    for sub_category in sub_categories.find_all('div', {'class': 'subCategoryItem___3ksKz'}):
        link = sub_category.find('a')
        if link is None or not link.get('href'):
            # No usable link for this item, so there is nothing to scrape later.
            continue
        sub_category_name = sub_category.text.strip()
        sub_category_uri = link['href']
        data[name][sub_category_name] = sub_category_uri

if not data:
    print("No category found: the class names used above probably no longer match the page.")

In [34]:
pp.pprint(data)

{'Animals & Pets': {'Animal Health': '/categories/animal_health',
                    'Animal Parks & Zoo': '/categories/animal_parks_zoo',
                    'Cats & Dogs': '/categories/cats_dogs',
                    'Horses & Riding': '/categories/horses_riding',
                    'Pet Services': '/categories/pet_services',
                    'Pet Stores': '/categories/pet_stores'},
 'Beauty & Well-being': {'Cosmetics & Makeup': '/categories/cosmetics_makeup',
                         'Hair Care & Styling': '/categories/hair_care_styling',
                         'Personal Care': '/categories/personal_care',
                         'Salons & Clinics': '/categories/salons_clinics',
                         'Tattoos & Piercings': '/categories/tattoos_piercings',
                         'Wellness & Spa': '/categories/wellness_spa',
                         'Yoga & Meditation': '/categories/yoga_meditation'},
 'Business Services': {'Administration & Services': '/categories/admini

In [35]:
# Number of categories
len(data)

22

We start by initializing Selenium with a headless Chromedriver:

In [36]:
# Chrome options for the scraping session (headless, no sandbox, no extensions).
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")

# NOTE(review): value 2 presumably blocks image loading to speed up page loads - confirm.
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)

# The chromedriver location is machine-specific: it can be overridden with the
# CHROMEDRIVER_PATH environment variable; the original path is kept as the fallback.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/kting/Documents/GitHub/post-tuto-deployment/src/scraping/selenium/driver/chromedriver.exe',
)

driver = webdriver.Chrome(chromedriver_path, options=options)

In [38]:
def cat_companies_finder(cat, ls):
    """Append the company urls of every sub-category of ``cat`` to ``ls``.

    For each ``(name, uri)`` pair in the notebook-level ``data[cat]`` mapping, the
    listing pages are opened one after another with the notebook-level ``driver``
    until no "Next page" link is found. Every company link found on a page adds
    one ``[cat, subcat_name, href]`` row to ``ls``.

    Parameters
    ----------
    cat : key of ``data``
        The category whose sub-categories are scraped.
    ls : list
        Output list; it is mutated in place.

    Returns
    -------
    None

    Notes
    -----
    A sub-category is abandoned (and the next one started) when a page does not
    show the expected wrapper element within the timeout, or when the next-page
    lookup fails unexpectedly.
    """
    base_url = "https://trustpilot.com"
    timeout = 5  # No. of seconds to wait for the page wrapper before giving up

    for subcat_name, subcat_uri in tqdm(data[cat].items(), leave=False):
        page_num = 1  # every sub-category starts again at page 1

        while True:
            # Navigate to page
            url = base_url + subcat_uri + "?numberofreviews=0" + "&page=" + str(page_num) + "&status=all&timeperiod=0"
            driver.get(url)

            # Pause code execution until the wrapper element is present
            try:
                element_present = EC.presence_of_element_located((By.CLASS_NAME, 'wrapper___28fVm'))
                WebDriverWait(driver, timeout).until(element_present)
            except TimeoutException:
                print(f"{url} takes too long to access the link. Breaking.")
                break

            # Extract all company urls in that page and add them to the list.
            # The loop variable is not called `url` so the page url stays intact.
            companies = driver.find_elements(
                By.XPATH, '//div[@class="businessUnitCardsContainer___Qhix1"][1]/a')
            for company in companies:
                ls.append([cat, subcat_name, company.get_attribute('href')])

            # Check if there's a next page button
            try:
                driver.find_element(
                    By.XPATH, '//nav[@role="navigation"]/a[@aria-label="Next page"]')
            except NoSuchElementException:
                break  # last page of this sub-category
            except Exception as exc:
                # Stop instead of retrying: looping on the same page number would
                # re-scrape it forever and append duplicate rows.
                print(f"Hmm, unexpected error while looking for the next page of {url}: {exc!r}")
                break

            page_num += 1

    print(f"Category: {cat} is completed.")
    

## The following cell takes too long. There's too many things to scrape lmao.

In [None]:
# Rows of [category, sub-category name, company url], filled by cat_companies_finder.
ls = []

# Selenium WebDriver instances are not thread-safe, and cat_companies_finder drives
# the single shared `driver`. With one thread per category the threads navigate the
# same browser concurrently, so company links are read from whichever page happens
# to be loaded and end up attributed to the wrong category. The categories are
# therefore processed one at a time.
for category in tqdm(data):
    cat_companies_finder(category, ls)

print("All categories completed.")

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

In [None]:
ls;

In [None]:
# Each row of `ls` is [category, sub-category name, company url]; name the columns
# the same way as the consolidated export further down.
df = pd.DataFrame(ls, columns=['category', 'sub_category', 'company_url'])

In [None]:
df

In [None]:
base_url = "https://trustpilot.com"
timeout = 3 # No. of seconds to wait for the page wrapper before giving up
company_urls = {}  # {sub-category name: [company urls]}

for cat in tqdm(data):

    for subcat in tqdm(data[cat], leave=False):
        page_num, is_NextPage = 1, True # Reset back to page 1
        # Key the list by the loop variable `subcat` (not by a name left over from
        # another cell), otherwise every url ends up under a single key.
        company_urls[subcat] = []

        while is_NextPage:
            url = base_url + data[cat][subcat] + "?numberofreviews=0&timeperiod=0&status=all" + "&page=" + str(page_num)
            driver.get(url)

            # Pause code execution until the wrapper element is present
            try:
                element_present = EC.presence_of_element_located((By.CLASS_NAME, 'wrapper___28fVm'))
                WebDriverWait(driver, timeout).until(element_present)
            except TimeoutException:
                print("Takes too long to access the link. Breaking.")
                break

            # Extract all company urls in that page and add to subcategory list
            companies = driver.find_elements(
                By.XPATH, '//div[@class="businessUnitCardsContainer___Qhix1"][1]/a')
            urls = [company.get_attribute('href') for company in companies]
            company_urls[subcat].extend(urls)

            # Check if there's a next page button
            try:
                driver.find_element(
                    By.XPATH, '//nav[@role="navigation"]/a[@aria-label="Next page"]')
                # If no exception, then there exists a next page
                page_num += 1
            except NoSuchElementException:
                is_NextPage = False
            except Exception as exc:
                # Stop this sub-category: retrying the same page number would loop
                # forever and store duplicate urls.
                print(f"Hmm, unexpected error while looking for the next page of {url}: {exc!r}")
                is_NextPage = False

In [None]:
# NOTE(review): in the visible notebook `subcat_coylist` is only assigned in the
# "Testing 2" section near the end, so on a fresh kernel this cell raises NameError
# unless that later cell has been run first.
len(subcat_coylist)

We launch scraping: (~ 50 minutes)

In [None]:
# NOTE(review): `extract_company_urls_form_page` and `go_next_page` are not defined in
# the visible part of this notebook, and `base_url` / `timeout` come from earlier
# cells - confirm they exist before running this cell.
company_urls = {}  # {sub-category name: [company urls]}
listing_query = "?numberofreviews=0&timeperiod=0&status=all"
# By.CLASS_NAME takes a single class name; with two space-separated classes the wait
# could never succeed. A CSS selector matches elements carrying both classes.
card_locator = (By.CSS_SELECTOR, '.category-business-card.card')

# `tqdm` is the progress bar imported from tqdm.notebook at the top of the notebook.
for category in tqdm(data):
    for sub_category in tqdm(data[category], leave=False):
        company_urls[sub_category] = []

        listing_url = base_url + data[category][sub_category] + listing_query
        c = 1  # current page number
        driver.get(listing_url)

        while True:
            # Best-effort wait: on timeout, still try to extract whatever is loaded.
            try:
                WebDriverWait(driver, timeout).until(
                    EC.presence_of_element_located(card_locator))
            except TimeoutException:
                pass

            extracted_company_urls = extract_company_urls_form_page()
            company_urls[sub_category] += extracted_company_urls
            next_page, button = go_next_page()

            if not next_page:
                break

            c += 1
            driver.get(listing_url + f'&page={c}')
                    

And finally we export everything:

In [None]:
# Create the export folder first: open() does not create missing directories.
os.makedirs('./exports', exist_ok=True)

# Dump the {sub-category: [company urls]} mapping as JSON.
with open('./exports/company_urls_en', 'w') as f:
    json.dump(company_urls, f)

In [None]:
# Flatten data / company_urls into one (category, sub_category, company_url) row
# per company url.
consolidated_data = []

for category, sub_category_uris in data.items():
    for sub_category in sub_category_uris:
        consolidated_data.extend(
            (category, sub_category, url) for url in company_urls[sub_category]
        )

df_consolidated_data = pd.DataFrame(
    consolidated_data,
    columns=['category', 'sub_category', 'company_url'],
)

# Write the table without the index column.
df_consolidated_data.to_csv('./exports/consolidate_company_urls.csv', index=False)

In [None]:
df_consolidated_data.head()

# Test
Breakdown of the components

The `lxml` parser is important. If you run into errors, you have to `conda install lxml`. If that still fails, uninstall lxml and BeautifulSoup (bs4) and re-install them in this order: lxml first, then bs4.

In [None]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with the ``lxml`` parser.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so that a stalled
        connection cannot block the notebook forever.

    Returns
    -------
    BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; parsing an error page would
        silently produce empty results further down.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')

In [None]:
# First get the beautiful soup object
soup = get_soup("https://trustpilot.com/categories")
print(type(soup))

Obtain the list of category tags. As in the HTML tags; these tags are full of weird strings that make up the web page elements.

In [None]:
catTAG_ls = soup.findAll('div', {'class': 'subCategory___BRUDy'})

In [None]:
print(f'cat_ls is a list containing {len(catTAG_ls)} {type(catTAG_ls[0])} objects.')

We take the first element of the list `catTAG_ls`, which is a category tag; in this case it is the whole HTML element that makes up "Animals & Pets".

In [None]:
catTAG = catTAG_ls[0]

From this category tag HTML-element, we extract the name "Animals & Pets" string from it.

In [None]:
catNAME = catTAG.find('h3', {'class': 'subCategoryHeader___36ykD'}).text
catNAME

In [None]:
# .strip() only removes leading and trailing whitespace, so this should not change
# anything unless the extracted text is padded.
catNAME = catNAME.strip()
catNAME

Under the category "Animals & Pets", there're many subcategories under it. Such as "Animal Health".

In [None]:
subcatTAG_ls = catTAG.findAll('div', {'class': 'subCategoryItem___3ksKz'})

In [None]:
print(f"There are {len(subcatTAG_ls)} subcategories under the category 'Animals & Pets'.")

We extract the first element from it, to get the subcategory tag HTML element object thingy.

In [None]:
subcatTAG = subcatTAG_ls[0]

Because `subcatTAG` contains only one text element, there is no need to filter it further. We can extract the text "Animal Health" straight away.

In [None]:
subcatNAME = subcatTAG.text
subcatNAME

We can pretty print this element tag with `prettify()`. Without pretty print, it will look terrible.

In [None]:
print(subcatTAG.prettify())

We can extract the href (hypertext reference) stored in the `href` attribute of the `<a>` element nested inside the Tag. Note that the value extracted won't be the full url. 

In [None]:
subcatTAG.find('a')['href']

# Testing 2

We start off with a sub-category "Best in IT & Communication". We can find its url by tweaking the earlier test. We choose it because it has numerous companies under it, and thus has many pages.

In [None]:
base_url = "https://trustpilot.com"
url = base_url + "/categories/it_communication" + "?numberofreviews=0&timeperiod=0&status=all&page=1"
print(url)

See here for [Explicit Waits](https://selenium-python.readthedocs.io/waits.html). The point of the following cell block is to ensure that Selenium is done loading the resources of the webpage before the rest of the code can proceed.

In [None]:
# Go and instantiate the chromedriver somewhere up above
driver.get(url)

timeout = 3
try: 
    # EC is Expected Conditions
    element_present = EC.presence_of_element_located(
        (By.CLASS_NAME, 'wrapper___28fVm'))

    WebDriverWait(driver, timeout).until(element_present)
except:
    print("If you see this, means you got to find a new element to wait on. This trustpilot website updates itself frequently.")
    pass

This `find_elements_by_xpath` call can be rather tough to understand. Also, by the time you read this the layout of the webpage may have changed, in which case the argument passed to it will no longer work.

But here are some pointers to understand the syntax:
- XPath is a W3C query language for XML/HTML documents (it is not specific to Selenium). The argument it takes is like a path, similar to directories in a file system.
- As such, there are slashes (`/`) that denote "sub-directories".
- `//` selects matching nodes anywhere in the document, which means you don't have to spell out the path from the root "folder".
- A html _node_ is analogous to the folder in a file system. For example, if you see: `<div class='roflmao'></div>`, then `<div>` is the _node_. `class` is not the node in this case.
- From the previous example: `class` is an example of a attribute. Attributes always have a name and a value. In this case, `class` is the name, and `roflmao` is the value associated to it.
- Since nodes are like folders, their relationship with the slashes makes intuitive hierarchical sense.
- Attributes are used for filtering stuff inside a node/folder. Attributes always reside in a square bracket appended to a node. 
- Aside from attributes, you can use numbers to filter too, in the form of indexing in an array. Note that the syntax adopts one-indexing, so it starts from `1` and not `0`.

Click [here](https://selenium-python.readthedocs.io/locating-elements.html#locating-by-xpath) for more info.

In [None]:
# find_elements(By.XPATH, ...) replaces the deprecated find_elements_by_xpath helper.
# It returns every <a> directly under the first matching cards container.
companies = driver.find_elements(By.XPATH, '//div[@class="businessUnitCardsContainer___Qhix1"][1]/a')

In [None]:
len(companies)

In [None]:
urls = [a.get_attribute('href') for a in companies]
urls

In [None]:
# Look for the "Next page" link; its absence means the current page is the last one.
try:
    button = driver.find_element(By.XPATH, '//nav[@role="navigation"]/a[@aria-label="Next page"]')
    print(f"Button found, and (for sanity check) it says: {button.text}")
except NoSuchElementException:
    print("NoSuchElementException")
except Exception as exc:
    # Show what actually went wrong instead of hiding it behind a bare except.
    print(f"Hmm: {exc!r}")

In [None]:
page_num = 1
is_NextPage = True
base_url = "https://trustpilot.com"
sub_cat_url = "/categories/it_communication"
timeout = 3 # No. of seconds to wait for the page wrapper before giving up
subcat_coylist = []  # company urls collected over all pages

while is_NextPage:
    url = base_url + sub_cat_url + "?numberofreviews=0&timeperiod=0&status=all" + "&page=" + str(page_num)
    print(url)
    driver.get(url)

    # Pause code execution until the wrapper element is present
    try:
        # EC is Expected Conditions
        element_present = EC.presence_of_element_located((By.CLASS_NAME, 'wrapper___28fVm'))
        WebDriverWait(driver, timeout).until(element_present)
    except TimeoutException:
        print("Takes too long to access the link. Breaking.")
        break

    # Extract all company urls in that page and add to subcategory list
    companies = driver.find_elements(
        By.XPATH, '//div[@class="businessUnitCardsContainer___Qhix1"][1]/a')
    urls = [company.get_attribute('href') for company in companies]
    subcat_coylist.extend(urls)
    pp.pprint(urls)

    # Check if there's a next page button
    try:
        driver.find_element(
            By.XPATH, '//nav[@role="navigation"]/a[@aria-label="Next page"]')
        # If no exception, then there exists a next page
        page_num += 1
        print("Next page found: ", end="")
    except NoSuchElementException:
        is_NextPage = False
        print("No next page. End.")
    except Exception as exc:
        # Stop here: retrying the same page number would loop forever and keep
        # adding the same urls to subcat_coylist.
        is_NextPage = False
        print(f"Hmm, unexpected error while looking for the next page: {exc!r}")
    