In this notebook we'll fetch the URLs of the companies that are referenced on https://fr.trustpilot.com/

We'll use selenium because the content is dynamically rendered

We'll then scrape the reviews using Scrapy, feeding it the scraped URLs

In [1]:
# Set IPython's Completer.use_jedi option to False (turns off Jedi-backed tab completion).
%config Completer.use_jedi=False

In [2]:
import json
import os
import time

from bs4 import BeautifulSoup
#import bs4 as BeautifulSoup

import requests
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from tqdm import tqdm_notebook

In [3]:
# Site root; the relative category URIs scraped later in the notebook are appended to it.
base_url = "https://trustpilot.com"

In [4]:
def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        For connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error
    # page, which would simply yield an empty category mapping downstream.
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')
# Start from an empty category mapping.
data = dict()

# Download and parse the categories index page of the site.
soup = get_soup(url=base_url + '/categories')

We start by fetching the sub-category URLs:

In [5]:
# Build a two-level mapping:
#   {category name: {sub-category name: relative sub-category URI}}
data = {}

soup = get_soup(base_url + '/categories')
for category in soup.find_all('div', {'class': 'category-object'}):
    header = category.find('h3', {'class': 'sub-category__header'})
    sub_categories = category.find('div', {'class': 'sub-category-list'})
    if header is None or sub_categories is None:
        # Skip blocks that do not have the expected markup instead of
        # failing with an opaque AttributeError on None.
        continue

    name = header.text.strip()
    data[name] = {}
    for sub_category in sub_categories.find_all('div', {'class': 'child-category'}):
        # Look the anchor up once; it carries both the label and the link.
        link = sub_category.find('a', {'class': 'sub-category-item'})
        if link is None or not link.get('href'):
            continue
        # Strip the label the same way the category name is stripped, so the
        # keys do not carry stray whitespace into the exports.
        sub_category_name = link.text.strip()
        sub_category_uri = link['href']
        data[name][sub_category_name] = sub_category_uri

In [6]:
# Number of top-level categories collected in `data`.
len(data)

22

This function fetches the company URLs listed on the sub-category page currently loaded in the browser:

In [7]:
def extract_company_urls_form_page():
    """Collect the company URLs shown on the page currently loaded in ``driver``.

    Relies on the module-level ``driver`` (the Selenium WebDriver created
    elsewhere in the notebook).

    Returns
    -------
    list of str
        The ``href`` of every company card on the page, without duplicates,
        in order of first appearance.
    """
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which was
    # removed in Selenium 4; this form works on both major versions.
    cards = driver.find_elements(By.XPATH, '//a[@class="category-business-card card"]')
    hrefs = [card.get_attribute('href') for card in cards]
    # dict.fromkeys de-duplicates while keeping the page order (a set would
    # return the URLs in arbitrary order); anchors without an href are dropped.
    return list(dict.fromkeys(href for href in hrefs if href))

This function indicates whether the current page has a next-page button (i.e. whether pagination continues):

In [8]:
def go_next_page():
    """Report whether the page currently loaded in ``driver`` has a next-page button.

    Despite its name, this function does not navigate: it only looks the
    button up on the module-level ``driver``.

    Returns
    -------
    tuple
        ``(True, button)`` when the button is found, where ``button`` is the
        matching element; ``(False, None)`` otherwise.
    """
    try:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4; this form works on both major versions.
        button = driver.find_element(By.XPATH, '//a[@class="button button--primary next-page"]')
    except NoSuchElementException:
        return False, None
    return True, button

We start by initializing Selenium with a headless Chromedriver:

In [15]:
# Command-line switches handed to Chrome when it starts.
options = Options()
for chrome_flag in (
    '--headless',
    '--no-sandbox',
    'start-maximized',
    'disable-infobars',
    "--disable-extensions",
):
    options.add_argument(chrome_flag)

# Chrome profile preference for image content (managed setting set to 2).
prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", prefs)


In [25]:

# Raw string: the path contains backslashes, and '\d' / '\c' are invalid
# escape sequences in a normal string literal (they only worked by accident
# and trigger a warning on recent Python versions). The runtime value is
# unchanged.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path
# as a positional argument — confirm the installed Selenium version.
driver = webdriver.Chrome(r'.\driver\chromedriver', options=options)

# Maximum number of seconds to wait for the company cards after a page load.
timeout = 3

We launch the scraping (~50 minutes):

In [None]:
# Query string appended to every sub-category listing URL.
LISTING_QUERY = "?numberofreviews=0&timeperiod=0&status=all"
# Same locator the extraction step uses for company cards.
COMPANY_CARD_XPATH = '//a[@class="category-business-card card"]'


def _load_listing_page(page_url):
    """Open ``page_url`` in ``driver`` and wait up to ``timeout`` seconds for a company card."""
    driver.get(page_url)
    try:
        # By.CLASS_NAME cannot take a compound value such as
        # 'category-business-card card', so that locator could never match
        # and the wait always failed. Waiting on the card XPath instead lets
        # the wait return as soon as the first card is present.
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, COMPANY_CARD_XPATH)))
    except TimeoutException:
        # Best effort: a page without cards simply yields no URLs. Only the
        # timeout is ignored, so Ctrl-C and real errors still propagate.
        pass


# {sub-category name: [company URL, ...]}
# NOTE(review): keyed by sub-category name only, so two categories sharing a
# sub-category name would overwrite each other — confirm names are unique.
company_urls = {}
for category in tqdm_notebook(data):
    for sub_category in tqdm_notebook(data[category], leave=False):
        company_urls[sub_category] = []
        listing_url = base_url + data[category][sub_category] + LISTING_QUERY

        _load_listing_page(listing_url)
        page_number = 1
        while True:
            company_urls[sub_category] += extract_company_urls_form_page()

            # Only the flag is needed: pages are reached through the `page`
            # query parameter rather than by clicking the button.
            has_next_page, _next_button = go_next_page()
            if not has_next_page:
                break

            page_number += 1
            _load_listing_page(f'{listing_url}&page={page_number}')
                    

HBox(children=(IntProgress(value=0, max=22), HTML(value='')))

HBox(children=(IntProgress(value=0, max=28), HTML(value='')))

HBox(children=(IntProgress(value=0, max=67), HTML(value='')))

HBox(children=(IntProgress(value=0, max=161), HTML(value='')))

HBox(children=(IntProgress(value=0, max=209), HTML(value='')))

HBox(children=(IntProgress(value=0, max=95), HTML(value='')))

HBox(children=(IntProgress(value=0, max=71), HTML(value='')))

HBox(children=(IntProgress(value=0, max=75), HTML(value='')))

HBox(children=(IntProgress(value=0, max=104), HTML(value='')))

HBox(children=(IntProgress(value=0, max=167), HTML(value='')))

HBox(children=(IntProgress(value=0, max=68), HTML(value='')))

HBox(children=(IntProgress(value=0, max=135), HTML(value='')))

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))

And finally we export everything:

In [None]:
# The scrape takes a long time; make sure the target directory exists so the
# export cannot fail at the very end with FileNotFoundError.
os.makedirs('./exports', exist_ok=True)

# Dump the {sub-category: [company URLs]} mapping as JSON.
with open('./exports/company_urls_en', 'w') as f:
    json.dump(company_urls, f)

In [None]:
# Flatten the nested mappings into (category, sub_category, company_url) rows.
consolidated_data = [
    (category, sub_category, url)
    for category in data
    for sub_category in data[category]
    for url in company_urls[sub_category]
]

df_consolidated_data = pd.DataFrame(consolidated_data, columns=['category', 'sub_category', 'company_url'])

# Make sure the target directory exists before writing the CSV.
os.makedirs('./exports', exist_ok=True)
df_consolidated_data.to_csv('./exports/consolidate_company_urls.csv', index=False)

In [None]:
# Preview the first rows of the consolidated table.
df_consolidated_data.head()