# Trust Pilot Scraper 
*Not for commercial use, per Trust Pilot's Terms of Use.*

In [16]:
import json
import math
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Configurations

In [32]:
# Review listing page that the scraper walks through
url = 'https://uk.trustpilot.com/review/www.johnlewis.com'

# Rows collected by the scraper; turned into the dataframe afterwards
final_list = list()

# CSV file the reviews are written to
save_datafile = 'trust_reviews.csv'

# Pagination: reviews shown per page, and the page count used when the
# pagination finder is switched off
results_per_page, total_pages = 20, 1
run_pagination_finder = True

# Throttling: when enabled, pause sleep_time seconds between page requests
# so the site is not flooded with requests
throttle, sleep_time = False, 2

# Echo the active configuration
config_summary = (
    f'Scraper set for {url} \nSaving results to {save_datafile}'
    f'\nRun Pagination Finder: {run_pagination_finder} \nThrottling On: {throttle}'
)
print(config_summary)

Scraper set for https://uk.trustpilot.com/review/www.johnlewis.com 
Saving results to trust_reviews.csv
Run Pagination Finder: True 
Throttling On: False


### Pagination Finder
Get Total Number of Pages of Reviews

In [23]:
## Count the number of pages to scrape
if run_pagination_finder:
    # Fetch the first review page. The timeout stops a stalled connection from
    # hanging the notebook, and raise_for_status() turns HTTP error responses
    # into an exception instead of letting an error page be parsed.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    # Element holding the total number of reviews
    rating_count_tag = soup.find('span', class_='headline__review-count')
    if rating_count_tag is None:
        # find() returns None when nothing matches; fail with a clear message
        # rather than an AttributeError on None.
        raise RuntimeError(
            "Could not find the review count ('span.headline__review-count') "
            f"on {url}; the page markup may have changed."
        )

    # The count is rendered with thousands separators, e.g. '9,711'
    rating_count = int(rating_count_tag.text.replace(',', ''))

    # Total pages to scrape (the last page may be partially filled)
    total_pages = math.ceil(rating_count / results_per_page)

    print('Found total of ' + str(total_pages) + ' pages to scrape')

Found total of 486 pages to scrape


### Run Scraper

In [10]:
def _parse_review(section):
    """Extract one review from a ``section.review__content`` element.

    Returns ``[title, content, date, rating]``, or ``None`` when the section
    has no review text (such sections are skipped).

    * title   -- text of the title heading (its link text when a link is
                 present), or '' when there is no title heading.
    * content -- stripped review text.
    * date    -- the part of ``publishedDate`` before the 'T', read from the
                 JSON in the dates element; ``None`` if that element is missing.
    * rating  -- first character of the star image's alt text (presumably the
                 star count -- confirm against the page markup); ``None`` if
                 the image or its alt text is missing.
    """
    content_tag = section.find('p', class_='review-content__text')
    if content_tag is None:
        return None
    content = content_tag.text.strip()

    title_tag = section.find('h2', class_='review-content__title')
    if title_tag is None:
        title = ''
    else:
        # Prefer the link text, but fall back to the heading itself so a
        # title without a link does not abort the whole scrape.
        title_link = title_tag.find('a')
        title = (title_link if title_link is not None else title_tag).text.strip()

    dates_tag = section.find('div', class_='review-content-header__dates')
    if dates_tag is None:
        date = None
    else:
        date = json.loads(dates_tag.text)['publishedDate'].split('T')[0]

    rating_tag = section.find('div', class_='star-rating')
    rating_img = rating_tag.find('img') if rating_tag is not None else None
    alt_text = rating_img.get('alt', '') if rating_img is not None else ''
    rating = alt_text[:1] or None

    return [title, content, date, rating]


# Start from an empty list so that re-running this cell does not append a
# second copy of every review.
final_list.clear()

for page_number in range(1, total_pages + 1):
    page_url = f'{url}?page={page_number}'

    # Timeout guards against a hung connection; raise_for_status() stops the
    # run on an HTTP error instead of silently collecting zero reviews.
    r = requests.get(page_url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'lxml')

    for section in soup.find_all('section', class_='review__content'):
        row = _parse_review(section)
        if row is not None:
            final_list.append(row)

    # Pause between page requests (no need to wait after the last page)
    if throttle and page_number < total_pages:
        time.sleep(sleep_time)

# Save to pandas dataframe
df = pd.DataFrame(final_list, columns=['Title', 'Content', 'Date', 'Rating'])

### Pretty Print Results

In [12]:
# Preview only the first rows; the full frame can hold thousands of reviews
df.head(10)

Unnamed: 0,Title,Content,Date,Rating
0,Update on Apple watch guarantee from John Lewis,Following my last negative review about John L...,2020-03-20,1
1,Shocking experience,"Shocking, lack of customer service and staff i...",2020-03-19,1
2,Complaint number:14071749 - dreadful and misle...,Complaint number:14071749I had enough of your ...,2020-03-19,1
3,I recently purchased a top from John…,I recently purchased a top from John Lewis onl...,2020-03-19,5
4,John Lewis used to be a great place to…,John Lewis used to be a great place to shop bu...,2020-03-19,3
5,My item was lost,No apology over phone when I cancelled order d...,2020-03-19,1
6,Awful experience,Awful experience. Do not order a hotpoint oven...,2020-03-18,1
7,Last Saturday afternoon I had my make…,Last Saturday afternoon I had my make up booke...,2020-03-18,5
8,We we’re extremely disappointed with “John Lew...,We we’re extremely disappointed with “John Lew...,2020-03-18,1
9,being a stay at home mum,"being a stay at home mum, i rarely get the cha...",2020-03-18,5


### Save dataframe to csv

In [24]:
# Write the reviews to the configured CSV file as UTF-8, without the index column
df.to_csv(
    save_datafile,
    index=False,
    encoding='utf-8',
)