In [2]:
### Trustpilot data scraping module 
### Author: Hakim Khan

## Imports

import lxml.html as html
import math
import csv
import time
import requests

In [3]:
## Scraper settings

# Output file that the scraped reviews are written to
datafile = 'dataSkype.csv'

# Page size used by Trustpilot by default
resultsPerPage = 20

# Address of the review listing: the Trustpilot review prefix followed by
# the domain of the site whose reviews are scraped
basePage = 'http://www.trustpilot.com/review/'
reviewSite = 'www.skype.com'
reviewPage = ''.join((basePage, reviewSite))

# Echo the effective settings so the run is self-describing
print('Scraper set for ', reviewPage, ' - saving result to ', datafile, sep='')

Scraper set for http://www.trustpilot.com/review/www.skype.com - saving result to dataSkype.csv


In [4]:
## Count the number of pages to scrape

# Fetch the first review page. TLS verification is left at the requests
# default (enabled) so this request behaves like the per-page requests in the
# scraping loop. The timeout keeps the cell from hanging on a stalled
# connection, and a failed request stops here instead of being parsed.
page = requests.get(reviewPage, timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# Total number of ratings, read from the headline counter on the page
ratingCountNodes = tree.xpath('//span[@class="headline__review-count"]')
if not ratingCountNodes or not ratingCountNodes[0].text:
    # Fail with a readable message rather than a bare IndexError/AttributeError
    raise RuntimeError('Could not find the review count on ' + reviewPage +
                       ' - the page layout may have changed')
# The counter uses thousands separators (e.g. "1,234"), which int() rejects
ratingCount = int(ratingCountNodes[0].text.replace(',', ''))

# Number of chunks to consider for displaying processing output
# For ex. 10 means output progress for every 10th of the data
tot_chunks = 20

# Throttling to avoid spamming the site with requests:
# when enabled, wait sleepTime seconds before every page request
throttle = False
sleepTime = 2

# Total pages to scrape (the last page may be only partially filled)
pages = math.ceil(ratingCount / resultsPerPage)
print('Found total of ' + str(pages) + ' pages to scrape')



Found total of 23 pages to scrape


In [5]:
## Main scraping section

# Number of reviews between two progress messages. Computed once up front and
# clamped to at least 1: with fewer than tot_chunks ratings the integer
# division yields 0, which would raise ZeroDivisionError in the modulo below.
chunk = max(1, ratingCount // tot_chunks)

with open(datafile, 'w', newline='', encoding='utf8') as csvfile:

    # Tab delimited to allow for special characters
    datawriter = csv.writer(csvfile, delimiter='\t')
    print('Processing..')
    for i in range(1, pages + 1):

        # Sleep if throttle enabled
        if throttle:
            time.sleep(sleepTime)

        # Timeout so a stalled connection cannot hang the whole scrape
        page = requests.get(reviewPage + '?page=' + str(i), timeout=30)
        if not page.ok:
            # Make a failed page visible instead of silently writing no rows
            print('Skipping page ' + str(i) + ' - HTTP status ' + str(page.status_code))
            continue
        tree = html.fromstring(page.content)

        # Each query below collects one field for every review on the page:
        # titles, bodies, ratings and languages. The lists are matched up by
        # position, so they are assumed to be in the same review order.
        titles = tree.xpath('//a[@class="review-title-link"]')
        bodies = tree.xpath('//div[@class="review-body"]')
        ratings = tree.xpath('//div[@data-status]')
        langs = tree.xpath("//h3[starts-with(@class, 'review-title')]")

        for idx, e in enumerate(bodies):

            # Progress counting, outputs for every processed chunk.
            # Uses the configured page size rather than a hard-coded 20 so the
            # running count stays correct if resultsPerPage is changed.
            reviewNumber = idx + resultsPerPage * (i - 1) + 1
            if reviewNumber % chunk == 0:
                print('Processed ' + str(reviewNumber) + '/' + str(ratingCount) + ' ratings')

            # Title of comment
            title = titles[idx].text_content()

            # Body of comment
            body = e.text_content().strip()

            # The rating is the 5th from last element
            rating = ratings[idx].get('data-status').split(' ')[-5]

            # Language is 2nd element of h3 tag
            lang = langs[idx].get('class').split(' ')[1]

            datawriter.writerow([title, body, rating, lang])
    print('Processed ' + str(ratingCount) + '/' + str(ratingCount) + ' ratings.. Finished!')

Processing..
Processed 22/452 ratings
Processed 44/452 ratings
Processed 66/452 ratings
Processed 88/452 ratings
Processed 110/452 ratings
Processed 132/452 ratings
Processed 154/452 ratings
Processed 176/452 ratings
Processed 198/452 ratings
Processed 220/452 ratings
Processed 242/452 ratings
Processed 264/452 ratings
Processed 286/452 ratings
Processed 308/452 ratings
Processed 330/452 ratings
Processed 352/452 ratings
Processed 374/452 ratings
Processed 396/452 ratings
Processed 418/452 ratings
Processed 440/452 ratings
Processed 452/452 ratings.. Finished!
