In [1]:
from PIL import Image
import requests
import pandas as pd
import random
from bs4 import BeautifulSoup

In [None]:
import spacy
# Load the spaCy pipeline registered under the name "en_core_web_sm".
# NOTE(review): `model` is not referenced by any cell visible in this section and this
# cell has no execution count — confirm it is still needed before keeping it.
model = spacy.load("en_core_web_sm")


# Trustpilot

In [2]:
import requests
from bs4 import BeautifulSoup
import json
import concurrent.futures
import pandas as pd

class TrustPilotScraper:
    """
    Scrape reviews from a Trustpilot business page.

    Each results page is downloaded, the JSON-LD block embedded in the page is parsed,
    and every ``@graph`` entry whose ``@type`` is ``Review`` contributes its review
    body, headline, rating and author to flat, index-aligned lists.
    """

    # Seconds to wait for a response; without a timeout a stalled connection would
    # block its worker thread (and therefore scrape_reviews) indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, num_pages):
        """
        Initialize the TrustPilotScraper object with the Trustpilot URL and number of pages to scrape.

        Args:
            url (str): The URL of the Trustpilot page to scrape (without the ``?page=`` suffix).
            num_pages (int): The number of pages to scrape, starting from page 1.
        """
        self.url = url
        self.num_pages = num_pages
        # Per-page (nested) lists from the most recent extraction, in the order
        # [reviews, headlines, ratings, authors]; empty until a scrape has run.
        self.nested_content = []

    def _scrape_trustpilot_data(self, url):
        """
        Private method to scrape data from a given Trustpilot URL.

        Args:
            url (str): The URL to scrape.

        Returns:
            dict: The parsed JSON-LD data embedded in the page.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the page has no business-unit JSON-LD script tag.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        # Fail on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        script_tag = soup.find('script', {'data-business-unit-json-ld': 'true'})
        if script_tag is None:
            raise ValueError(f'No business-unit JSON-LD script tag found at {url}')
        return json.loads(script_tag.text.strip())

    def scrape_reviews(self):
        """
        Scrape reviews from the Trustpilot URL for the specified number of pages.

        Returns:
            tuple: A tuple containing lists of reviews, headlines, ratings, and authors.
        """
        base_url = self.url + '?page='
        urls = [base_url + str(i) for i in range(1, self.num_pages + 1)]

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            # map() fetches the pages in parallel and yields results in URL order;
            # list() drains the iterator so any fetch error is raised here.
            page_results = list(executor.map(self._scrape_trustpilot_data, urls))

        nested = self._extract_nested_content(page_results)
        return self._flatten_nested_content(*nested)

    def _get_content_from_page_results(self, page_result, key):
        """
        Private method to extract one field from every review on a single page.

        Args:
            page_result (dict): The scraped JSON data for a single page.
            key (str): The key to extract from each review entry.

        Returns:
            list: The value of ``key`` for each ``@graph`` entry of type ``Review``.
        """
        return [item[key] for item in page_result['@graph'] if item['@type'] == 'Review']

    def _extract_nested_content(self, page_results):
        """
        Private method to extract nested content from a list of scraped JSON data.

        Args:
            page_results (list): A list of scraped JSON data for multiple pages.

        Returns:
            tuple: A tuple containing lists of nested reviews, headlines, ratings, and authors
            (one inner list per page).
        """
        reviews_nested = [self._get_content_from_page_results(page_result, 'reviewBody')
                          for page_result in page_results]
        headlines_nested = [self._get_content_from_page_results(page_result, 'headline')
                            for page_result in page_results]
        ratings_nested = [self._get_content_from_page_results(page_result, 'reviewRating')
                          for page_result in page_results]
        # The review entries store the reviewer under 'author' (singular).
        authors_nested = [self._get_content_from_page_results(page_result, 'author')
                          for page_result in page_results]

        # Keep the per-page structure available on the instance; replaced (not appended)
        # so that re-running a scrape does not accumulate stale data.
        self.nested_content = [reviews_nested, headlines_nested, ratings_nested, authors_nested]
        return reviews_nested, headlines_nested, ratings_nested, authors_nested

    def _flatten_nested_content(self, reviews_nested, headlines_nested, ratings_nested, authors_nested):
        """
        Private method to collapse the per-page lists into one flat list per field.

        Args:
            reviews_nested (list): Review bodies, one inner list per page.
            headlines_nested (list): Headlines, one inner list per page.
            ratings_nested (list): Ratings, one inner list per page.
            authors_nested (list): Authors, one inner list per page.

        Returns:
            tuple: Flat lists of reviews, headlines, ratings, and authors, in page order.
        """
        reviews_flattened = [review for reviews in reviews_nested for review in reviews]
        headlines_flattened = [headline for headlines in headlines_nested for headline in headlines]
        ratings_flattened = [rating for ratings in ratings_nested for rating in ratings]
        authors_flattened = [author for authors in authors_nested for author in authors]
        return reviews_flattened, headlines_flattened, ratings_flattened, authors_flattened



In [3]:
# Paginated review listing for the business; the page number is appended to this prefix.
base_url = 'https://uk.trustpilot.com/review/lionsprep.co.uk?page='
# Pages 1 through 9 (the upper bound of range() is exclusive).
urls = [f'{base_url}{page_number}' for page_number in range(1, 10)]

In [4]:
import concurrent.futures

def scrape_trustpilot_data(url, timeout=30):
    """
    Download one Trustpilot page and return the JSON-LD data embedded in it.

    Args:
        url (str): The page URL to fetch.
        timeout (float): Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread indefinitely.

    Returns:
        dict: The parsed content of the page's business-unit JSON-LD script tag.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no business-unit JSON-LD script tag.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    script_tag = soup.find('script', {'data-business-unit-json-ld': 'true'})
    if script_tag is None:
        raise ValueError(f'No business-unit JSON-LD script tag found at {url}')
    return json.loads(script_tag.text.strip())






In [5]:
# Fetch every page concurrently; map() hands back the results lazily, in URL order.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(scrape_trustpilot_data, urls)

# Drain the lazy iterator into a list so the parsed pages can be reused by later cells.
page_results = list(results)


In [12]:
def _get_content_from_page_results(page_result,key):
    page_content = [i[key] for i in page_result['@graph'] if i['@type'] == 'Review']
    return page_content

In [42]:
def extract_nested_content(page_results):
    """
    Pull the review fields out of every scraped page, keeping one inner list per page.

    Args:
        page_results (list): Parsed page data, one item per scraped page.

    Returns:
        tuple: (reviews_nested, headlines_nested, ratings_nested, authors_nested),
        each a list containing one list of values per page.
    """
    def collect_per_page(key):
        # One inner list per page for the requested review field.
        return [_get_content_from_page_results(page_result, key) for page_result in page_results]

    return (
        collect_per_page('reviewBody'),
        collect_per_page('headline'),
        collect_per_page('reviewRating'),
        collect_per_page('author'),
    )
   

In [44]:
# Split the scraped pages into four per-page (nested) lists, one per review field.
reviews_nested, headlines_nested, ratings_nested, authors_nested = extract_nested_content(page_results)

In [48]:
def flatten_nested_content(reviews_nested, headlines_nested, ratings_nested, authors_nested):
    """
    Collapse the per-page lists into a single flat list for each review field.

    Args:
        reviews_nested (list): Review bodies, one inner list per page.
        headlines_nested (list): Headlines, one inner list per page.
        ratings_nested (list): Ratings, one inner list per page.
        authors_nested (list): Authors, one inner list per page.

    Returns:
        tuple: (reviews, headlines, ratings, authors) as flat lists, in page order.
    """
    def chain_pages(pages):
        # Concatenate the inner lists while preserving their order.
        merged = []
        for page_items in pages:
            merged.extend(page_items)
        return merged

    return (
        chain_pages(reviews_nested),
        chain_pages(headlines_nested),
        chain_pages(ratings_nested),
        chain_pages(authors_nested),
    )


In [59]:

    # Flatten the per-page lists. The headline target must be `headlines_flattened`
    # because get_dataframe_results() reads that exact name.
    reviews_flattened, headlines_flattened, ratings_flattened, authors_flattened = flatten_nested_content(
        reviews_nested, headlines_nested, ratings_nested, authors_nested)

In [61]:
def get_dataframe_results():
    """
    Assemble the flattened review lists into a single table.

    Reads the notebook-level lists `reviews_flattened`, `authors_flattened`,
    `headlines_flattened` and `ratings_flattened`.

    Returns:
        pandas.DataFrame: Columns 'headline' and 'review', followed by the author's
        'name' and 'url' and the rating's 'ratingValue'.
    """
    raw = pd.DataFrame({'review':reviews_flattened,
                        'author':authors_flattened,
                        'headline':headlines_flattened,
                        'ratings':ratings_flattened})
    # The author and rating cells hold nested records; expand only the keys we need.
    pieces = [
        raw[['headline', 'review']],
        transform_nested_dataframe_column(raw, 'author', ['name', 'url']),
        transform_nested_dataframe_column(raw, 'ratings', ['ratingValue']),
    ]
    return pd.concat(pieces, axis=1)

In [62]:
def transform_nested_dataframe_column(df, nested_col, new_cols):
    """
    Expand a column of nested records into separate columns and keep a subset of them.

    Args:
        df (pandas.DataFrame): Frame containing the nested column.
        nested_col (str): Name of the column whose cells are expanded with pd.Series.
        new_cols (list): Names of the expanded columns to keep, in output order.

    Returns:
        pandas.DataFrame: The selected expanded columns, indexed like `df`.
    """
    return df[nested_col].apply(pd.Series)[new_cols]

In [63]:
# Build the combined review table; as the cell's last expression it is rendered below.
get_dataframe_results()

Unnamed: 0,headline,review,name,url,ratingValue
0,Started using lions prep few months ago…,"Started using lions prep few months ago ,1-2 w...",Tommie Williams,https://uk.trustpilot.com/users/5cd94d8d01be99...,5
1,Box was delivered by DPD 4 miles away…,Box was delivered by DPD 4 miles away from my ...,Lily O'Hara,https://uk.trustpilot.com/users/607de1878b2df9...,1
2,Fantastic food,Fantastic food. I have had the breakfasts and ...,Emily Sayers,https://uk.trustpilot.com/users/59a0446f0000ff...,5
3,I am on week one of my Lion's Prep…,I am on week one of my Lion's Prep experience ...,J Rush,https://uk.trustpilot.com/users/622b480a017be6...,5
4,No more processed meals,Very convenient for myself and my carers as I ...,Roxanne Kent,https://uk.trustpilot.com/users/5d6aec95730974...,5
...,...,...,...,...,...
175,I waited a week to give a better review…,I waited a week to give a better review of the...,Anna Nsubuga,https://uk.trustpilot.com/users/63de61a7620112...,3
176,Delicious food can't fault it at all!,Delicious food can't fault it at all!,Lauren Danielle Forge,https://uk.trustpilot.com/users/6429a8df71677f...,5
177,Loving my lions prep meals,"Loving my lions prep meals, really delicious a...",Lisa,https://uk.trustpilot.com/users/634fa10ab1a8a8...,5
178,Sharp plastic in food and no refund,I cancelled my regular subscription with Lions...,Ryan,https://uk.trustpilot.com/users/5ff6de6978a852...,1
