In [1]:
import requests
import random
from bs4 import BeautifulSoup
import pandas as pd
import json
import concurrent.futures


# TrustPilot

In [54]:


class TrustPilotScraper:
    """
    A class for scraping reviews from Trustpilot and transforming them into a dataframe.
    """

    def __init__(self, url, num_pages):
        """
        Initialize the TrustPilotScraper object with the Trustpilot URL and number of pages to scrape.

        Args:
            url (str): The URL of the Trustpilot page to scrape.
            num_pages (int): The number of pages to scrape.
        """
        self.url = url
        self.url_company = url.split("/")[-1].split(".")[0]
        self.num_pages = num_pages

    def scrape_reviews(self):
        """
        Scrape reviews from the Trustpilot URL for the specified number of pages.

        Returns:
            tuple: A tuple containing lists of reviews, headlines, ratings, and authors.
        """
        base_url = self.url + '?page='
        urls = [base_url + str(i) for i in range(1, self.num_pages + 1)]

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = executor.map(self.__scrape_trustpilot_data, urls)

        page_results = list(results)

        reviews, headlines, ratings, authors, dates = [], [], [], [], []
        for page_result in page_results:
            page_reviews, page_headlines, page_ratings, page_authors, page_dates = self.__extract_nested_content(page_result)
            reviews.extend(page_reviews)
            headlines.extend(page_headlines)
            ratings.extend(page_ratings)
            authors.extend(page_authors)
            dates.extend(page_dates)
        self.page_results = page_result

        return reviews, headlines, ratings, authors, dates

    def __scrape_trustpilot_data(self, url):
        """
        Private method to scrape data from a given Trustpilot URL.

        Args:
            url (str): The URL to scrape.

        Returns:
            dict: The scraped JSON data.
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error scraping URL: {url}\n{str(e)}")
            return None

        try:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', {'data-business-unit-json-ld': 'true'})
            json_str = script_tag.text.strip()
            json_data = json.loads(json_str)
            return json_data
        except (AttributeError, ValueError, KeyError) as e:
            print(f"Error parsing JSON data for URL: {url}\n{str(e)}")
            return None

    def __get_content_from_page_results(self, page_result, key):
        """
        Private method to extract nested content from the scraped JSON data.

        Args:
            page_result (dict): The scraped JSON data for a single page.
            key (str): The key to extract from the JSON data.

        Returns:
            list: A list of extracted content.
        """
        if page_result and '@graph' in page_result:
            page_content = [i[key] for i in page_result['@graph'] if i.get('@type') == 'Review']
            return page_content
        return []

    def __extract_nested_content(self, page_result):
        """
        Private method to extract nested content from a list of scraped JSON data.

        Args:
            page_results (list): A list of scraped JSON data for multiple pages.

        Returns:
            tuple: A tuple containing lists of nested reviews, headlines, ratings, and authors.
        """
        reviews = self.__get_content_from_page_results(page_result, 'reviewBody')
        headlines = self.__get_content_from_page_results(page_result, 'headline')
        ratings = self.__get_content_from_page_results(page_result, 'reviewRating')
        authors = self.__get_content_from_page_results(page_result, 'author')
        dates = self.__get_content_from_page_results(page_result, 'datePublished')
        return reviews, headlines, ratings, authors, dates

    def get_dataframe_results(self, reviews, headlines, ratings, authors, dates):
        """
        Generate a dataframe from the scraped review data.

        Args:
            reviews (list): List of review content.
            headlines (list): List of review headlines.
            ratings (list): List of review ratings.
            authors (list): List of review authors.

        Returns:
            pandas.DataFrame: The dataframe containing the review data.
        """
        df = pd.DataFrame({'review': reviews, 
                           'author': authors, 
                           'headline': headlines, 
                           'ratings': ratings,
                           'date':dates})
        df['company'] = self.url_company
        df_author = self.transform_nested_dataframe_column(df, 'author', ['name', 'url'])
        df_rating = self.transform_nested_dataframe_column(df, 'ratings', ['ratingValue'])
        df['company'] = self.url_company
        return pd.concat([df[['company','date','headline', 'review']], df_author, df_rating], axis=1)


    @staticmethod
    def transform_nested_dataframe_column(df, nested_col, new_cols):
        """
        Transform a nested column in the dataframe into separate columns.

        Args:
            df (pandas.DataFrame): The input dataframe.
            nested_col (str): The name of the nested column to transform.
            new_cols (list): The names of the new columns to create.

        Returns:
            pandas.DataFrame: The transformed dataframe with the new columns.
        """
        df_flat = df[nested_col].apply(pd.Series)[new_cols]
        return df_flat


    def run(self):
        """
        Run the TrustPilotScraper to scrape reviews and generate the dataframe.

        Returns:
            pandas.DataFrame: The dataframe containing the scraped review data.
        """
        reviews, headlines, ratings, authors, dates = self.scrape_reviews()
        df = self.get_dataframe_results(reviews, headlines, ratings, authors,dates)
        return df


In [50]:
tp = TrustPilotScraper('https://uk.trustpilot.com/review/gymshark.com',5)

In [51]:
df = tp.run()

In [52]:
tp.page_results

{'@context': 'https://schema.org',
 '@graph': [{'@type': 'Organization',
   '@id': 'https://www.trustpilot.com/#/schema/Organization/1',
   'name': 'Trustpilot',
   'legalName': 'Trustpilot A/S',
   'url': 'https://www.trustpilot.com',
   'description': 'Read reviews. Write reviews. Find companies.',
   'sameAs': ['https://en.wikipedia.org/wiki/Trustpilot',
    'https://www.facebook.com/Trustpilot/',
    'https://www.instagram.com/trustpilot/',
    'https://www.linkedin.com/company/trustpilot',
    'https://twitter.com/Trustpilot',
    'https://www.youtube.com/c/trustpilotreviews'],
   'logo': {'@id': 'https://www.trustpilot.com/#/schema/ImageObject/Logo/1'},
   'email': 'support@trustpilot.com',
   'address': {'@type': 'PostalAddress',
    '@id': 'https://www.trustpilot.com/#/schema/PostalAddress/DK',
    'streetAddress': 'Pilestræde 58, 5th floor',
    'addressLocality': 'Copenhagen',
    'addressCountry': 'DK',
    'postalCode': '1112 København'}},
  {'@type': 'ImageObject',
   '@id

In [53]:
df

Unnamed: 0,company,date,headline,review,name,url,ratingValue
0,gymshark,2023-08-11T09:39:01.000Z,Efficient delivery and excellent…,Efficient delivery and excellent customer serv...,Miss Lucy Craven,https://uk.trustpilot.com/users/5d114d4a6d82d1...,4
1,gymshark,2023-08-12T13:21:23.000Z,Worst Customer Service Of ALL Time! Avoid.,"I have ordered thousands from this site and ""u...",Conor,https://uk.trustpilot.com/users/5d3ade847576c4...,1
2,gymshark,2023-08-11T13:45:39.000Z,Gymshark are incredible,Gymshark have always given such incredible cus...,Steph Webb,https://uk.trustpilot.com/users/5cb05052499381...,5
3,gymshark,2023-08-11T14:00:05.000Z,I ordered something online and received…,I ordered something online and received someth...,Hadar,https://uk.trustpilot.com/users/64d6229054eb0b...,1
4,gymshark,2023-08-11T22:21:08.000Z,Order cancelled 3 times.,I have ordered a hoodie 3 times now as a gift ...,Nicole,https://uk.trustpilot.com/users/5e4854177168c0...,1
...,...,...,...,...,...,...,...
95,gymshark,2023-07-26T17:57:45.000Z,Quality is excellent and fantastic…,Quality is excellent and fantastic customer se...,Billie-Jo Dearden,https://uk.trustpilot.com/users/5e98a83ba222f4...,5
96,gymshark,2023-07-26T18:29:22.000Z,All items were too small/tight even…,All items were too small/tight even though I'd...,BM,https://uk.trustpilot.com/users/64c149e1a1676b...,1
97,gymshark,2023-07-30T10:18:59.000Z,Great service,"Great service, great product. Gym leggings fit...",Lisa,https://uk.trustpilot.com/users/5d7d485aab5050...,5
98,gymshark,2023-07-23T21:22:19.000Z,Terrible,Terrible! 1st time- my parcel got “lost” but ...,Gosia Grzesiak,https://uk.trustpilot.com/users/5eee6d5c63c027...,1


In [49]:
df.to_csv('/Users/saho/Documents/sam/skllm/data/test_data.csv',index=False)

In [16]:
import requests
import concurrent.futures
import json
import pandas as pd
from bs4 import BeautifulSoup
import logging

class TrustPilotScraper:
    """
    A class for scraping reviews from Trustpilot and transforming them into a dataframe.
    """

    def __init__(self, url, num_pages):
        """
        Initialize the TrustPilotScraper object with the Trustpilot URL and number of pages to scrape.

        Args:
            url (str): The URL of the Trustpilot page to scrape.
            num_pages (int): The number of pages to scrape.
        """
        self.url = url
        self.url_company = url.split("/")[-1].split(".")[0]
        self.num_pages = num_pages

        # Configure logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def scrape_reviews(self):
        """
        Scrape reviews from the Trustpilot URL for the specified number of pages.

        Returns:
            tuple: A tuple containing lists of reviews, headlines, ratings, and authors.
        """
        base_url = self.url + '?page='
        urls = [base_url + str(i) for i in range(1, self.num_pages + 1)]

        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = executor.map(self.__scrape_trustpilot_data, urls)

        page_results = list(results)

        reviews, headlines, ratings, authors = [], [], [], []
        for page_result in page_results:
            page_reviews, page_headlines, page_ratings, page_authors = self.__extract_nested_content(page_result)
            reviews.extend(page_reviews)
            headlines.extend(page_headlines)
            ratings.extend(page_ratings)
            authors.extend(page_authors)

        return reviews, headlines, ratings, authors

    def __scrape_trustpilot_data(self, url):
        """
        Private method to scrape data from a given Trustpilot URL.

        Args:
            url (str): The URL to scrape.

        Returns:
            dict: The scraped JSON data.
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Error scraping URL: {url}\n{str(e)}")
            return None

        try:
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', {'data-business-unit-json-ld': 'true'})
            json_str = script_tag.text.strip()
            json_data = json.loads(json_str)
            return json_data
        except (AttributeError, ValueError, KeyError) as e:
            self.logger.error(f"Error parsing JSON data for URL: {url}\n{str(e)}")
            return None

    def __get_content_from_page_results(self, page_result, key):
        """
        Private method to extract nested content from the scraped JSON data.

        Args:
            page_result (dict): The scraped JSON data for a single page.
            key (str): The key to extract from the JSON data.

        Returns:
            list: A list of extracted content.
        """
        if page_result and '@graph' in page_result:
            page_content = [i[key] for i in page_result['@graph'] if i.get('@type') == 'Review']
            return page_content
        return []

    def __extract_nested_content(self, page_result):
        """
        Private method to extract nested content from a list of scraped JSON data.

        Args:
            page_results (list): A list of scraped JSON data for multiple pages.

        Returns:
            tuple: A tuple containing lists of nested reviews, headlines, ratings, and authors.
        """
        reviews = self.__get_content_from_page_results(page_result, 'reviewBody')
        headlines = self.__get_content_from_page_results(page_result, 'headline')
        ratings = self.__get_content_from_page_results(page_result, 'reviewRating')
        authors = self.__get_content_from_page_results(page_result, 'author')
        return reviews, headlines, ratings, authors

    def transform_nested_dataframe_column(self, df, nested_col, new_cols):
        """
        Transform a nested column in the dataframe into separate columns.

        Args:
            df (pandas.DataFrame): The input dataframe.
            nested_col (str): The name of the nested column to transform.
            new_cols (list): The names of the new columns to create.

        Returns:
            pandas.DataFrame: The transformed dataframe with the new columns.
        """
        df_flat = df[nested_col].apply(pd.Series)[new_cols]
        return df_flat

    def get_dataframe_results(self, reviews, headlines, ratings, authors):
        """
        Generate a dataframe from the scraped review data.

        Args:
            reviews (list): List of review content.
            headlines (list): List of review headlines.
            ratings (list): List of review ratings.
            authors (list): List of review authors.

        Returns:
            pandas.DataFrame: The dataframe containing the review data.
        """
        df = pd.DataFrame({'review': reviews, 'author': authors, 'headline': headlines, 'ratings': ratings})
        df['company'] = self.url_company
        df_author = self.transform_nested_dataframe_column(df, 'author', ['name', 'url'])
        df_rating = self.transform_nested_dataframe_column(df, 'ratings', ['ratingValue'])
        df['company'] = self.url_company
        return pd.concat([df[['headline', 'review', 'company']], df_author, df_rating], axis=1)

    def run(self):
        """
        Run the TrustPilotScraper to scrape reviews and generate the dataframe.

        Returns:
            pandas.DataFrame: The dataframe containing the scraped review data.
        """
        reviews, headlines, ratings, authors = self.scrape_reviews()
        df = self.get_dataframe_results(reviews, headlines, ratings, authors)
        return df

# Usag
scraper = TrustPilotScraper(url='https://uk.trustpilot.com/review/gymshark.com', num_pages=10)
df = scraper.run()
print(df)


                                       headline  \
0             Efficient delivery and excellent…   
1    Worst Customer Service Of ALL Time! Avoid.   
2                       Gymshark are incredible   
3      I ordered something online and received…   
4                      Order cancelled 3 times.   
..                                          ...   
195                                  Best ever!   
196                                 I hate evri   
197                              Quick delivery   
198                               Great product   
199                                   Girls bag   

                                                review   company  \
0    Efficient delivery and excellent customer serv...  gymshark   
1    I have ordered thousands from this site and "u...  gymshark   
2    Gymshark have always given such incredible cus...  gymshark   
3    I ordered something online and received someth...  gymshark   
4    I have ordered a hoodie 3 times now as a g

In [17]:
df

Unnamed: 0,headline,review,company,name,url,ratingValue
0,Efficient delivery and excellent…,Efficient delivery and excellent customer serv...,gymshark,Miss Lucy Craven,https://uk.trustpilot.com/users/5d114d4a6d82d1...,4
1,Worst Customer Service Of ALL Time! Avoid.,"I have ordered thousands from this site and ""u...",gymshark,Conor,https://uk.trustpilot.com/users/5d3ade847576c4...,1
2,Gymshark are incredible,Gymshark have always given such incredible cus...,gymshark,Steph Webb,https://uk.trustpilot.com/users/5cb05052499381...,5
3,I ordered something online and received…,I ordered something online and received someth...,gymshark,Hadar,https://uk.trustpilot.com/users/64d6229054eb0b...,1
4,Order cancelled 3 times.,I have ordered a hoodie 3 times now as a gift ...,gymshark,Nicole,https://uk.trustpilot.com/users/5e4854177168c0...,1
...,...,...,...,...,...,...
195,Best ever!,...........,gymshark,Aaron Hammond,https://uk.trustpilot.com/users/59d785960000ff...,5
196,I hate evri,Good fit ngl ong,gymshark,Callum Chambers,https://uk.trustpilot.com/users/64c3e24eb6a350...,5
197,Quick delivery,Quick delivery,gymshark,Haulwen,https://uk.trustpilot.com/users/56ce3d3e0000ff...,5
198,Great product,Great product,gymshark,will waddell,https://uk.trustpilot.com/users/642598b1a69d19...,5
