In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import sys

from IPython.display import display, HTML
# Inject a CSS rule so elements with the `.container` class use the full width (cosmetic only).
display(HTML("<style>.container { width:100% !important; }</style>"))

In [13]:
def initialize_soup_for_page(page_number):
    """Download one page of Asurion's Trustpilot reviews and return its review cards.

        The reviews live at https://www.trustpilot.com/review/www.asurion.com and are
        spread over multiple pages. The first page has no query string; every other
        page is addressed with ``?page=<page_number>``.
    -----------------
       Parameters
    -----------------
    page_number : int, the page to download (1 for the first page)

    -----------------
        Returns
    -----------------
    The result of ``find_all`` on the parsed page: one ``div`` tag per review card
    (NOT the whole soup object). It is empty when no element carries the expected
    CSS classes.

    -----------------
        Raises
    -----------------
    Whatever ``urlopen`` raises when the page cannot be fetched.
    """

    base_url = "https://www.trustpilot.com/review/www.asurion.com"
    url = base_url if page_number == 1 else f"{base_url}?page={page_number}"

    # The soup is built while the response is still open; the `with` block then
    # closes the HTTP connection instead of leaking one per scraped page.
    with urlopen(url) as page:
        soup = bs(page, "html.parser")

    # The class string must match the site's generated class attribute exactly.
    evaluations = soup.find_all('div', attrs = {'class' : "styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ"})
    return evaluations
    
def insert_row(df, my_row):
    """Add ``my_row`` to ``df`` in place, stored under the label ``len(df)``.

    -----------------
       Parameters
    -----------------
    df : the DataFrame that receives the new row (it is modified in place)
    my_row : list of values for the new row; it must have one value per column of df

    -----------------
        Returns
    -----------------
    None
    """
    # The current row count is used as the label of the new row.
    next_label = df.shape[0]
    df.loc[next_label] = my_row

def _text_or_empty(tag):
    """Return the text of a tag, or '' when the tag was not found (None)."""
    return tag.text if tag is not None else ''


def _attr_or_empty(tag, attribute):
    """Return an attribute of a tag, or '' when the tag or the attribute is missing."""
    if tag is None:
        return ''
    value = tag.get(attribute)
    return value if value is not None else ''


def get_specific_data_for_page(page_number):
    """Extract the fields of interest from every review card of one page.

        The review cards are obtained with initialize_soup_for_page(); for each card
        the tags listed below are looked up and their values stored in a DataFrame.
        A field whose tag (or attribute) is missing is stored as an empty string,
        so one incomplete card no longer makes the whole page fail.
    -----------------
       Parameters
    -----------------
    page_number : int, forwarded unchanged to initialize_soup_for_page()
    -----------------
        Returns
    -----------------
    df_evals : DataFrame with one row per review card and the columns
            - titre           : comment title
            - nom             : name of the person who wrote the review
            - stars           : `alt` text of the first image in the card
            - localisation    : text of the span inside the details block
            - nb_reviews      : text of the reviews-count span
            - date_review     : `datetime` attribute of the first <time> tag
            - date_experience : text of the date-of-experience paragraph
            - comment         : text of the review body
    """

    columns = ['titre','nom','stars','localisation','nb_reviews','date_review','date_experience','comment']
    rows = []
    for e in initialize_soup_for_page(page_number):
        titre = _text_or_empty(e.find('h2', {'class': 'typography_heading-s__f7029 typography_appearance-default__AAY17'}))

        nom = _text_or_empty(e.find('span', {'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'}))

        # Look the tag up first: subscripting the result of find() directly
        # raises when the tag is absent, before any None check can run.
        stars = _attr_or_empty(e.find('img'), 'alt')

        details = e.find('div', {'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua'})
        localisation = _text_or_empty(details.find('span') if details is not None else None)

        nb_reviews = _text_or_empty(e.find('span', {'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l'}))

        date_review = _attr_or_empty(e.find('time'), 'datetime')

        date_experience = _text_or_empty(e.find('p',{'class':'typography_body-m__xgxZ_ typography_appearance-default__AAY17'}))

        comment = _text_or_empty(e.find('p', {'class': 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'}))

        rows.append([titre,nom,stars,localisation,nb_reviews,date_review,date_experience,comment])

    # Build the frame once instead of enlarging it row by row.
    df_evals = pd.DataFrame(rows, columns = columns)
    return df_evals

In [3]:
def get_specific_data_for_specific_pages(first_page = 1,last_page = 4590, wait_seconds = 480):
    """Scrape a range of pages and return all their reviews in one DataFrame.

        Each page is scraped with get_specific_data_for_page().

        When a page fails (for example because too much data was requested in a
        short time, or because the page does not exist) the function waits
        `wait_seconds` seconds and retries that page once. If the retry fails too,
        scraping stops and the data collected so far is returned, so nothing that
        was already scraped is lost and a too-large `last_page` cannot block forever.
    -----------------
       Parameters
    -----------------
    first_page = 1 : the first page to scrape
    last_page = 4590 : upper bound of the page range; like `range()`, this page
                       itself is NOT scraped (pages first_page .. last_page - 1 are)
    wait_seconds = 480 : how long to wait before retrying a failed page
    -----------------
        Returns
    -----------------
    df_evals : DataFrame containing the concatenated data of every scraped page
               (empty, with the expected columns, when nothing was scraped)

    """
    columns = ['titre','nom','stars','localisation','nb_reviews','date_review','date_experience','comment']
    page_frames = []
    for n in range(first_page,last_page):
        try :
            page_frames.append(get_specific_data_for_page(n))
        # `Exception` (not a bare except) so that interrupting the kernel
        # stops the run instead of starting the countdown.
        except Exception:
            print(f"For some reason data scraping couldn't be executed for page {n}.")
            print("Maybe the page we selected to scrape doesn't exist.")
            print("Or we reached the amount of data we could scrape within in a specific time period.")
            print("Let's wait a few minutes and start scraping again.")
            print("==="*20)
            print(f"Countdown {wait_seconds / 60:g} minutes :")
            print("==="*20)
            for i in range(wait_seconds,0,-1):
                time.sleep(1)
                sys.stdout.write(str(i)+', ')
            try:
                page_frames.append(get_specific_data_for_page(n))
            except Exception as error:
                # Second failure in a row: give up, but keep what we already have.
                print(f"Page {n} still could not be scraped after waiting ({error!r}).")
                print("Stopping here and returning the data collected so far.")
                break
            print(f"Page {n} was succesfully scraped.")
        else:
            if n%10 == 0:
                print(f"Page {n} was succesfully scraped.")

    # Concatenate once at the end (concatenating inside the loop copies all
    # previous rows on every page).
    non_empty_frames = [frame for frame in page_frames if not frame.empty]
    if not non_empty_frames:
        return pd.DataFrame(columns = columns)
    return pd.concat(non_empty_frames)

In [7]:
#df1 = get_specific_data_for_specific_pages(first_page = 1, last_page = 1000)
#df1.to_csv('asurion1.csv', index=False)

In [9]:
#df2 = get_specific_data_for_specific_pages(first_page = 1000, last_page = 2000)
#df2.to_csv('asurion2.csv', index=False)

In [11]:
#df3 = get_specific_data_for_specific_pages(first_page = 2000, last_page = 3000)
#df3.to_csv('asurion3.csv', index=False)

In [13]:
#df4 = get_specific_data_for_specific_pages(first_page = 3000, last_page = 4000)
#df4.to_csv('asurion4.csv', index=False)

In [14]:
# Last batch of pages; the result is saved to its own CSV file.
df5 = get_specific_data_for_specific_pages(
    first_page=4000,
    last_page=4590,  # careful about the value of last_page
)
df5.to_csv('asurion5.csv', index=False)

Page 4000 was succesfully scraped.
Page 4010 was succesfully scraped.
Page 4020 was succesfully scraped.
Page 4030 was succesfully scraped.
Page 4040 was succesfully scraped.
Page 4050 was succesfully scraped.
Page 4060 was succesfully scraped.
Page 4070 was succesfully scraped.
Page 4080 was succesfully scraped.
Page 4090 was succesfully scraped.
Page 4100 was succesfully scraped.
Page 4110 was succesfully scraped.
Page 4120 was succesfully scraped.
Page 4130 was succesfully scraped.
Page 4140 was succesfully scraped.
Page 4150 was succesfully scraped.
Page 4160 was succesfully scraped.
Page 4170 was succesfully scraped.
Page 4180 was succesfully scraped.
For some reason data scraping couldn't be executed for page 4186.
Maybe the page we selected to scrape doesn't exist.
Or we reached the amount of data we could scrape within in a specific time period.
Let's wait a few minutes and start scraping again.
Countdown 8 minutes :
480, 479, 478, 477, 476, 475, 474, 473, 472, 471, 470, 469, 4

# `WARNING`:

If `last_page` is set higher than the number of pages that actually exist, the program will run indefinitely and nothing will be assigned to the variable `df5`.

This case still needs to be handled so that the program stops automatically when it reaches the last page.

In [15]:
#df = pd.concat([df1,df2,df3,df4,df5])\
#    .reset_index(drop=True) \
#    .drop_duplicates()

The data wasn't scraped all at once, so we collected it in batches of 1000 pages.

We import these into a final df.

In [8]:
# Read every per-batch CSV, stack them, drop exact duplicate rows
# and renumber the index from 0.
data_files = ['asurion1.csv', 'asurion2.csv', 'asurion3.csv', 'asurion4.csv', 'asurion5.csv']
df = (
    pd.concat([pd.read_csv(filename) for filename in data_files])
    .drop_duplicates()
    .reset_index(drop=True)
)
print(df.shape)

(91773, 8)


In [15]:
# Show the combined DataFrame as the cell's output.
df

Unnamed: 0,titre,nom,stars,localisation,nb_reviews,date_review,date_experience,comment
0,Fast efficient and simple process for repair,Debbie Denmark,Rated 5 out of 5 stars,US,1 review,2023-05-24T19:47:54.000Z,"Date of experience: May 20, 2023",My husband cracked his iPhone screen on a Satu...
1,A+ service,Brandon,Rated 5 out of 5 stars,US,1 review,2023-05-24T21:56:18.000Z,"Date of experience: May 24, 2023",I was having difficulty with a tablet being se...
2,Excellent service,Kevin P.,Rated 5 out of 5 stars,US,1 review,2023-05-24T20:18:21.000Z,"Date of experience: May 18, 2023",With any insurance claim you obviously have to...
3,I was out fishing on the lake another…,John Frazzini,Rated 5 out of 5 stars,US,3 reviews,2023-05-24T18:38:46.000Z,"Date of experience: May 19, 2023",I was out fishing on the lake another boat sid...
4,Everything went very well,JOSEPHINE,Rated 5 out of 5 stars,US,1 review,2023-05-24T17:02:02.000Z,"Date of experience: May 20, 2023",Everything went very well except I was confuse...
...,...,...,...,...,...,...,...,...
91768,Verizon replacement,RUTH P,Rated 3 out of 5 stars,US,1 review,2016-10-24T22:37:03.000Z,"Date of experience: October 24, 2016",Prongs were bent on first phone. I had to pay...
91769,Replacement cost,THOMAS D,Rated 4 out of 5 stars,US,1 review,2016-10-24T21:49:43.000Z,"Date of experience: October 24, 2016","Then cost of a rebuilt phone is high, consider..."
91770,Great,PEGGY,Rated 4 out of 5 stars,US,1 review,2016-10-24T21:20:12.000Z,"Date of experience: October 24, 2016",Good customer service! Thanks
91771,As always - an easy transaction,Casey S.,Rated 5 out of 5 stars,US,1 review,2016-10-24T20:35:26.000Z,"Date of experience: October 24, 2016",I have teenage boys who are not careful with t...


In [16]:
# Write the combined DataFrame to a single CSV file (the index is not written).
df.to_csv('asurion_complete.csv',index=False)

In [18]:
# Write the combined DataFrame to a single JSON file.
df.to_json('asurion_complete.json')