In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
import sys

In [3]:
def initialize_soup_for_page(page_number):
    """Download one page of Asurion reviews from Trustpilot and return its review cards.

    The reviews live at https://www.trustpilot.com/review/www.asurion.com and are
    spread over several pages; ``page_number`` selects which page is fetched.
    -----------------
       Parameters
    -----------------
    page_number : int, from 1 up to the largest existing page number for the company.
        Page 1 is requested through the bare review URL; every other value is
        sent as the ``?page=`` query parameter.

    -----------------
        Returns
    -----------------
    The result of ``find_all`` on the parsed page: the ``div`` tags selected by the
    review-card ``class`` value below (one tag per review card). Despite the
    function name, the soup object itself is not returned.
    """
    url = "https://www.trustpilot.com/review/www.asurion.com"
    if page_number != 1:
        url = f"{url}?page={page_number}"

    # Parse inside the ``with`` block so the HTTP response is closed as soon as
    # the document has been read, even if parsing raises.
    with urlopen(url) as page:
        soup = bs(page, "html.parser")

    # NOTE(review): these class names look build-generated and will presumably
    # change when the site is redeployed - confirm they still match before a run.
    evaluations = soup.find_all('div', attrs={'class': "styles_cardWrapper__LcCPA styles_show__HUXRb styles_reviewCard__9HxJJ"})
    return evaluations
    
def insert_row(df, my_row):
    """Append a list to an existing DataFrame as its new last row (in place).

    The new row is stored under the label equal to the current number of rows.
    -----------------
       Parameters
    -----------------
    df : the DataFrame that receives the new row; it is modified in place
    my_row : the list to insert; it must have one value per column of ``df``

    -----------------
        Returns
    -----------------
    None
    """
    next_label = df.shape[0]
    df.loc[next_label] = my_row

def _text_or_empty(tag):
    """Return ``tag.text``, or an empty string when ``tag`` is None."""
    return tag.text if tag is not None else ''


def get_specific_data_for_page(page_number):
    """Scrape one Trustpilot review page into a DataFrame, one row per review card.

    Every field falls back to an empty string when its element is missing from
    a card, so a single incomplete card no longer aborts the whole page.
    -----------------
       Parameters
    -----------------
    page_number : int, forwarded unchanged to ``initialize_soup_for_page``

    -----------------
        Returns
    -----------------
    DataFrame with the columns 'titre', 'nom', 'stars', 'localisation',
    'nb_reviews', 'date' and 'comment'. It has the columns but no rows when
    the page yields no review cards.
    """
    columns = ['titre', 'nom', 'stars', 'localisation', 'nb_reviews', 'date', 'comment']

    # Collect plain lists and build the frame once instead of growing it row by row.
    rows = []
    for e in initialize_soup_for_page(page_number):
        titre = _text_or_empty(e.find('h2', {'class': 'typography_heading-s__f7029 typography_appearance-default__AAY17'}))

        nom = _text_or_empty(e.find('span', {'class': 'typography_heading-xxs__QKBS8 typography_appearance-default__AAY17'}))

        # The rating is read from the ``alt`` text of the first image in the card;
        # both the image and the attribute may be absent.
        img_ = e.find('img')
        stars = img_.get('alt', '') if img_ is not None else ''

        # The location is the first <span> inside the details <div>; guard the
        # outer lookup too, otherwise a missing <div> raises AttributeError.
        details_ = e.find('div', {'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l styles_detailsIcon__Fo_ua'})
        localisation = _text_or_empty(details_.find('span') if details_ is not None else None)

        nb_reviews = _text_or_empty(e.find('span', {'class': 'typography_body-m__xgxZ_ typography_appearance-subtle__8_H2l'}))

        date = _text_or_empty(e.find('time'))

        comment = _text_or_empty(e.find('p', {'class': 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'}))

        rows.append([titre, nom, stars, localisation, nb_reviews, date, comment])

    return pd.DataFrame(rows, columns=columns)

In [4]:
def get_specific_data_for_specific_pages(first_page=1,last_page=5000):
    """Scrape a range of review pages and return all reviews in one DataFrame.

    TODO: the maximum page number should be determined automatically.

    When a page cannot be scraped, the function prints an explanation, counts
    down 8 minutes and retries that page once. An error raised by the retry is
    not caught and propagates to the caller. KeyboardInterrupt and SystemExit
    are never intercepted, so the run can be aborted immediately.
    -----------------
       Parameters
    -----------------
    first_page : int, first page number to scrape (inclusive)
    last_page : int, end of the range (exclusive): pages are taken from
        ``range(first_page, last_page)``, so ``last_page`` itself is not scraped

    -----------------
        Returns
    -----------------
    DataFrame with the columns 'titre', 'nom', 'stars', 'localisation',
    'nb_reviews', 'date' and 'comment', holding the rows of every scraped page
    under a fresh 0..n-1 index. It has the columns but no rows when the page
    range is empty.
    """
    columns = ['titre','nom','stars','localisation','nb_reviews','date','comment']

    # Gather the per-page frames and concatenate once at the end; concatenating
    # inside the loop re-copies all previously scraped rows on every page.
    frames = []
    for n in range(first_page,last_page):
        try:
            frames.append(get_specific_data_for_page(n))
            if n % 10 == 0:
                print(f"Page {n} was succesfully scraped.")
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C is not
            # mistaken for a scraping failure and answered with an 8-minute wait.
            print(f"For some reason data scraping couldn't be executed for page {n}.")
            print("Maybe the page we selected to scrape doesn't exist.")
            print("Or we reached the amount of data we could scrape within in a time period.")
            print("Let's wait a few minutes and start scraping again.")
            print("==="*20)
            print("Countdown 8 minutes :")
            print("==="*20)
            for i in range(480,0,-1):
                time.sleep(1)
                sys.stdout.write(str(i)+', ')
            # Single retry after the pause; a second failure is raised to the caller.
            frames.append(get_specific_data_for_page(n))
            print(f"Page {n} was succesfully scraped.")

    if not frames:
        return pd.DataFrame(columns=columns)
    # ignore_index avoids the duplicate row labels that result from stacking
    # per-page frames which each start again at label 0.
    return pd.concat(frames, ignore_index=True)

In [5]:
# Scrape pages 1 through 99: the helper iterates range(first_page, last_page),
# so the page given as last_page is not included.
df = get_specific_data_for_specific_pages(first_page = 1, last_page = 100)

Page 10 was succesfully scraped.
Page 20 was succesfully scraped.
Page 30 was succesfully scraped.
Page 40 was succesfully scraped.
Page 50 was succesfully scraped.
Page 60 was succesfully scraped.
Page 70 was succesfully scraped.
Page 80 was succesfully scraped.
Page 90 was succesfully scraped.


In [6]:
# Print the (rows, columns) shape, write the reviews to asurion.csv without the
# index column, then display the first rows as the cell's output.
print(df.shape)
df.to_csv('asurion.csv', index=False)
df.head()

(1980, 7)


Unnamed: 0,titre,nom,stars,localisation,nb_reviews,date,comment
0,My screen shattered and I was leaving…,Connie G,Rated 5 out of 5 stars,US,1 review,17 hours ago,"My screen shattered and I was leaving town, I ..."
1,I Recommend Purchasing a Asurion Plan,Estelle F,Rated 5 out of 5 stars,US,8 reviews,18 hours ago,I make a good deal of purchases on Amazon.com ...
2,I know sometimes that you think that…,Kimberly,Rated 5 out of 5 stars,US,1 review,12 hours ago,I know sometimes that you think that getting a...
3,Glad a bought the warranty!!!,Melanie,Rated 5 out of 5 stars,US,1 review,15 hours ago,Christa live chatted with me in a quick and ef...
4,Asurion did me a great service!!!,William Gunter,Rated 5 out of 5 stars,US,3 reviews,17 hours ago,"Everything was simple doing it all online, pro..."


In [None]:
# Manual check: scrape a single page and preview the result.
# This cell used to repeat the body of get_specific_data_for_page line for line;
# calling the function keeps the check in sync with the real scraper.
# For any page other than 1 the function requests the same "?page=" URL.
page_num = 250
df_evals = get_specific_data_for_page(page_num)

df_evals.head()