In [1]:
import re

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Chrome options shared by every browser session: open_browser() passes this
# object to webdriver.Chrome.
options = Options()
options.add_argument('--headless')  # run Chrome without a visible window
options.add_argument('--disable-gpu')  # ask Chrome not to use GPU acceleration

In [2]:
# function to check if page has loaded
def page_has_loaded(driver):
    """Report whether the document in the browser has finished loading.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).

    Returns:
        True when ``document.readyState`` is ``'complete'``, otherwise False.
        This is a single check; the function does not wait or retry.
    """
    ready_state = driver.execute_script('return document.readyState;')
    if ready_state == 'complete':
        return True
    return False

# function to open browser
def open_browser(url, timeout=5):
    """Start Chrome with the module-level ``options`` and load ``url``.

    Args:
        url: Address of the first page to open.
        timeout: Seconds used both as the implicit wait for element lookups
            and as the upper bound when waiting for the page to finish
            loading. Defaults to 5, the value previously hard-coded.

    Returns:
        The WebDriver, positioned on ``url`` with the document fully loaded.

    Raises:
        Any exception raised while navigating or waiting (for example
        selenium's TimeoutException when the page does not reach the
        'complete' state in time). The browser is shut down before the
        exception propagates, so a failed start does not leak a process.
    """
    driver = webdriver.Chrome(
        service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.implicitly_wait(timeout)
        driver.get(url)
        # Actually wait on the readiness check instead of calling it once and
        # discarding the result. The condition runs a script rather than
        # locating elements, so the implicit wait does not affect it.
        WebDriverWait(driver, timeout).until(page_has_loaded)
    except Exception:
        # The caller never receives the driver on failure, so nobody else
        # could clean it up.
        driver.quit()
        raise
    return driver

# function to get quotes from page
def get_quotes_from_page(driver):
    """Collect every quote on the page the driver is currently showing.

    Args:
        driver: Object exposing ``find_elements`` (a Selenium WebDriver).

    Returns:
        A list with one dict per ``div.quote`` element, in page order. Each
        dict has the keys ``'text'``, ``'author'`` and ``'tags'``; ``'tags'``
        is a single string with the tag names joined by ``', '`` (empty when
        the quote has no tags).
    """
    quotes_list = []
    for quote in driver.find_elements(By.XPATH, "//div[@class='quote']"):
        quote_text = quote.find_element(By.XPATH, ".//span[@class='text']").text
        quote_author = quote.find_element(By.XPATH, ".//span/small").text
        tag_links = quote.find_elements(By.XPATH, ".//div[@class='tags']/a")
        # Join the tag names directly. Going through str(list) and stripping
        # brackets/quotes with a regex mangles any tag that itself contains
        # an apostrophe or a square bracket.
        quote_tags = ', '.join(tag.text for tag in tag_links)

        quotes_list.append({
            'text': quote_text,
            'author': quote_author,
            'tags': quote_tags
        })

    return quotes_list

# function to go to next page
def next_page(driver):
    """Follow the "next" pagination link on the current page.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        True once the link has been clicked and the new document reports
        itself fully loaded. False when the page has no "next" link or when
        navigating failed; in both cases the browser window is closed before
        returning, so the driver must not be used afterwards.
    """
    try:
        next_button = driver.find_element(By.XPATH, "//li[@class='next']/a")
        # Announce the navigation only once the link is known to exist.
        print('Going to next page')
        next_button.click()
        # Wait on the readiness check rather than calling it once and
        # ignoring what it returns.
        WebDriverWait(driver, 5).until(page_has_loaded)
        return True
    except NoSuchElementException:
        # The last page simply has no "next" link.
        print('No more pages')
    except Exception as exc:
        # Any other failure is a real error; report it instead of
        # mislabelling it as the end of pagination.
        print(f'Could not go to next page: {exc!r}')
    driver.close()
    return False

In [3]:
# First page handed to open_browser(); later pages are reached via next_page().
url = 'http://quotes.toscrape.com/'

In [4]:
quotes = []
page_num = 1
max_pages = 10  # upper bound on pages to scrape (set to 2 for a quick test run)

driver = open_browser(url)
try:
    # loop through pages
    while True:

        # get quotes from page
        quotes.extend(get_quotes_from_page(driver))

        # print info
        print(f'URL: {driver.current_url}')
        print(f'Page {page_num} scraped')
        print(f'Number of quotes scraped: {len(quotes)}')

        page_num += 1

        # check if max pages reached
        if page_num > max_pages:
            break

        # go to next page or stop; next_page() signals "no more pages" through
        # its return value, so that value has to be checked. Otherwise the
        # loop would try to scrape a browser that is already closed.
        try:
            has_next = next_page(driver)
        except Exception as exc:
            # Keep the quotes gathered so far, but say why scraping stopped
            # instead of swallowing the error silently.
            print(f'Stopping: could not go to next page ({exc!r})')
            break

        if not has_next:
            break

        print('-------------------------')
finally:
    # quit() ends the whole session (browser and chromedriver process) and
    # runs even when scraping raises; close() only closes the current window.
    driver.quit()

URL: http://quotes.toscrape.com/
Page 1 scraped
Number of quotes scraped: 10
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/2/
Page 2 scraped
Number of quotes scraped: 20
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/3/
Page 3 scraped
Number of quotes scraped: 30
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/4/
Page 4 scraped
Number of quotes scraped: 40
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/5/
Page 5 scraped
Number of quotes scraped: 50
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/6/
Page 6 scraped
Number of quotes scraped: 60
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/7/
Page 7 scraped
Number of quotes scraped: 70
Going to next page
-------------------------
URL: http://quotes.toscrape.com/page/8/
Page 8 scraped
Number of quotes scraped: 80
Going to next page
-

In [5]:
import pandas as pd

In [6]:
# One row per scraped quote; the columns come from the dict keys produced by
# get_quotes_from_page ('text', 'author', 'tags'). Preview the first rows.
df = pd.DataFrame(quotes)
df.head()

Unnamed: 0,text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"change, deep-thoughts, thinking, world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities, choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational, life, live, miracle, miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy, books, classic, humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself, inspirational"


In [7]:
# (number of rows, number of columns) of the scraped table
df.shape

(100, 3)