In [35]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [22]:
# URL template for the catalogue listing pages; '{}' is filled with the page number via .format().
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

In [23]:
# Sanity check: fetch the first two listing pages and count the book cards on each.
for page in range(1, 3):
    print(f"Scraping page {page}...")
    url = base_url.format(page)
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as "0 books".
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = soup.find_all('article', class_='product_pod')
    print(f"Found {len(articles)} books on page {page}")

Scraping page 1...
Found 20 books on page 1
Scraping page 2...
Found 20 books on page 2


In [24]:
# Collect the title of every book on the first two listing pages.
titles = []

for page in range(1, 3):
    print(f"Scraping page {page}...")
    url = base_url.format(page)
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently collecting nothing from an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Iterate per product card (not every <h3> on the page) so that a stray
    # heading cannot add an entry that has no matching price or rating.
    for article in soup.find_all('article', class_='product_pod'):
        # Read the anchor's 'title' attribute rather than its visible text.
        full_title = article.find('h3').find('a')['title']
        titles.append(full_title)

Scraping page 1...
Scraping page 2...


In [25]:
# Show every scraped title, then the overall count.
for title in titles:
    print(title)

print(f"\nTotal titles scraped: {len(titles)}")

A Light in the Attic
Tipping the Velvet
Soumission
Sharp Objects
Sapiens: A Brief History of Humankind
The Requiem Red
The Dirty Little Secrets of Getting Your Dream Job
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics
The Black Maria
Starving Hearts (Triangular Trade Trilogy, #1)
Shakespeare's Sonnets
Set Me Free
Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)
Rip it Up and Start Again
Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991
Olio
Mesaerion: The Best Science Fiction Stories 1800-1849
Libertarianism for Beginners
It's Only the Himalayas
In Her Wake
How Music Works
Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, Condiments, and More
Chase Me (Paris Nights #2)
Black Dust
Birdsong: A Story in Pictures
A

In [26]:
# Collect the displayed price string (currency symbol included) of every book
# on the first two listing pages.
prices = []

for page in range(1, 3):
    print(f"Scraping prices from Page {page}...")
    url = base_url.format(page)
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently collecting nothing from an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Iterate per product card so each book contributes exactly one price,
    # in the same order the cards appear on the page.
    for article in soup.find_all('article', class_='product_pod'):
        price = article.find('p', class_='price_color').text.strip()
        prices.append(price)

Scraping prices from Page 1...
Scraping prices from Page 2...


In [27]:
# Show every scraped price string, one per line.
for price in prices:
    print(price)

£51.77
£53.74
£50.10
£47.82
£54.23
£22.65
£33.34
£17.93
£22.60
£52.15
£13.99
£20.66
£17.46
£52.29
£35.02
£57.25
£23.88
£37.59
£51.33
£45.17
£12.84
£37.32
£30.52
£25.27
£34.53
£54.64
£22.50
£53.13
£40.30
£44.18
£17.66
£31.05
£23.82
£36.89
£15.94
£33.29
£18.02
£19.63
£52.22
£33.63


In [28]:
# Collect the star rating (as a word such as 'Three') of every book on the
# first two listing pages.
ratings = []

for page in range(1, 3):
    # Progress message names what this cell actually scrapes (ratings, not prices).
    print(f"Scraping ratings from Page {page}...")
    url = base_url.format(page)
    # A timeout keeps a stalled connection from hanging the cell indefinitely.
    response = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx instead of silently collecting nothing from an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    for article in soup.find_all('article', class_='product_pod'):
        rating_tag = article.find('p', class_='star-rating')
        # The tag's class list is ['star-rating', <word>]; the second entry is the rating.
        rating = rating_tag['class'][1]
        ratings.append(rating)

Scraping prices from Page 1...
Scraping prices from Page 2...


In [29]:
# Show every scraped rating word, one per line.
for rate in ratings:
    print(rate)

Three
One
One
Four
Five
One
Four
Three
Four
One
Two
Four
Five
Five
Five
Three
One
One
Two
Two
One
Two
Three
Five
Five
Three
Three
Three
Five
Four
Five
Three
Five
One
Five
Three
Two
One
Four
Two


In [30]:
# Report how many ratings were collected in total.
print(f"\nTotal ratings scraped: {len(ratings)}")


Total ratings scraped: 40


In [31]:
# Collect the genre of every book on the first two listing pages. The genre is
# read from each book's own detail page, so this cell makes one extra HTTP
# request per book.
genres = []

# One Session reuses the connection across the listing pages and all the
# per-book detail pages instead of reconnecting for every request.
with requests.Session() as session:
    for page in range(1, 3):
        print(f"Scraping genres from Page {page}...")
        url = base_url.format(page)
        # A timeout keeps a stalled connection from hanging the cell indefinitely.
        response = session.get(url, timeout=10)
        # Fail loudly on 4xx/5xx instead of silently collecting nothing from an error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')

        for article in soup.find_all('article', class_='product_pod'):
            # The href is relative to the listing page, so resolve it against
            # that page's URL rather than stripping '../' segments by hand.
            rel_url = article.find('h3').find('a')['href']
            book_url = urljoin(url, rel_url)

            # Visit the book detail page. Its status is deliberately not
            # checked: a page without a usable breadcrumb yields "Unknown".
            book_resp = session.get(book_url, timeout=10)
            book_resp.encoding = 'utf-8'
            book_soup = BeautifulSoup(book_resp.text, 'html.parser')

            # The genre is the third breadcrumb entry; fall back to "Unknown"
            # when the breadcrumb is missing or too short.
            breadcrumb = book_soup.find('ul', class_='breadcrumb')
            breadcrumb_items = breadcrumb.find_all('li') if breadcrumb else []
            if len(breadcrumb_items) >= 3:
                genre = breadcrumb_items[2].text.strip()
            else:
                genre = "Unknown"

            genres.append(genre)

for genre in genres:
    print(genre)

print(f"\nTotal genres scraped: {len(genres)}")

Scraping genres from Page 1...
Scraping genres from Page 2...
Poetry
Historical Fiction
Fiction
Mystery
History
Young Adult
Business
Default
Default
Poetry

Total genres scraped: 40


In [32]:
# Each book needs its own detail-page request (about 1000 HTTP requests for the full catalogue), so we stop after two pages.

In [33]:
# Print the lengths of the four scraped lists side by side; they are combined
# column-by-column into one DataFrame afterwards, so they should all match.
print(len(titles), len(prices), len(ratings), len(genres))

40 40 40 40


In [36]:
# Assemble the four scraped lists into a single table, one column per list
# (the lists are assumed to be in the same book order).
df = pd.DataFrame(
    data=dict(Title=titles, Price=prices, Rating=ratings, Genre=genres)
)

In [37]:
# Preview the first rows of the assembled table.
df.head()

Unnamed: 0,Title,Price,Rating,Genre
0,A Light in the Attic,£51.77,Three,Poetry
1,Tipping the Velvet,£53.74,One,Historical Fiction
2,Soumission,£50.10,One,Fiction
3,Sharp Objects,£47.82,Four,Mystery
4,Sapiens: A Brief History of Humankind,£54.23,Five,History


In [38]:
# Save the table as scraped (Price and Rating are still raw strings at this point), without the index column.
df.to_csv('books_scraped_demo.csv', index=False)

In [39]:
# Convert the price strings (e.g. '£51.77') to floats.
# The guard makes the cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['Price']):
    # regex=False: '£' is a literal character, independent of the pandas
    # version's default for the `regex` argument.
    df['Price'] = df['Price'].str.replace('£', '', regex=False).astype(float)

In [41]:
# Translate the rating words scraped from the page into their numeric value.
rating_map = {
    'One': 1,
    'Two': 2,
    'Three': 3,
    'Four': 4,
    'Five': 5,
}
# The guard makes the cell safe to re-run: mapping an already-numeric column
# through the word keys would silently turn every rating into NaN.
if not pd.api.types.is_numeric_dtype(df['Rating']):
    df['Rating'] = df['Rating'].map(rating_map)

In [42]:
# Save the current state of df to a separate CSV file, without the index column.
df.to_csv('books_scraped_demo_clean_data.csv', index=False)