In [None]:
#WHAT IS EXPECTED IN THIS TASK 1

# 1.Choose a website with publicly available data

# 2.Extract specific data from that website using Python

# 3.Save the data in a clean, structured format (CSV or JSON)

# 4.Explain what you scraped and how you did it


In [None]:
# TOOLS NEEDED TO DO TASK : The task explicitly expects:

# 1. Python ; requests → to fetch the webpage

# 2.BeautifulSoup → to extract data

# 3. pandas → to store & save data

In [None]:
# LIBRARIES NEEDED FOR TASK
#  1. requests: Sends HTTP requests to a website, used to fetch the webpage HTML

# 2. BeautifulSoup : Parses HTML, helps  extract specific elements like titles, prices, etc.

# 3. pandas : Converts scraped data into a structured table, Saves data as CSV or JSON

# 4. time : Adds delays between requests, Prevents overwhelming the website (good scraping practice)

In [None]:
# STEP 1 : IMPORTING LIBRARIES
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time


In [None]:
# STEP 2: SENDING REQUEST TO GET BOOKS FROM SCRAPE WEBSITE

url = "https://books.toscrape.com/"
response = requests.get(url)

print(response)

<Response [200]>


In [None]:
# What this means: <Response [200]>
# ✔️ The request worked
# ✔️ The page is accessible
# ✔️ You can now scrape it

In [None]:
# STEP 3 : PARSE THE HTML CONTENT

soup = BeautifulSoup(response.text, "html.parser")

print(soup.title.text)



    All products | Books to Scrape - Sandbox



In [None]:
# STEP 4 : FINDING ALL BOOKS CONTAINER
books = soup.find_all("article", class_="product_pod")
print(len(books))


20


In [None]:
# STEP 5 : EXTRACTING DATA FROM BOOK 1
book = books[0]
title = book.h3.a["title"]
price = book.find("p", class_="price_color").text
print(title)
print(price)


A Light in the Attic
Â£51.77


In [None]:
# STEP 6 : EXTRACTING DATA FROM ALL BOOKS ON THE PAGE
titles = []
prices = []
for book in books:
  titles.append(book.h3.a["title"])
  prices.append(book.find("p", class_="price_color").text)



In [None]:
# STEP 7: CREATING PANDAS DATAFRAME
# Converts lists into a structured table, Columns: Title | Price, Rows: Each book
df = pd.DataFrame({
    "Title": titles,
    "Price": prices
})
df.head()



Unnamed: 0,Title,Price
0,A Light in the Attic,Â£51.77
1,Tipping the Velvet,Â£53.74
2,Soumission,Â£50.10
3,Sharp Objects,Â£47.82
4,Sapiens: A Brief History of Humankind,Â£54.23


In [None]:
# STEP 8: SAVING DATA TO CSV: Saves DataFrame as CSV, index=False removes unnecessary index column
df.to_csv("books_page1.csv", index=False)


In [15]:
# STEP 9: PAGINATION: Inserts page number into URL, Fetches each page
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

base_url = "https://books.toscrape.com/catalogue/page-{}.html"

all_titles = []
all_prices = []

for page in range(1, 6):
    print(f"Scraping page {page}")

    response = requests.get(base_url.format(page))
    soup = BeautifulSoup(response.text, "html.parser")

    books = soup.find_all("article", class_="product_pod")

    for book in books:
        all_titles.append(book.h3.a["title"])
        all_prices.append(book.find("p", class_="price_color").text)

    time.sleep(1)



Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5


In [16]:
# STEP 10: FINAL DATAFRAME AND SAVE
df = pd.DataFrame({
    "Title": all_titles,
    "Price": all_prices
})

df.to_csv("books_all_pages.csv", index=False)


In [None]:
# STEP 11: DOWNLOADING CSV SAVED
from google.colab import files

files.download("books_all_pages.csv")

In [None]:
# SUMMARY
# “I used Python’s requests library to fetch webpage content,
# BeautifulSoup to parse and extract book titles and prices, handled pagination to scrape multiple pages,
# and stored the structured data using pandas in CSV format.”