In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://books.toscrape.com/"
BOOKS = []

def get_genre(book_url):
    """Extract genre from individual book page"""
    response = requests.get(book_url)
    soup = BeautifulSoup(response.text, "html.parser")
    genre = soup.find("ul", class_="breadcrumb").find_all("li")[2].text.strip()
    return genre

def scrape_all_pages():
    url = BASE_URL

    while url:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        articles = soup.find_all("article", class_="product_pod")

        for book in articles:
            title = book.h3.a["title"]
            price = book.find("p", class_="price_color").text.strip()
            availability = book.find("p", class_="instock availability").text.strip()
            rating = book.find("p", class_="star-rating")["class"][1]

            book_link = book.h3.a["href"]
            book_url = BASE_URL + book_link.replace("../", "")
            genre = get_genre(book_url)

            BOOKS.append({
                "Title": title,
                "Price": price,
                "Rating": rating,
                "Availability": availability,
                "Genre": genre
            })

        # Pagination
        next_page = soup.find("li", class_="next")
        if next_page:
            next_link = next_page.a["href"]
            url = BASE_URL + "catalogue/" + next_link
        else:
            url = None

# Run Scraper
scrape_all_pages()

# Save to CSV
df = pd.DataFrame(BOOKS)
df.to_csv("books_data.csv", index=False)

print("Scraping completed. Data saved to books_data.csv")


Scraping completed. Data saved to books_data.csv



# **pyspark**

In [30]:
import os
print(os.getcwd())

print(os.listdir())


/content
['.config', 'books_data.csv', 'sample_data']


In [31]:
#Create a Spark DataFrame

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Books").getOrCreate()

df = spark.read.csv(
    "books_data.csv",
    header=True,
    inferSchema=True
)

df.printSchema()
df.show(5)

root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Availability: string (nullable = true)
 |-- Genre: string (nullable = true)

+--------------------+-------+------+------------+------------------+
|               Title|  Price|Rating|Availability|             Genre|
+--------------------+-------+------+------------+------------------+
|A Light in the Attic|Â£51.77| Three|    In stock|            Poetry|
|  Tipping the Velvet|Â£53.74|   One|    In stock|Historical Fiction|
|          Soumission|Â£50.10|   One|    In stock|           Fiction|
|       Sharp Objects|Â£47.82|  Four|    In stock|           Mystery|
|Sapiens: A Brief ...|Â£54.23|  Five|    In stock|           History|
+--------------------+-------+------+------------+------------------+
only showing top 5 rows


In [32]:
df.printSchema()


root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Availability: string (nullable = true)
 |-- Genre: string (nullable = true)



In [33]:
df.show(5)


+--------------------+-------+------+------------+------------------+
|               Title|  Price|Rating|Availability|             Genre|
+--------------------+-------+------+------------+------------------+
|A Light in the Attic|Â£51.77| Three|    In stock|            Poetry|
|  Tipping the Velvet|Â£53.74|   One|    In stock|Historical Fiction|
|          Soumission|Â£50.10|   One|    In stock|           Fiction|
|       Sharp Objects|Â£47.82|  Four|    In stock|           Mystery|
|Sapiens: A Brief ...|Â£54.23|  Five|    In stock|           History|
+--------------------+-------+------+------------+------------------+
only showing top 5 rows


In [37]:
#from pyspark.sql.functions import col, regexp_replace
#df_clean = df.withColumn(
  #  "Price_num",
  #  regexp_replace(col("Price"), "[^0-9.]", "").cast("double")
#)

#df_clean.printSchema()


root
 |-- Title: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Availability: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Price_num: double (nullable = true)



In [39]:
from pyspark.sql.functions import col, regexp_replace

df = df.withColumn(
    "Price",
    regexp_replace(col("Price"), "[^0-9.]", "").cast("double")
)


In [46]:
from pyspark.sql.functions import col, when

df = df.withColumn(
    "Rating_num",
    when(col("Rating") == "One", 1)
    .when(col("Rating") == "Two", 2)
    .when(col("Rating") == "Three", 3)
    .when(col("Rating") == "Four", 4)
    .when(col("Rating") == "Five", 5)
    .otherwise(None)
)

df.select("Rating", "Rating_num").show(5)


+------+----------+
|Rating|Rating_num|
+------+----------+
| Three|         3|
|   One|         1|
|   One|         1|
|  Four|         4|
|  Five|         5|
+------+----------+
only showing top 5 rows


In [47]:
#Filter Books with Price Greater Than 20
price_filtered = df.filter(col("Price") > 20)
price_filtered.show()


#Filter Books with Rating 4 and Above
rating_filtered = df.filter(col("Rating_num") >= 4)
rating_filtered.show()



+--------------------+-----+------+------------+------------------+----------+
|               Title|Price|Rating|Availability|             Genre|Rating_num|
+--------------------+-----+------+------------+------------------+----------+
|A Light in the Attic|51.77| Three|    In stock|            Poetry|         3|
|  Tipping the Velvet|53.74|   One|    In stock|Historical Fiction|         1|
|          Soumission| 50.1|   One|    In stock|           Fiction|         1|
|       Sharp Objects|47.82|  Four|    In stock|           Mystery|         4|
|Sapiens: A Brief ...|54.23|  Five|    In stock|           History|         5|
|     The Requiem Red|22.65|   One|    In stock|       Young Adult|         1|
|The Dirty Little ...|33.34|  Four|    In stock|          Business|         4|
|The Boys in the B...| 22.6|  Four|    In stock|           Default|         4|
|     The Black Maria|52.15|   One|    In stock|            Poetry|         1|
|Shakespeare's Son...|20.66|  Four|    In stock|    