<h1 style="color: #FF8C00;">Part 1: Fetching Books Data</h1>

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

In [2]:
# Function to fetch book data from Open Library API
def fetch_openlibrary_books(limit=500):
    base_url = "https://openlibrary.org/search.json?q=book&limit=100"
    books = []

    for i in range(limit // 100):  # Fetch data in batches
        response = requests.get(base_url + f"&page={i+1}")
        if response.status_code == 200:
            data = response.json()
            for doc in data["docs"]:
                books.append({
                    "title": doc.get("title", ""),
                    "author": doc.get("author_name", [""])[0],
                    "published_year": doc.get("first_publish_year", ""),
                    "isbn": doc.get("isbn", [""])[0] if doc.get("isbn") else "",
                    "subject": doc.get("subject", [""])[0] if doc.get("subject") else ""
                })
        time.sleep(1)  # Avoid overloading the API

    return books

# Function to scrape Books to Scrape
def scrape_books_to_scrape(limit=500):
    base_url = "http://books.toscrape.com/catalogue/page-{}.html"
    books = []
    page = 1

    while len(books) < limit:
        response = requests.get(base_url.format(page))
        if response.status_code != 200:
            break
        
        soup = BeautifulSoup(response.text, "html.parser")
        book_list = soup.find_all("article", class_="product_pod")

        for book in book_list:
            title = book.h3.a.attrs["title"]
            price = book.find("p", class_="price_color").text
            availability = book.find("p", class_="instock availability").text.strip()
            rating = book.p.attrs["class"][1]  # Rating is in a class attribute (e.g., "Three")

            books.append({
                "title": title,
                "price": price,
                "availability": availability,
                "rating": rating
            })
        
        page += 1
        time.sleep(1)  # Avoid overloading the server

    return books[:limit]

In [None]:
# Fetch data
openlibrary_books = fetch_openlibrary_books(limit=500)
books_to_scrape = scrape_books_to_scrape(limit=500)

In [10]:
df_openlibrary

Unnamed: 0,title,author,published_year,isbn,subject
0,Eyewitness,DK Publishing,2000,,
1,The Book of Dragons,Edith Nesbit,1973,,
2,The Book Thief,Markus Zusak,1998,,
3,The Jungle Book,Rudyard Kipling,1893,,
4,The book of tea,Okakura Kakuzo,1900,,
...,...,...,...,...,...
495,Leviathan,Thomas Hobbes,1651,,
496,The Story of the Amulet,Edith Nesbit,1905,,
497,Ars Amatoria,Ovid,1494,,
498,The Boy in the Striped Pyjamas,John Boyne,2006,,


In [11]:
# Convert to DataFrames
df_openlibrary = pd.DataFrame(openlibrary_books)
df_books_to_scrape = pd.DataFrame(books_to_scrape)

# Merge datasets on title
df_combined = pd.merge(df_openlibrary, df_books_to_scrape, on="title", how="outer")

In [13]:
# Save CSV
df_combined.to_csv("book_recommender_dataset.csv", index=False)
df_combined

Unnamed: 0,title,author,published_year,isbn,subject,price,availability,rating
0,"""Most Blessed of the Patriarchs"": Thomas Jeffe...",,,,,Â£44.48,In stock,Five
1,#HigherSelfie: Wake Up Your Life. Free Your So...,,,,,Â£23.11,In stock,Five
2,'Salem’s Lot,Stephen King,1975.0,,,,,
3,(Un)Qualified: How God Uses Broken People to D...,,,,,Â£54.00,In stock,Five
4,... Trotzdem Ja zum Leben sagen,Viktor E. Frankl,1946.0,,,,,
...,...,...,...,...,...,...,...,...
992,Смерть Ивана Ильича,Лев Толстой,1887.0,,,,,
993,कामसूत्र,Mallanaga Vātsyāyana,1883.0,,,,,
994,Ἀπολογία Σωκράτους,Πλάτων,1675.0,,,,,
995,Ἰλιάς,Όμηρος,1505.0,,,,,


<h1 style="color: #FF8C00;">Part 2: Clustering & Viz</h1>

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np

In [17]:
# Function to recommend books from the same cluster
def recommend_books(book_title, num_recommendations=5):
    if book_title not in df["title"].values:
        return "Book not found. Try another title."
    
    book_cluster = df[df["title"] == book_title]["cluster"].values[0]
    recommendations = df[df["cluster"] == book_cluster][["title", "author"]]
    
    return recommendations.sample(num_recommendations)

In [18]:
df = pd.read_csv("book_recommender_dataset.csv")

# Fill missing values
df.fillna("Unknown", inplace=True)

# Convert categorical data into numerical values
label_enc = LabelEncoder()
df["author_encoded"] = label_enc.fit_transform(df["author"])
df["subject_encoded"] = label_enc.fit_transform(df["subject"])
df["rating_encoded"] = label_enc.fit_transform(df["rating"])

# TF-IDF for book titles
vectorizer = TfidfVectorizer(stop_words='english')
title_matrix = vectorizer.fit_transform(df["title"])

data_features = np.hstack((
    title_matrix.toarray(),
    df[["author_encoded", "subject_encoded", "rating_encoded"]].values
))

# Standardize features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_features)

# Apply K-Means clustering
num_clusters = 10  # Define number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df["cluster"] = kmeans.fit_predict(data_scaled)


  df.fillna("Unknown", inplace=True)


In [24]:
# Example recommendation
book_to_search = "'Salem’s Lot"
print(f"Recommendations for {book_to_search}:")
recommend_books(book_to_search)

Recommendations for 'Salem’s Lot:


Unnamed: 0,title,author
516,Peter Pan,J. M. Barrie
727,The Jewel of Seven Stars,Bram Stoker
52,A Wrinkle in Time,Madeleine L'Engle
710,The High Mountains of Portugal,Unknown
605,Stars Above (The Lunar Chronicles #4.5),Unknown
