In [1]:


import os
import time
import requests
import pandas as pd
from urllib.parse import urlencode

class OpenLibraryScraper:
    """Collect book metadata (and optionally cover images) from OpenLibrary.

    Records are accumulated in ``self.books`` as plain dicts by ``collect``
    and written to CSV and JSON files inside ``output_dir`` by ``save``.
    Cover images are stored in ``<output_dir>/images``.
    """

    # Default identification string sent with every request; callers should
    # pass their own ``user_agent`` with real contact information.
    DEFAULT_USER_AGENT = "MyOpenLibraryScraper/1.0 (your_email@example.com)"

    def __init__(self, output_dir="ol_data", user_agent=None):
        """
        Create the output directories and an HTTP session.

        output_dir: directory that receives the CSV/JSON files and the
            ``images`` sub-directory (both are created if missing).
        user_agent: optional User-Agent header value; when omitted the
            placeholder ``DEFAULT_USER_AGENT`` is used.
        """
        self.output_dir = output_dir
        self.images_dir = os.path.join(output_dir, "images")
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.images_dir, exist_ok=True)
        self.session = requests.Session()
        # It is recommended to provide a descriptive User-Agent including contact info
        self.session.headers.update({
            "User-Agent": user_agent or self.DEFAULT_USER_AGENT
        })
        self.books = []

    def search_books(self, query: str, page: int = 1, limit: int = 100):
        """
        Use the OpenLibrary Search API to get search results.
        Query example: "python programming"
        page starts from 1.

        Returns the decoded JSON response. Raises whatever
        ``raise_for_status`` raises when the HTTP status signals an error.
        """
        base = "https://openlibrary.org/search.json"
        params = {
            "q": query,
            "page": page,
            "limit": limit
        }
        url = f"{base}?{urlencode(params)}"
        resp = self.session.get(url, timeout=10)
        resp.raise_for_status()
        return resp.json()

    def download_cover(self, cover_id: int, size="M", book_id=None,
                       retries: int = 2, backoff: float = 1.0):
        """
        Download cover image from OpenLibrary Covers service.
        cover_id: integer id of cover from API `cover_i`
        size: "S", "M", or "L"
        book_id: optional file-name stem; falls back to ``cover_id``.
        retries: how many extra attempts to make after a connection-level
            failure (e.g. the remote end dropping the connection).
        backoff: base number of seconds to wait before a retry; the wait
            grows linearly with the attempt number.

        Returns the path of the written file, or None when ``cover_id`` is
        None, the server does not answer with status 200, or the download
        or the file write fails.
        """
        if cover_id is None:
            return None
        # Format: https://covers.openlibrary.org/b/id/<cover_id>-M.jpg
        url = f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg"
        path = os.path.join(self.images_dir, f"{book_id or cover_id}-{size}.jpg")

        for attempt in range(retries + 1):
            try:
                resp = self.session.get(url, timeout=10)
            except requests.RequestException as e:
                if attempt < retries:
                    # Transient network errors are common on bulk cover
                    # downloads; wait a little longer each time and retry.
                    time.sleep(backoff * (attempt + 1))
                    continue
                print("Cover download failed:", e)
                return None

            if resp.status_code != 200:
                # A non-200 answer is treated as "no cover available".
                return None

            try:
                with open(path, "wb") as f:
                    f.write(resp.content)
            except OSError as e:
                print("Cover download failed:", e)
                return None
            return path
        return None

    @staticmethod
    def _doc_to_book(doc):
        """Pick the fields of interest out of one search-result document."""
        return {
            "title": doc.get("title"),
            "author_names": doc.get("author_name", []),
            "first_publish_year": doc.get("first_publish_year"),
            "edition_count": doc.get("edition_count"),
            "cover_id": doc.get("cover_i"),
            "key": doc.get("key"),  # work key like "/works/OL12345W"
            "isbn": doc.get("isbn", []),
            "subject": doc.get("subject", []),
        }

    def collect(self, query: str, pages: int = 5, limit_per_page: int = 100,
                download_covers: bool = True, delay: float = 0.5):
        """
        Collect metadata of books matching `query`.
        pages * limit_per_page = maximum number of books.

        download_covers: also fetch the medium-size cover of every record
            that has a cover id; the local path is stored as ``cover_path``.
        delay: polite pause in seconds between consecutive search requests.

        Results are appended to ``self.books``. Paging stops early when a
        page comes back without documents. Errors raised by
        ``search_books`` propagate to the caller.
        """
        for p in range(1, pages + 1):
            print(f"Searching page {p} for '{query}' …")
            result = self.search_books(query, page=p, limit=limit_per_page)
            docs = result.get("docs", [])
            if not docs:
                print("No docs returned, stopping.")
                break

            for doc in docs:
                # Best-effort per record: one malformed document must not
                # abort the whole crawl.
                try:
                    book = self._doc_to_book(doc)
                    book["cover_path"] = None

                    if download_covers and book["cover_id"] is not None:
                        key = book["key"]
                        # A record may lack a work key; then the cover id
                        # names the file instead of the record being lost.
                        book_id = key.strip("/").replace("/", "_") if key else None
                        book["cover_path"] = self.download_cover(
                            book["cover_id"], size="M", book_id=book_id
                        )

                    self.books.append(book)
                except Exception as e:
                    print("Error processing doc:", e)
                    continue

            # polite delay between API calls (pointless after the last page)
            if delay > 0 and p < pages:
                time.sleep(delay)

    def save(self):
        """
        Write ``self.books`` to ``openlibrary_books.csv`` and
        ``openlibrary_books.json`` inside ``output_dir`` and print both paths.
        """
        df = pd.DataFrame(self.books)
        csv_path = os.path.join(self.output_dir, "openlibrary_books.csv")
        json_path = os.path.join(self.output_dir, "openlibrary_books.json")
        df.to_csv(csv_path, index=False)
        df.to_json(json_path, orient="records", indent=2)
        print("Saved:", csv_path, json_path)

def main():
    """Scrape up to ten result pages for "artificial intelligence" and save them."""
    crawler = OpenLibraryScraper()
    crawler.collect(
        query="artificial intelligence",
        pages=10,
        limit_per_page=100,
        download_covers=True,
    )
    crawler.save()

# Run the scrape only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Searching page 1 for 'artificial intelligence' …
Cover download failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Searching page 2 for 'artificial intelligence' …
Searching page 3 for 'artificial intelligence' …
Searching page 4 for 'artificial intelligence' …
Searching page 5 for 'artificial intelligence' …
Cover download failed: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Searching page 6 for 'artificial intelligence' …
Searching page 7 for 'artificial intelligence' …
Searching page 8 for 'artificial intelligence' …
Searching page 9 for 'artificial intelligence' …
Searching page 10 for 'artificial intelligence' …
Saved: ol_data\openlibrary_books.csv ol_data\openlibrary_books.json
