In [18]:
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import Dict, List, Optional
import re
from datetime import datetime
import csv
from google.colab import drive
import os

# Mount Google Drive at /content/drive (Colab-only API). The save_to_csv /
# save_to_json methods below write their output under this mount point.
drive.mount('/content/drive')

class BookDataCollector:
    """Scrape book lists from Goodreads and enrich them with Open Library data.

    Typical flow: ``collect_decade_data`` scrapes one Goodreads list page by
    page, looks every book up on Open Library, and returns a list of flat
    dicts that ``save_to_csv`` / ``save_to_json`` write under the mounted
    Google Drive folder.
    """

    # Seconds to wait for a single HTTP response. Without a timeout one
    # stalled connection would hang the whole collection run.
    REQUEST_TIMEOUT = 15

    # How often the Open Library search is repeated for one book while no
    # description has been found.
    MAX_SEARCH_TRIES = 4

    # Matches "(...)" or "[...]" groups, e.g. series info in a title.
    # Raw string: the non-raw form triggers "invalid escape sequence" warnings.
    _BRACKETED_RE = re.compile(r"[\(\[].*?[\)\]]")

    def __init__(self):
        self.goodreads_base_url = "https://www.goodreads.com"
        self.openlibrary_base_url = "https://openlibrary.org"
        # Browser-like User-Agent sent with the Goodreads requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_title(self, title: str) -> str:
        """Return *title* without parenthesised/bracketed parts.

        Series information such as ``"(World of Beatrix Potter, #6)"`` is
        removed and runs of whitespace are collapsed to single spaces.
        """
        without_brackets = self._BRACKETED_RE.sub("", title)
        return ' '.join(without_brackets.split())

    def scrape_goodreads_decade_list(self, list_url: str, max_books: int = 100) -> List[Dict]:
        """Scrape up to *max_books* books from a paginated Goodreads list.

        Pages are fetched one after another until enough books were found, a
        page contains no book rows, or a request/parsing error occurs (the
        error is printed and the books gathered so far are returned).
        """
        books = []
        page = 1

        while len(books) < max_books:
            try:
                response = requests.get(
                    list_url,
                    params={'page': page},
                    headers=self.headers,
                    timeout=self.REQUEST_TIMEOUT,
                )
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')
                book_entries = soup.find_all('tr', itemtype='http://schema.org/Book')

                # An empty page means we ran past the end of the list.
                if not book_entries:
                    break

                for entry in book_entries:
                    if len(books) >= max_books:
                        break
                    book_data = self.extract_book_info(entry)
                    if book_data:
                        books.append(book_data)

                page += 1
                time.sleep(1)  # Be respectful to the server

            except Exception as e:
                print(f"Error scraping page {page}: {e}")
                break

        return books[:max_books]

    def extract_book_info(self, book_entry) -> Optional[Dict]:
        """Extract one book's fields from a Goodreads list row.

        Returns a dict with ``title``, ``clean_title``, ``author``,
        ``goodreads_url``, ``rating`` and ``ratings_count`` (individual values
        are ``None`` when the corresponding element is missing), or ``None``
        when the row cannot be parsed at all.
        """
        try:
            title_elem = book_entry.find('a', class_='bookTitle')
            author_elem = book_entry.find('a', class_='authorName')
            rating_elem = book_entry.find('span', class_='minirating')

            full_title = title_elem.find('span').text.strip() if title_elem else None
            author = author_elem.find('span').text.strip() if author_elem else None
            book_url = self.goodreads_base_url + title_elem['href'] if title_elem else None

            # The rating text holds both the average ("4.05") and the number
            # of ratings ("1,234 ratings"); pull each out with its own regex.
            rating_text = rating_elem.text.strip() if rating_elem else ""
            rating_match = re.search(r'(\d+\.\d+)', rating_text)
            count_match = re.search(r'([\d,]+)\s+ratings', rating_text)

            return {
                'title': full_title,
                'clean_title': self.clean_title(full_title) if full_title else None,
                'author': author,
                'goodreads_url': book_url,
                'rating': float(rating_match.group(1)) if rating_match else None,
                'ratings_count': int(count_match.group(1).replace(',', '')) if count_match else None,
            }

        except Exception as e:
            print(f"Error extracting book info: {e}")
            return None

    def _build_search_queries(self, title: str, author: Optional[str], clean_title: str) -> List[str]:
        """Return the Open Library queries to try, most specific first.

        Order: cleaned title (with, then without author), the original title
        if it differs, then the same variants with a leading "The " removed.
        Empty and duplicate queries are dropped; a missing author is left out
        of the query instead of being rendered as the text "None".
        """
        def with_author(text: str) -> str:
            return f"{text} {author}" if author else text

        candidates = [with_author(clean_title), clean_title]
        if clean_title != title:
            candidates += [with_author(title), title]

        for text in (clean_title, title):
            if text.lower().startswith("the "):
                remainder = text[4:]
                if remainder.strip():
                    candidates += [with_author(remainder), remainder]

        stripped = (candidate.strip() for candidate in candidates)
        # dict.fromkeys de-duplicates while keeping the priority order.
        return list(dict.fromkeys(query for query in stripped if query))

    def _select_best_doc(self, docs: List[Dict], author: Optional[str], clean_title: str) -> Optional[Dict]:
        """Pick the search result that best matches the wanted book.

        The first doc whose author list fuzzy-matches *author* wins; if there
        is none, the first doc whose title fuzzy-matches *clean_title* (with
        or without a leading "The ") is used. Returns ``None`` if nothing
        matches.
        """
        for doc in docs:
            doc_authors = doc.get('author_name') or []
            if any(self.fuzzy_match(author, doc_author) for doc_author in doc_authors):
                return doc

        wanted_titles = [clean_title]
        if clean_title.lower().startswith("the "):
            wanted_titles.append(clean_title[4:])

        for doc in docs:
            doc_title = doc.get('title') or ''
            if any(self.fuzzy_match(wanted, doc_title) for wanted in wanted_titles):
                return doc

        return None

    def search_openlibrary(self, title: str, author: str, clean_title: str) -> Optional[Dict]:
        """Search Open Library for a book and return its metadata.

        Several query variants are tried in turn (see
        ``_build_search_queries``); the first query that yields a matching
        doc ends the search. Returns the dict produced by
        ``get_book_details`` or ``None`` when nothing matched, the title is
        missing, or a request failed (the error is printed).
        """
        if not title:
            return None
        clean_title = clean_title or title
        search_url = f"{self.openlibrary_base_url}/search.json"

        try:
            for query in self._build_search_queries(title, author, clean_title):
                params = {
                    'q': query,
                    'limit': 10,
                    'language': 'eng',
                }
                response = requests.get(search_url, params=params, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()

                docs = response.json().get('docs') or []
                match = self._select_best_doc(docs, author, clean_title)
                if match is not None:
                    return self.get_book_details(match)

                time.sleep(0.3)  # Small delay between searches

            return None

        except Exception as e:
            print(f"Error searching Open Library for '{title}': {e}")
            return None

    def fuzzy_match(self, str1: str, str2: str, threshold: float = 0.8) -> bool:
        """Return True when the two strings are considered the same text.

        Case-insensitive. Matches on equality, on one string containing the
        other, or when the word sets overlap by at least *threshold*
        (intersection / union). Empty or missing strings never match: an
        empty string is a substring of everything and would otherwise match
        any input.
        """
        if not str1 or not str2:
            return False

        left = str1.lower().strip()
        right = str2.lower().strip()
        if not left or not right:
            return False

        if left == right or left in right or right in left:
            return True

        left_words = set(left.split())
        right_words = set(right.split())
        shared = left_words & right_words
        combined = left_words | right_words
        return len(shared) / len(combined) >= threshold

    def get_book_details(self, book_doc: Dict) -> Dict:
        """Build the metadata dict for one Open Library search doc.

        Copies the relevant fields from *book_doc* and, when the doc has a
        ``key``, fetches the work record to obtain its description. A failed
        work lookup leaves ``description`` as ``None`` instead of raising.
        """
        work_key = book_doc.get('key', '')
        details = {
            'olid': work_key,
            'description': None,
            'first_publish_year': book_doc.get('first_publish_year'),
            # Only present when the search response includes this field.
            'number_of_pages': book_doc.get('number_of_pages_median'),
            'subjects': (book_doc.get('subject') or [])[:10],  # Top 10 subjects
            'isbn': (book_doc.get('isbn') or [])[:1],  # First ISBN
            'language': (book_doc.get('language') or [])[:1],
        }

        if work_key:
            # get_work_details returns None on failure; treat that as "no data".
            work_details = self.get_work_details(work_key) or {}
            raw_description = work_details.get('description')
            # The description is either a plain string or a {"value": ...} dict.
            if isinstance(raw_description, dict):
                details['description'] = raw_description.get('value', '')
            elif isinstance(raw_description, str):
                details['description'] = raw_description

        return details

    def get_work_details(self, work_key: str) -> Optional[Dict]:
        """Fetch the JSON record for *work_key* (e.g. ``/works/...``).

        Returns the decoded JSON, or ``None`` if the request or decoding
        failed (the error is printed rather than silently dropped).
        """
        try:
            url = f"{self.openlibrary_base_url}{work_key}.json"
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            print(f"Error fetching Open Library work '{work_key}': {e}")
            return None

    def collect_decade_data(self, decade_start: int, list_url: str, max_books: int = 100) -> List[Dict]:
        """Scrape one Goodreads list and enrich every book via Open Library.

        Each returned dict holds the Goodreads fields, a ``decade`` entry set
        to *decade_start*, and (when found) the Open Library fields. Progress
        and a short summary are printed along the way.
        """
        print(f"\nCollecting data for {decade_start}s...")

        books = self.scrape_goodreads_decade_list(list_url, max_books)
        print(f"Found {len(books)} books on Goodreads")

        enriched_books = []
        for i, book in enumerate(books, 1):
            print(f"Processing {i}/{len(books)}: {book['title']} by {book['author']}")

            # Repeat the search a few times: a lookup can fail transiently,
            # and only a result with a description ends the retries early.
            ol_data = None
            for _ in range(self.MAX_SEARCH_TRIES):
                ol_data = self.search_openlibrary(
                    book['title'],
                    book['author'],
                    book['clean_title'],
                )
                if ol_data and ol_data.get('description'):
                    break

            enriched_book = book.copy()
            enriched_book['decade'] = decade_start

            if ol_data:
                enriched_book.update(ol_data)
                print("  ✓ Found in Open Library")
                if ol_data.get('description'):
                    print("    - Has description")
                if ol_data.get('number_of_pages'):
                    print(f"    - Has page count: {ol_data['number_of_pages']}")
            else:
                print("  ✗ Not found in Open Library")

            if enriched_book.get('description'):
                print("  ✓ Has description")
            else:
                print("  ⚠ Missing description")

            enriched_books.append(enriched_book)

            # Rate limiting
            time.sleep(0.5)

        books_with_description = sum(1 for book in enriched_books if book.get('description'))

        print(f"\nSummary for {decade_start}s:")
        print(f"  - Books with descriptions: {books_with_description}/{len(enriched_books)}")

        return enriched_books

    def save_to_csv(self, books: List[Dict], filename: str, drive_folder: str = 'MyDrive/new_book_data'):
        """Write *books* to ``/content/drive/<drive_folder>/<filename>`` as CSV.

        Does nothing when *books* is empty. List-valued fields are joined
        with ``"; "``; keys outside the fixed column set are ignored and
        missing ones are written as empty strings.
        """
        if not books:
            return

        drive_path = f'/content/drive/{drive_folder}'
        os.makedirs(drive_path, exist_ok=True)
        filepath = os.path.join(drive_path, filename)

        fieldnames = [
            'decade', 'title', 'clean_title', 'author', 'rating', 'ratings_count',
            'first_publish_year', 'number_of_pages', 'description',
            'subjects', 'isbn', 'language', 'goodreads_url', 'olid',
        ]
        list_fields = ('subjects', 'isbn', 'language')

        with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for book in books:
                row = {name: book.get(name, '') for name in fieldnames}
                # CSV cells are flat text, so lists become "a; b; c".
                for key in list_fields:
                    if isinstance(row[key], list):
                        row[key] = '; '.join(str(item) for item in row[key])
                writer.writerow(row)

        print(f"✓ CSV saved to Google Drive: {filepath}")

    def save_to_json(self, books: List[Dict], filename: str, drive_folder: str = 'MyDrive/book_data'):
        """Write *books* to ``/content/drive/<drive_folder>/<filename>`` as JSON.

        The folder is created if needed; non-ASCII characters are written
        as-is (UTF-8) rather than escaped.
        """
        drive_path = f'/content/drive/{drive_folder}'
        os.makedirs(drive_path, exist_ok=True)
        filepath = os.path.join(drive_path, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(books, f, indent=2, ensure_ascii=False)

        print(f"✓ JSON saved to Google Drive: {filepath}")

  cleaned = re.sub("[\(\[].*?[\)\]]", "", title)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
collector = BookDataCollector()

# Goodreads list URL for each decade, keyed by the decade's first year.
decade_urls = {
    1900: "https://www.goodreads.com/list/show/38",
    1910: "https://www.goodreads.com/list/show/93",
    1920: "https://www.goodreads.com/list/show/39",
    1930: "https://www.goodreads.com/list/show/85",
    1940: "https://www.goodreads.com/list/show/23",
    1950: "https://www.goodreads.com/list/show/22",
    1960: "https://www.goodreads.com/list/show/18",
    1970: "https://www.goodreads.com/list/show/21",
    1980: "https://www.goodreads.com/list/show/9",
    1990: "https://www.goodreads.com/list/show/17",
    2000: "https://www.goodreads.com/list/show/5",
    2010: "https://www.goodreads.com/list/show/4093",
    2020: "https://www.goodreads.com/list/show/143500"
}

# Each decade is collected, saved to its own CSV and summarised independently.
for decade_start, list_url in decade_urls.items():
    all_books = []

    books = collector.collect_decade_data(
        decade_start=decade_start,
        list_url=list_url,
        max_books=100
    )

    all_books.extend(books)

    # A failed scrape yields no books. Skip this decade instead of dividing by
    # zero in the statistics below, which would abort all remaining decades.
    if not all_books:
        print(f"\nNo books collected for the {decade_start}s; skipping save and statistics.")
        continue

    # Save data
    collector.save_to_csv(all_books, f"books_data_{decade_start}.csv")

    print(f"\nData collection complete! Collected {len(all_books)} books.")

    # Print statistics (these cover the current decade only)
    books_with_desc = sum(1 for book in all_books if book.get('description'))

    print(f"\nOverall Statistics:")
    print(f"- Books with descriptions: {books_with_desc}/{len(all_books)} ({books_with_desc/len(all_books)*100:.1f}%)")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing 74/100: The Unabridged Devil's Dictionary by Ambrose Bierce
  ✓ Found in Open Library
    - Has description
  ✓ Has description
Processing 75/100: Arsène Lupin, Gentleman-Thief by Maurice Leblanc
  ✓ Found in Open Library
  ⚠ Missing description
Processing 76/100: Beneath the Wheel by Hermann Hesse
  ✓ Found in Open Library
  ⚠ Missing description
Processing 77/100: The Diaries of Adam and Eve by Mark Twain
  ✓ Found in Open Library
  ⚠ Missing description
Processing 78/100: The Tale of Mrs. Tiggy-Winkle (World of Beatrix Potter, #6) by Beatrix Potter
  ✓ Found in Open Library
    - Has description
  ✓ Has description
Processing 79/100: The Life and Adventures of Santa Claus by L. Frank Baum
  ✓ Found in Open Library
    - Has description
  ✓ Has description
Processing 80/100: Tonio Kröger / Mario und der Zauberer by Thomas Mann
  ✓ Found in Open Library
  ⚠ Missing description
Processing 81/100: Green Mansions