In [7]:
import requests
import json

# Make API call to Google Books
response = requests.get('https://www.googleapis.com/books/v1/volumes?q=self-help')

# Get JSON data
data = response.json()

# Export JSON to file
with open('books_data.json', 'w') as f:
    json.dump(data, f, indent=4)


In [14]:
!pip install datasets



In [None]:
from datasets import load_dataset

ds = load_dataset("BrightData/Goodreads-Books")

In [15]:
import requests

# Define the endpoint and parameters
url = "https://en.wikipedia.org/w/api.php"
params = {
    "action": "query",
    "generator": "categorymembers",
    "gcmtitle": "Category:Self-help_books",
    "prop": "extracts",
    "explaintext": 1,
    "format": "json",
    "gclimit": 100  # Adjust this limit as needed; pagination may be required for more pages.
}

# Make the API request
response = requests.get(url, params=params)
data = response.json()

# Process and print the extracted text for each page
if "query" in data and "pages" in data["query"]:
    pages = data["query"]["pages"]
    for page_id, page_content in pages.items():
        title = page_content.get("title", "No title")
        extract = page_content.get("extract", "No extract available")
        print(f"Title: {title}")
        print("Extract:")
        print(extract)
        print("\n" + "="*80 + "\n")
else:
    print("No pages found or an error occurred in the query.")

Title: Self-help book
Extract:
A self-help book is one that is written with the intention to instruct its readers on solving personal problems. The books take their name from Self-Help, an 1859 best-seller by Samuel Smiles, but are also known and classified under "self-improvement", a term that is a modernized version of self-help. Self-help books moved from a niche position to being a postmodern cultural phenomenon in the late twentieth century.


== Early history ==
Informal guides to everyday behaviour might be said to have existed almost as long as writing itself. Ancient Egyptian "Codes" of conduct "have a curiously modern note: 'You trail from street to street, smelling of beer...like a broken rudder, good for nothing....you have been found performing acrobatics on a wall!'". Micki McGee writes: "Some social observers have suggested that the Bible is perhaps the first and most significant of self-help books".
In classical Rome, Cicero's On Friendship and On Duties became "handboo

In [16]:
import requests
from bs4 import BeautifulSoup
import json

# Get the main category page
url = "https://en.wikipedia.org/wiki/Category:Self-help_books"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the div with id "mw-pages" and get all links
mw_pages = soup.find(id="mw-pages")
book_links = []
if mw_pages:
    links = mw_pages.find_all('a')
    for link in links:
        href = link.get('href')
        if href and href.startswith('/wiki/'):
            full_url = f"https://en.wikipedia.org{href}"
            book_links.append(full_url)

# Process first 3 links
book_data = {}
for url in book_links[:3]:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Get page title
    title = soup.find(id="firstHeading").text
    book_data[title] = {}
    
    # Get infobox if exists
    infobox = soup.find('table', class_='infobox')
    if infobox:
        infobox_data = {}
        rows = infobox.find_all('tr')
        for row in rows:
            header = row.find('th')
            value = row.find('td')
            if header and value:
                infobox_data[header.text.strip()] = value.text.strip()
        book_data[title]['infobox'] = infobox_data
    
    # Get sections and their content
    sections = soup.find_all('h2')
    current_section = None
    section_content = []
    
    for element in soup.find_all(['h2', 'p']):
        if element.name == 'h2':
            # Save previous section content
            if current_section:
                book_data[title][current_section] = ' '.join(section_content)
            # Start new section
            span = element.find('span', class_='mw-headline')
            if span:
                current_section = span.text
                section_content = []
        elif element.name == 'p' and current_section:
            section_content.append(element.text.strip())
    
    # Save last section
    if current_section:
        book_data[title][current_section] = ' '.join(section_content)

# Print results
print(json.dumps(book_data, indent=2))


{
  "Wikipedia:FAQ/Categorization": {},
  "Self-help book": {},
  "The 3rd Alternative": {
    "infobox": {
      "Author": "Stephen R. Covey",
      "Language": "English",
      "Subject": "Self-help, Negotiation",
      "Genre": "non-fiction",
      "Published": "2011 Free Press",
      "Publication place": "United States",
      "Media\u00a0type": "Print (Hardcover, Paperback)",
      "Pages": "456",
      "ISBN": "978-1-4516-2626-1 978-1-4516-2628-5 (ebook)",
      "OCLC": "709673139",
      "Dewey Decimal": "158 23",
      "LC\u00a0Class": "BF449 .C68 2011"
    }
  }
}
