# Base Dataset Buildout - 12/12/25

- All "Self Help" books & all "Nonfiction" books that also include key genres
    - Exclude Textbooks, Reference
    - English language only
---
- Zero-shot classification
    - ChatGPT:
        - Kinda need to determine if it addresses problems AT ALL for nonfiction (people are afflicted by [something])... then filter out any that don't ascribe to this
        - Tony-robbins vs. wellbeing/mental health — something regarding 
    - ~~internal vs. external~~
- Zero-shot:
    - spiritual vs. secular
    - mental vs. physical
    - categories of DSM typoe things (anxiety spikes over time?)
    - sub-categories of Self Help genres
- DANIEL - Mental health piece... how has it been referred to in the past? Medical grounding? Today it's covered by insurance companies — they have created standardized plans. 
- Labels: Popular
- Add Year col
- World events same year
- World events 1-5 years prior
- Return structured data from URL
    - Extract 'want to read', 'currently reading' and 'author bio'
- JSON capture from Wikipedia Article on Book? Then Author?
    - Identify inspirations in the form of other events
- based on relevant prompts to create labeling conventions, use GPT API (or Gemini? Something free?) to assign relevent labels based on descriptions
- standardize THREE datasets where possible:
    1. Goodreads (and analysis)
    2. Historical events
    3. Wikipedia data (if needed)
    4. Medical language?


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import logging
# pip install langdetect
from langdetect import detect  # Comment out until package is installed
from transformers import pipeline
import ast
# import os
import ast
# import openai
import logging
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# openai.api_key = os.getenv("OPENAI_API_KEY")

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("filtered_books_sample_0211.csv")
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [4]:
subcategories = ["personal development", "relationships", "psychology", "business", "memoir", "other"]

In [5]:
def is_english(text):
    try:
        return detect(text) == "en"
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return False

In [6]:
def scrape_goodreads(url):
    data = {}
    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            logging.error(f"Request failed with status code {response.status_code} for URL {url}")
            return data
        soup = BeautifulSoup(response.text, "html.parser")

        # Get image URL from the element with class "BookCard"
        book_card = soup.find(class_="BookCard")
        if book_card:
            img_tag = book_card.find("img")
            if img_tag and img_tag.get("src"):
                data["img_url"] = img_tag["src"]

        # Extract author_text from <span class="Formatted"> within the desired parent containers
        page_section = soup.find("div", class_="PageSection")
        if page_section:
            details_div = page_section.find("div", class_="DetailsLayoutRightParagraph__widthConstrained")
            if details_div:
                author_span = details_div.find("span", class_="Formatted")
                if author_span:
                    data["author_text"] = author_span.get_text(strip=True)

        # Extract social signals from SocialSignalsSection__container
        social_container = soup.find(class_="SocialSignalsSection__container")
        logging.info(f"Social container found for URL {url}: {social_container is not None}")
        if social_container:
            signals = social_container.find_all("div", class_="SocialSignalsSection__caption")
            if signals and len(signals) >= 2:
                # The first element is currently reading, second is to-read
                curr_text = signals[0].get_text()
                to_read_text = signals[1].get_text()
                curr_match = re.search(r'([\d,]+)', curr_text)
                to_read_match = re.search(r'([\d,]+)', to_read_text)
                if curr_match:
                    currently_reading = int(curr_match.group(1).replace(',', ''))
                    data["currently_reading"] = currently_reading
                    print(f"Currently reading count for {url}: {currently_reading}")
                if to_read_match:
                    to_read = int(to_read_match.group(1).replace(',', ''))
                    data["to_read"] = to_read
                    print(f"To-read count for {url}: {to_read}")

    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
    return data

In [7]:
from openai import OpenAI
import os

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def analyze_external_relevance(candidate_text):
    try:
        response = client.chat.completions.create(
            model="gpt-4o",  # Ensure this is a valid model identifier
            messages=[
                {"role": "system", "content": "You are an expert literary analyst."},
                {"role": "user", "content": (
                    "Does the following text explore forces, ideas, or subjects that impact human physical or mental health? "
                    "Answer 'yes' or 'no' only.\n\nText:\n" + candidate_text
                )}
            ],
            temperature=0
        )
        answer = response.choices[0].message.content.strip().lower()
        logging.info(f"OpenAI answer: {answer}")
        if answer == "yes":
            return True
        elif answer == "no":
            return False
        else:
            logging.warning(f"Unexpected answer from OpenAI: {answer}")
            return False
    except Exception as e:
        logging.error(f"OpenAI API error: {e}")
        return False

In [None]:
for idx, row in tqdm(df.head(50).iterrows(), total=50):
    record_updates = {}
    try:
        summary = row.get("summary", "")
        genres = row.get("genres", "[]")
        
        # Skip if non-English based on summary language detection
        if not is_english(summary):
            logging.info(f"Record {idx} skipped: Non-English summary")
            continue
        
        # Parse genres (if stored as a string representation of a list)
        try:
            genres_list = ast.literal_eval(genres) if isinstance(genres, str) else genres
        except Exception as e:
            logging.error(f"Error parsing genres for record {idx}: {e}")
            genres_list = []
        
        # Prepare candidate text for classification
        candidate_text = summary + " " + " ".join(genres_list)
        
        # Check if the book is labeled as Self Help
        is_self_help = "Self Help" in genres_list
        
        if is_self_help:
            # Use zero-shot classification to assign a Self Help subcategory
            result = classifier(candidate_text, subcategories)
            subcategory = result["labels"][0]  # highest scoring label
            record_updates["self_help_subcategory"] = subcategory
        else:
            # Use OpenAI's LLM to determine relevance for external, health-impacting forces
            record_updates["relevant_non_self_help"] = analyze_external_relevance(candidate_text)
        
        # Generate spectrum labels using zero-shot classification
        # Spectrum 1: Spiritual vs Secular
        spiritual_labels = ["spiritual", "secular"]
        result_spiritual = classifier(candidate_text, spiritual_labels)
        p_spiritual = result_spiritual["scores"][result_spiritual["labels"].index("spiritual")]
        p_secular = result_spiritual["scores"][result_spiritual["labels"].index("secular")]
        total = p_spiritual + p_secular
        record_updates["spiritual_score"] = p_spiritual / total if total else None
        
        # Spectrum 2: Mental Health vs Physical Health
        health_labels = ["mental health", "physical health"]
        result_health = classifier(candidate_text, health_labels)
        p_mental = result_health["scores"][result_health["labels"].index("mental health")]
        p_physical = result_health["scores"][result_health["labels"].index("physical health")]
        total = p_mental + p_physical
        record_updates["mental_health_score"] = p_mental / total if total else None
        
        # Scrape Goodreads page for additional info
        url = row.get("url", "")
        if url:
            scraped = scrape_goodreads(url)
            record_updates.update(scraped)
        
        # Update the dataframe with new data for this record
        for key, value in record_updates.items():
            df.at[idx, key] = value
            
    except Exception as e:
        logging.error(f"Error processing record {idx}: {e}")
        continue

  0%|          | 0/50 [00:00<?, ?it/s]INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: OpenAI answer: yes
INFO: Social container found for URL https://www.goodreads.com/book/show/318028.Il_Gigante: True
  2%|▏         | 1/50 [00:03<02:57,  3.62s/it]INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: OpenAI answer: yes
INFO: Social container found for URL https://www.goodreads.com/book/show/310054.Final_Analysis: True
  4%|▍         | 2/50 [00:06<02:31,  3.15s/it]INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: OpenAI answer: no.
INFO: Social container found for URL https://www.goodreads.com/book/show/542884.Breaking_Loose_Together: True
  6%|▌         | 3/50 [00:08<01:56,  2.48s/it]INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: OpenAI answer: no.
INFO: Social container found for URL https://www.goodreads.com/book/show

# ARCHIVE

In [None]:
# import pandas as pd
# import transformers  # Note: "transforcers" as mentioned in the instructions; using the transformers library

# # Load the CSV into a DataFrame
# df = pd.read_csv("filtered_books_sample_0211.csv")
# print("Loaded initial dataset with", len(df), "records")

# # Filter for books with 200+ ratings
# df = df[df["num_ratings"] >= 200]
# print("Filtered to", len(df), "books with 200+ ratings")

# # Identify records where the "genres" column includes "Nonfiction"
# nonfiction_mask = df["genres"].str.contains("Nonfiction", case=False, na=False)
# print("Found", nonfiction_mask.sum(), "nonfiction books")

# # Set up the zero-shot classification pipeline using the transformers library
# pipe = transformers.pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# print("Initialized classification pipeline")

# # Define candidate labels for classification
# candidate_labels = ["addresses a human problem", "does not address a human problem"]

# # Set a threshold: if the score for "addresses a problem" meets or exceeds this, we mark it True
# threshold = 0.8

# # Helper function to classify a book summary
# def classify_summary(summary):
#     result = pipe(summary, candidate_labels=candidate_labels)
#     # The pipeline returns labels sorted by score (highest first)
#     if result["labels"][0] == "addresses a problem" and result["scores"][0] >= threshold:
#         return True
#     else:
#         return False

# # Apply the classification only on the first 50 filtered (Nonfiction) records and create the new column
# nonfiction_mask_200 = nonfiction_mask & (df.index < 200)
# print("Starting classification of", nonfiction_mask_200.sum(), "books...")
# df.loc[nonfiction_mask_200, "addresses_problem"] = df.loc[nonfiction_mask_200, "summary"].apply(classify_summary)
# print("Classification complete!")

-----------------------------

# for idx, row in tqdm(df.head(10).iterrows(), total=10):
#     record_updates = {}
#     try:
#         summary = row.get("summary", "")
#         genres = row.get("genres", "[]")
        
#         # Skip if non-English based on summary language detection
#         if not is_english(summary):
#             logging.info(f"Record {idx} skipped: Non-English summary")
#             continue
        
#         # Parse genres (if stored as a string representation of a list)
#         try:
#             genres_list = ast.literal_eval(genres) if isinstance(genres, str) else genres
#         except Exception as e:
#             logging.error(f"Error parsing genres for record {idx}: {e}")
#             genres_list = []
        
#         # Prepare candidate text for zero-shot classification
#         candidate_text = summary + " " + " ".join(genres_list)
        
#         # Check if the book is labeled as Self Help
#         is_self_help = "Self Help" in genres_list
        
#         if is_self_help:
#             # Self Help: Use zero-shot classification to assign a subcategory
#             result = classifier(candidate_text, subcategories)
#             subcategory = result["labels"][0]  # highest scoring label
#             record_updates["self_help_subcategory"] = subcategory
#         else:
#             # Non-self-help: Determine relevance to health-impacting forces using zero-shot classification
#             external_labels = ["explores forces, ideas, or subjects that impact human physical or mental health", 
#                              "does not explore forces, ideas, or subjects that impact human physical or mental health"]
#             result_external = classifier(candidate_text, external_labels)
#             score_explore = result_external["scores"][result_external["labels"].index("explores forces, ideas, or subjects that impact human physical or mental health")]
#             score_not_explore = result_external["scores"][result_external["labels"].index("does not explore forces, ideas, or subjects that impact human physical or mental health")]
#             # Mark as relevant if the score for exploring health-impacting forces is higher
#             record_updates["relevant_non_self_help"] = score_explore > score_not_explore
        
#         # Generate spectrum labels using zero-shot classification
#         # Spectrum 1: Spiritual vs Secular
#         spiritual_labels = ["spiritual", "secular"]
#         result_spiritual = classifier(candidate_text, spiritual_labels)
#         p_spiritual = result_spiritual["scores"][result_spiritual["labels"].index("spiritual")]
#         p_secular = result_spiritual["scores"][result_spiritual["labels"].index("secular")]
#         total = p_spiritual + p_secular
#         record_updates["spiritual_score"] = p_spiritual / total if total else None
        
#         # Spectrum 2: Mental Health vs Physical Health
#         health_labels = ["mental health", "physical health"]
#         result_health = classifier(candidate_text, health_labels)
#         p_mental = result_health["scores"][result_health["labels"].index("mental health")]
#         p_physical = result_health["scores"][result_health["labels"].index("physical health")]
#         total = p_mental + p_physical
#         record_updates["mental_health_score"] = p_mental / total if total else None
        
#         # Scrape Goodreads page for additional info
#         url = row.get("url", "")
#         if url:
#             scraped = scrape_goodreads(url)
#             record_updates.update(scraped)
        
#         # Update the dataframe with new data for this record
#         for key, value in record_updates.items():
#             df.at[idx, key] = value
            
#     except Exception as e:
#         logging.error(f"Error processing record {idx}: {e}")
#         continue

