In [46]:
import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import re
import unicodedata
import pandas as pd

In [47]:

def normalize_name(name):
    """Collapse a character name into a bare lowercase ASCII key.

    The name is lowercased, accented letters are reduced to their base
    letters, and every character that is not ``a``-``z`` (spaces, digits,
    punctuation, anything non-ASCII that has no ASCII base) is removed.

    Args:
        name: The display name as a string.

    Returns:
        A string containing only the characters ``a``-``z`` (possibly empty).
    """
    lowered = name.lower()
    # NFKD separates an accented letter into base letter + combining mark; the
    # ASCII encode with 'ignore' then discards the marks and any other non-ASCII.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ASCII', 'ignore').decode()
    return re.sub(r'[^a-z]', '', ascii_only)


In [48]:

# Keyword lists used by infer_gender. The female list is consulted first, so a
# bio that mentions both kinds of keyword is labelled "female".
_FEMALE_KEYWORDS = ["daughter", "wife", "queen", "princess", "goddess", "lady", "mistress", "her name", "she was", "she is"]
_MALE_KEYWORDS = ["son", "husband", "king", "prince", "god", "his name", "he was", "he is"]


def _keyword_regex(keywords):
    """Compile a regex matching any keyword as a whole word, optionally with a simple -s/-es plural."""
    alternatives = "|".join(re.escape(word) for word in keywords)
    # \b on both sides stops "son" matching inside "person" or "king" inside
    # "seeking"; the optional suffix keeps plurals such as "gods" or "sons".
    return re.compile(rf"\b(?:{alternatives})(?:e?s)?\b")


_FEMALE_RE = _keyword_regex(_FEMALE_KEYWORDS)
_MALE_RE = _keyword_regex(_MALE_KEYWORDS)


def infer_gender(paragraph):
    """Guess a character's gender from keywords in a descriptive paragraph.

    Matching is case-insensitive and on whole words only, so keywords embedded
    in longer words ("person", "looking", "godless") do not count.

    Args:
        paragraph: Free-text description of the character. Empty or ``None``
            input is treated as carrying no evidence.

    Returns:
        ``"female"`` if any female keyword occurs, otherwise ``"male"`` if any
        male keyword occurs, otherwise ``"unknown"``.
    """
    if not paragraph:
        return "unknown"

    text = paragraph.lower()
    if _FEMALE_RE.search(text):
        return "female"
    if _MALE_RE.search(text):
        return "male"
    return "unknown"


In [31]:
async def scrape_characters_playwright(slug):
    """Scrape the SparkNotes character list for one work using headless Chromium.

    Every ``<h3>`` on the page is treated as a character name and the element
    that immediately follows it as that character's bio.

    Args:
        slug: The SparkNotes path segment for the work (e.g. ``"odyssey"``).

    Returns:
        A list of dicts with keys ``"name"``, ``"normalized"``, ``"gender"``
        and ``"bio"``. Errors raised while loading or reading the page are
        printed, and whatever was collected up to that point (possibly an
        empty list) is returned.
    """
    url = f"https://www.sparknotes.com/lit/{slug}/characters/"
    characters = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        try:
            print(f"Opening: {url}")
            await page.goto(url, timeout=10000)
            await page.wait_for_selector("h3", timeout=5000)

            headers = await page.query_selector_all("h3")
            for header in headers:
                char_name = (await header.inner_text()).strip()
                # Resolve the sibling's text in a single evaluate call. A handle
                # returned by evaluate_handle is a truthy Python object even when
                # it wraps JS null, so it cannot be used to detect a missing
                # sibling; a plain evaluate returns None in that case.
                bio_text = await header.evaluate(
                    "el => el.nextElementSibling ? el.nextElementSibling.textContent : null"
                )
                if bio_text is None:
                    # Heading with nothing after it: skip instead of aborting the loop.
                    continue
                bio_text = bio_text.strip()
                gender = infer_gender(bio_text)
                characters.append({
                    "name": char_name,
                    "normalized": normalize_name(char_name),
                    "gender": gender,
                    "bio": bio_text
                })

        except Exception as e:
            # Best-effort scraper: report the problem and return what we have.
            print(f"Error scraping {slug}: {e}")
        finally:
            await browser.close()

    return characters

In [35]:
# Smoke-test the SparkNotes scraper on a single slug. The scraper prints any
# scraping error itself and returns the list collected so far, so a failure
# shows up here as an empty list rather than an exception.
characters_odyssey = await scrape_characters_playwright("odyssey")



Opening: https://www.sparknotes.com/lit/odyssey/characters/
Error scraping odyssey: Page.wait_for_selector: Timeout 5000ms exceeded.
Call log:
  - waiting for locator("h3") to be visible



In [36]:
# Inspect the scraped result; an empty list means no characters were collected.
print(characters_odyssey)

[]


In [50]:
# Load the table of source works. The path is relative to the kernel's working
# directory, so the CSV must sit next to wherever the notebook is run from.
# NOTE(review): presumably this frame supplies the "source_title" column that
# the bulk-scrape cell iterates over — confirm, and run this cell before it.
df = pd.read_csv("epic_books5.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'epic_books5.csv'

In [38]:
# Install the `wikipedia` client imported by the lookup helpers.
# NOTE(review): consider `%pip install wikipedia==1.4.0` instead, so the install
# targets the running kernel's environment and the version is pinned.
!pip install wikipedia


Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25ldone
[?25h  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=c18d4e4dabcd0b61fa8de732b236c932e30f8a893f8e9f285b10603c0bb34936
  Stored in directory: /Users/harshitachakravadhanula./Library/Caches/pip/wheels/b2/7f/26/524faff9145e274da278dc97d63ab0bfde1f791ecf101a9c95
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0


In [39]:
import wikipedia

def get_character_page(title):
    """Find the URL of a Wikipedia "characters" page for a literary work.

    Searches Wikipedia for ``"<title> characters"`` and picks the first result
    whose title contains both "character" and the work's title
    (case-insensitive); if none qualifies, the top search result is used.

    Args:
        title: Name of the work, e.g. ``"Mahabharata"``.

    Returns:
        The page URL as a string, or ``None`` if the search returned nothing
        or the lookup failed (the failure is printed).
    """
    try:
        results = wikipedia.search(f"{title} characters")
        if not results:
            # Report the empty search explicitly rather than via an IndexError.
            print(f"Failed for {title}: no search results")
            return None

        title_lower = title.lower()
        chosen = next(
            (r for r in results if "character" in r.lower() and title_lower in r.lower()),
            results[0],  # fallback: top hit
        )
        # The title comes straight from search results, so it is already exact;
        # auto_suggest=False stops the library from swapping it for a "suggested"
        # title that may resolve to a different article.
        return wikipedia.page(chosen, auto_suggest=False).url
    except Exception as e:
        # Best-effort lookup: report and signal failure with None.
        print(f"Failed for {title}: {e}")
        return None

# Example: spot-check the lookup on three epics. Each call prints the URL it
# resolved, or None if the lookup failed.
print(get_character_page("Odyssey"))
print(get_character_page("Mahabharata"))
print(get_character_page("Shahnameh"))


https://en.wikipedia.org/wiki/List_of_Adventures_in_Odyssey_characters
https://en.wikipedia.org/wiki/List_of_characters_in_the_Mahabharata
https://en.wikipedia.org/wiki/List_of_Shahnameh_characters


In [41]:


def get_character_page2(title):
    """Find the URL of a Wikipedia "characters" page for a literary work.

    Searches Wikipedia for ``"<title> characters"`` and picks a result using
    these tiers, in order:

    1. a "character" result whose title is *only* the work's title plus filler
       words ("List of ... characters", "... in the ..."), so a spin-off such
       as "List of Adventures in Odyssey characters" does not beat the work's
       own list when that list is among the results;
    2. a "character" result whose title merely contains the work's title;
    3. any result whose title contains "character";
    4. the top search result.

    Args:
        title: Name of the work, e.g. ``"Mahabharata"``.

    Returns:
        The page URL as a string, or ``None`` if the search returned nothing
        or the lookup failed (the failure is printed).
    """
    # Words that carry no identity in "List of <work> characters"-style titles.
    filler = r"\b(?:list|of|in|the|characters?)\b"

    def core_words(text):
        # Lowercase, drop filler words, and keep the remaining word tokens.
        return re.findall(r"\w+", re.sub(filler, " ", text.lower()))

    try:
        query = f"{title} characters"
        results = wikipedia.search(query)
        if not results:
            # Report the empty search explicitly rather than via an IndexError.
            print(f"Failed for {title}: no search results")
            return None

        title_lower = title.lower()
        wanted = core_words(title)
        character_results = [r for r in results if "character" in r.lower()]

        tiers = (
            [r for r in character_results if wanted and core_words(r) == wanted],
            [r for r in character_results if title_lower in r.lower()],
            character_results,  # fallback: any list of characters
            results,            # ultimate fallback: top hit
        )
        # `results` is non-empty here, so at least the last tier yields a value.
        chosen = next(tier[0] for tier in tiers if tier)

        # The title comes straight from search results, so it is already exact;
        # auto_suggest=False stops the library from swapping it for a "suggested"
        # title that may resolve to a different article.
        return wikipedia.page(chosen, auto_suggest=False).url
    except Exception as e:
        # Best-effort lookup: report and signal failure with None.
        print(f"Failed for {title}: {e}")
        return None


In [42]:
print(get_character_page2("Odyssey"))        # goal: the epic's own list, not the "Adventures in Odyssey" spin-off (which the recorded run returned)
print(get_character_page2("Mahabharata"))    # recorded run resolved to the work's character list
print(get_character_page2("Shahnameh"))      # recorded run resolved to the work's character list
print(get_character_page2("Divine Comedy"))  # exploratory: no expected page pinned down for this title


https://en.wikipedia.org/wiki/List_of_Adventures_in_Odyssey_characters
https://en.wikipedia.org/wiki/List_of_characters_in_the_Mahabharata
https://en.wikipedia.org/wiki/List_of_Shahnameh_characters
https://en.wikipedia.org/wiki/Divine_Comedy_Illustrated_by_Botticelli


In [43]:
async def scrape_shmoop_characters(slug):
    """Scrape the Shmoop character list for one study guide using headless Chromium.

    Every ``.panel-heading h4`` element is treated as a character name, and the
    element following the heading's parent as that character's bio.

    Args:
        slug: The Shmoop study-guide path segment (e.g. ``"gilgamesh"``).

    Returns:
        A list of dicts with keys ``"name"``, ``"normalized"``, ``"gender"``
        and ``"bio"``. Errors raised while loading or reading the page are
        printed, and whatever was collected up to that point (possibly an
        empty list) is returned.
    """
    url = f"https://www.shmoop.com/study-guides/{slug}/characters.html"
    characters = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        try:
            print(f"Opening: {url}")
            await page.goto(url, timeout=15000)
            await page.wait_for_selector(".panel-heading h4", timeout=8000)

            headings = await page.query_selector_all(".panel-heading h4")

            for heading in headings:
                char_name = (await heading.inner_text()).strip()
                # Resolve the body text in a single evaluate call. A handle
                # returned by evaluate_handle is a truthy Python object even when
                # it wraps JS null, so it cannot be used to detect a missing
                # panel body; a plain evaluate returns None in that case.
                bio_text = await heading.evaluate(
                    "el => {"
                    " const body = el.parentElement && el.parentElement.nextElementSibling;"
                    " return body ? body.textContent : null;"
                    " }"
                )
                if bio_text is None:
                    # Heading without a following body: skip instead of aborting the loop.
                    continue
                bio_text = bio_text.strip()
                gender = infer_gender(bio_text)
                characters.append({
                    "name": char_name,
                    "normalized": normalize_name(char_name),
                    "gender": gender,
                    "bio": bio_text
                })

        except Exception as e:
            # Best-effort scraper: report the problem and return what we have.
            print(f"Error scraping {slug}: {e}")
        finally:
            await browser.close()

    return characters

In [44]:
# Smoke-test the Shmoop scraper on one study guide and print each scraped
# character dict on its own line.
characters = await scrape_shmoop_characters("gilgamesh")
for c in characters: print(c)

Opening: https://www.shmoop.com/study-guides/gilgamesh/characters.html
{'name': 'Gilgamesh', 'normalized': 'gilgamesh', 'gender': 'male', 'bio': "The hero of our tale: a cocky, selfish young king who befriends a half man/half beast, goes on fantastic adventures with him. When his new, beloved friend dies, Gilgamesh realizes there's no room i..."}
{'name': 'Enkidu', 'normalized': 'enkidu', 'gender': 'male', 'bio': 'Half-man/half-beast bestie of Gilgamesh. He basically symbolizes the natural, non-civilized world. He faces an early death as punishment from the gods for all the trouble that he and Gilgamesh got...'}
{'name': 'Utanapishtim', 'normalized': 'utanapishtim', 'gender': 'male', 'bio': 'A human given immortality who lives on the other side of earth—literally. Gilgamesh visits him to discover the secret of immortality, but ends up empty-handed.The Gods Told You What?Utanapishtim...'}
{'name': 'Shamhat', 'normalized': 'shamhat', 'gender': 'unknown', 'bio': "Even though she only appe

In [45]:


# Scrape Shmoop character data for every distinct, non-null value in
# df["source_title"], collecting one record per title.
# NOTE(review): each title is passed straight through as the Shmoop URL slug —
# confirm that source_title holds slugs (e.g. "gilgamesh") rather than display titles.
characters_list = []

for title in df["source_title"].dropna().unique():
    try:
        print(f"Scraping: {title}")
        char_info = await scrape_shmoop_characters(title)
        characters_list.append({
            "source_title": title,
            "characters_info": char_info
        })
    except Exception as e:
        # The scraper prints and swallows errors raised while loading/reading the
        # page, so this branch is only reached for failures outside that handling
        # (e.g. before its own try block); such titles are recorded with None.
        print(f"Skipping {title}: {e}")
        characters_list.append({
            "source_title": title,
            "characters_info": None
        })

# One row per title. Note this rebinds `characters`, which an earlier cell used
# for a plain list of character dicts.
characters = pd.DataFrame(characters_list)


NameError: name 'df' is not defined