# Draft

## Exploratory requests

In [None]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from urllib.parse import urljoin

from pygments import highlight
from pygments.formatters import TerminalFormatter
from pygments.lexers import HtmlLexer

In [48]:
url = "https://myanimelist.net/manga/2/stats"

In [49]:
response = requests.Session().get(url)
soup = BeautifulSoup(response.text, "html.parser")

response.raise_for_status()

In [50]:
cwd_path = Path().cwd()
data_path = cwd_path.parent / "data" / "raw"
file_path = data_path / "berserk_stats.html"

with file_path.open("w") as f:
    f.write(soup.prettify())

## Extracting data

In [51]:
with file_path.open("r") as f:
    raw_html = f.read()

stats_soup = BeautifulSoup(raw_html, "html.parser")
soup = BeautifulSoup(raw_html, "html.parser")

In [97]:
def extract_stats(
    url: str | None = None,
    session: requests.Session = None,
    soup: BeautifulSoup = stats_soup,
) -> dict[str, int | str | list[str] | None]:
    # session = session or requests.Session()

    # response = session.get(url)
    # response.raise_for_status()

    # soup = BeautifulSoup(response.text, "html.parser")

    stats = {}

    for div in soup.select("div[class*='spaceit_pad']"):
        label = div.find("span", class_="dark_text")

        if label:
            key = label.text.strip().rstrip(":").lower()
            links = div.find_all("a")
            # Get the text after the label and clean it
            if len(links) > 1:
                value = [link.text.strip() for link in links]
            elif len(links) == 1:
                value = links[0].text.strip()
            else:
                value = label.next_sibling.strip()
            stats[key] = value
            print(f"{key}: {value}")

    return stats


extract_stats()

synonyms: Berserk: The Prototype
japanese: ベルセルク
english: Berserk
type: Manga
volumes: Unknown
chapters: Unknown
status: Publishing
published: Aug  25, 1989 to ?
genres: ['Action', 'Adventure', 'Award Winning', 'Drama', 'Fantasy', 'Horror', 'Supernatural']
themes: ['Gore', 'Military', 'Mythology', 'Psychological']
demographic: Seinen
serialization: Young Animal
authors: ['Miura, Kentarou', 'Studio Gaga']
score: weighted score
ranked: top manga
popularity: #1
members: 732,000
favorites: 131,366


{'synonyms': 'Berserk: The Prototype',
 'japanese': 'ベルセルク',
 'english': 'Berserk',
 'type': 'Manga',
 'volumes': 'Unknown',
 'chapters': 'Unknown',
 'status': 'Publishing',
 'published': 'Aug  25, 1989 to ?',
 'genres': ['Action',
  'Adventure',
  'Award Winning',
  'Drama',
  'Fantasy',
  'Horror',
  'Supernatural'],
 'themes': ['Gore', 'Military', 'Mythology', 'Psychological'],
 'demographic': 'Seinen',
 'serialization': 'Young Animal',
 'authors': ['Miura, Kentarou', 'Studio Gaga'],
 'score': 'weighted score',
 'ranked': 'top manga',
 'popularity': '#1',
 'members': '732,000',
 'favorites': '131,366'}

In [47]:
max_iterations = 1


for i, item in enumerate(soup.select("a[class*=fs14]")):
    if i >= max_iterations:
        break

    url = item["href"]
    stats_url = urljoin(url, "stats")
    name = item.text.strip()

    stats_data = extract_stats(stats_url, session)

    # colored_html = highlight(item.prettify(), HtmlLexer(), TerminalFormatter())
    print(name, stats_url, sep=": ")

Berserk: https://myanimelist.net/manga/2/stats


In [91]:
fields = {}



synonyms: Berserk: The Prototype
japanese: ベルセルク
english: Berserk
type: Manga
volumes: Unknown
chapters: Unknown
status: Publishing
published: Aug  25, 1989 to ?
genres: ['Action', 'Adventure', 'Award Winning', 'Drama', 'Fantasy', 'Horror', 'Supernatural']
themes: ['Gore', 'Military', 'Mythology', 'Psychological']
demographic: Seinen
serialization: Young Animal
authors: ['Miura, Kentarou', 'Studio Gaga']
score: weighted score
ranked: top manga
popularity: #1
members: 732,000
favorites: 131,366
