In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the full book catalogue from the Wolne Lektury API.
# Later cells treat `books` as a list of dicts and read each entry's
# "genre" and "href" keys.
books_url = "https://wolnelektury.pl/api/books/"
# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(books_url, timeout=60)
# Fail right here on an HTTP error instead of on a confusing JSON error below.
response.raise_for_status()
books = response.json()

In [3]:
# Genre labels (as spelled in the catalogue) that count as poetry for this dataset.
poetry_genres = [
    "Anakreontyk",
    "Ballada",
    "Dramat poetycki",
    "Dramat wierszowany",
    "Epigramat",
    "Epos",
    "Erotyk",
    "Fraszka",
    "Hymn",
    "Oda",
    "Pieśń",
    "Poemat",
    "Poemat alegoryczny",
    "Poemat dygresyjny",
    "Poemat heroikomiczny",
    "Psalm",
    "Satyra",
    "Sielanka",
    "Sonet",
    "Tren",
    "Wiersz",
    "Wiersz sylabotoniczny",
]

# Keep only the catalogue entries whose "genre" value is exactly one of the
# labels above; entries without a "genre" key yield None and are dropped.
poetry_books = list(
    filter(lambda entry: entry.get("genre") in poetry_genres, books)
)

In [4]:
# Number of catalogue entries kept by the genre filter.
len(poetry_books)

4955

In [21]:
import random

# Preview a few filtered entries to see which fields the list endpoint returns.
# A dedicated, seeded generator makes the preview identical on every re-run
# without touching the global random state. The sample size is capped so a
# list shorter than five entries cannot raise ValueError.
test = random.Random(42).sample(poetry_books, min(5, len(poetry_books)))
test

[{'kind': 'Liryka',
  'full_sort_key': 'weintraub krzyz~1anowski jerzy kamil$do krzysztofa baczyn~0skiego albo elegia nocy zimowej$6640',
  'title': 'Do Krzysztofa Baczyńskiego albo elegia nocy zimowej',
  'url': 'https://wolnelektury.pl/katalog/lektura/weintraub-do-krzysztofa-baczynskiego-albo-elegia/',
  'cover_color': '#06393d',
  'author': 'Jerzy Kamil Weintraub-Krzyżanowski',
  'cover': 'book/cover/weintraub-do-krzysztofa-baczynskiego-albo-elegia.jpg',
  'epoch': 'Współczesność',
  'href': 'https://wolnelektury.pl/api/books/weintraub-do-krzysztofa-baczynskiego-albo-elegia/',
  'has_audio': False,
  'genre': 'Wiersz',
  'simple_thumb': 'https://wolnelektury.pl/media/book/cover_api_thumb/weintraub-do-krzysztofa-baczynskiego-albo-elegia_HufB6b2.jpg',
  'slug': 'weintraub-do-krzysztofa-baczynskiego-albo-elegia',
  'cover_thumb': 'book/cover_thumb/weintraub-do-krzysztofa-baczynskiego-albo-elegia_30tTyFF.jpg',
  'liked': None},
 {'kind': 'Liryka',
  'full_sort_key': 'radczyn~0ska misiur

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import xml.etree.ElementTree as ET
import re

In [25]:
def fetch_motifs_html(soup):
    """Return the motif names listed in the page's ``<div id="themes">``.

    Each ``<li>`` inside that div contributes the part of its text that
    precedes the first colon, with surrounding whitespace removed.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find`` API.

    Returns:
        list[str]: Motif names in document order; an empty list when the
        page has no themes section.
    """
    container = soup.find("div", id="themes")
    if not container:
        return []

    return [
        item.get_text(strip=True).partition(":")[0].strip()
        for item in container.find_all("li")
    ]


def fetch_motifs_xml(root):
    """Collect motif names from every ``<motyw>`` element below ``root``.

    For each element, the name is the part of its text before the first
    colon, with surrounding whitespace removed.

    Args:
        root: ``xml.etree.ElementTree.Element`` to search.

    Returns:
        list[str]: Non-empty motif names in document order.
    """
    motifs = []
    for motif in root.findall(".//motyw"):
        # Element.text is None for an empty element such as <motyw/>; calling
        # .strip() on it directly would raise AttributeError.
        raw_text = motif.text or ""
        motif_text = raw_text.strip().split(":")[0].strip()
        # Skip blank names so the joined motif string has no empty slots.
        if motif_text:
            motifs.append(motif_text)
    return motifs


def remove_references(text):
    """Delete bracketed numeric markers such as ``[12]`` from ``text``.

    Presumably these are footnote references embedded in the verse text.
    Brackets that contain anything other than digits are left alone.
    """
    numeric_marker = re.compile(r"\[\d+\]")
    return numeric_marker.sub("", text)


def remove_theme_begin(soup):
    """Remove every ``<a class="theme-begin">`` element from ``soup`` in place.

    Args:
        soup: Parsed document exposing BeautifulSoup's ``find_all`` API.

    Returns:
        None. The document is mutated.
    """
    markers = soup.find_all("a", class_="theme-begin")
    for marker in markers:
        marker.decompose()


def fetch_poem_content_html(url, timeout=30):
    """Download a poem's HTML page and extract its verses and motifs.

    Args:
        url: Address of the HTML rendition; may be empty or None.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        tuple: ``(poem_body, motifs, "HTML")`` where ``poem_body`` is the
        text of every ``<div class="verse">`` joined with newlines and
        ``motifs`` is a comma-separated string of motif names.
        ``(None, None, None)`` when ``url`` is empty, the request fails
        (connection error, timeout) or the status code is not 200.
    """
    failure = (None, None, None)
    if not url:
        return failure

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report a network failure the same way as a non-200 response, so one
        # unreachable page does not abort a scrape over thousands of books.
        return failure
    if response.status_code != 200:
        return failure

    soup = BeautifulSoup(response.content, 'html.parser')
    # Strip the theme-begin anchors before reading any text; presumably this
    # keeps their labels out of the extracted verses.
    remove_theme_begin(soup)

    motifs = fetch_motifs_html(soup)

    poem_body = "\n".join(
        remove_references(line.get_text().strip())
        for line in soup.find_all("div", class_="verse")
    )
    return poem_body, ", ".join(motifs), "HTML"


def fetch_poem_content_xml(url, timeout=30):
    """Download a poem's XML source and extract its stanzas and motifs.

    Args:
        url: Address of the XML rendition; may be empty or None.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        tuple: ``(poem_body, motifs, "XML")`` where ``poem_body`` is the
        leading text of every ``<strofa>`` element joined with newlines and
        ``motifs`` is a comma-separated string of motif names.
        ``(None, None, None)`` when ``url`` is empty, the request fails,
        the status code is not 200 or the payload is not well-formed XML.
    """
    failure = (None, None, None)
    if not url:
        return failure

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report a network failure the same way as a non-200 response, so one
        # unreachable file does not abort a scrape over thousands of books.
        return failure
    if response.status_code != 200:
        return failure

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        # A malformed document is just another unusable source.
        return failure
    motifs = fetch_motifs_xml(root)

    # NOTE(review): Element.text holds only the text that precedes the first
    # child element, so a stanza that starts with nested markup contributes
    # little or nothing here. Confirm against real files.
    poem_body = "\n".join(
        remove_references(line.text.strip())
        for line in root.findall(".//strofa")
        if line.text
    )
    return poem_body, ", ".join(motifs), "XML"


# Build the poem dataset: for every filtered catalogue entry, fetch its detail
# record, download the text (HTML preferred, XML as fallback) and collect one
# row per poem. The result is written to "poems_database.csv".

# Seconds to wait for the detail request; a stalled connection must not hang
# a loop that issues thousands of requests.
REQUEST_TIMEOUT = 30

i = 0  # entries processed so far (entries without an "href" are not counted)
poems_data = []
for book in poetry_books:
    book_url = book.get("href")
    if not book_url:
        continue

    i += 1
    if i % 100 == 0:
        print(f"batch {i}")

    content = None
    motifs = None
    source = None

    try:
        response = requests.get(book_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        book_data = response.json()

        html_url = book_data.get("html")
        xml_url = book_data.get("xml")

        if html_url and html_url.strip():
            content, motifs, source = fetch_poem_content_html(html_url)
        # Use XML when there is no HTML link and also when the HTML page
        # yielded no verse text, instead of dropping the poem outright.
        if not content and xml_url and xml_url.strip():
            content, motifs, source = fetch_poem_content_xml(xml_url)
    except (requests.RequestException, ValueError, ET.ParseError) as exc:
        # One bad entry (network error, HTTP error, invalid JSON or XML) must
        # not discard everything collected so far; report it and move on.
        print(f"skipped {book_url}: {exc}")
        continue

    if not content:
        continue

    poems_data.append({
        "author": ", ".join(author["name"] for author in book_data.get("authors", [])),
        "title": book_data.get("title"),
        "content": content,
        "genre": ", ".join(genre["name"] for genre in book_data.get("genres", [])),
        "motifs": motifs,
        "source": source
    })

poems_df = pd.DataFrame(poems_data)

poems_df.to_csv("poems_database.csv", index=False, encoding='utf-8')

batch 100
batch 200
batch 300
batch 400
batch 500
batch 600
batch 700
batch 800
batch 900
batch 1000
batch 1100
batch 1200
batch 1300
batch 1400
batch 1500
batch 1600
batch 1700
batch 1800
batch 1900
batch 2000
batch 2100
batch 2200
batch 2300
batch 2400
batch 2500
batch 2600
batch 2700
batch 2800
batch 2900
batch 3000
batch 3100
batch 3200
batch 3300
batch 3400
batch 3500
batch 3600
batch 3700
batch 3800
batch 3900
batch 4000
batch 4100
batch 4200
batch 4300
batch 4400
batch 4500
batch 4600
batch 4700
batch 4800
batch 4900


In [27]:
def _count_blank(frame, column):
    """Return how many rows of ``frame`` have a null or empty-string ``column``."""
    values = frame[column]
    blank_mask = values.isnull() | (values == "")
    return frame[blank_mask].shape[0]


# Data-quality check: how many scraped rows lack text, motifs or a genre.
empty_content_count = _count_blank(poems_df, 'content')
empty_motifs_count = _count_blank(poems_df, 'motifs')
empty_genre_count = _count_blank(poems_df, 'genre')

# The keys are Polish report labels: empty "Content", "Motifs" and "Genre".
analysis_results = {
    "Puste 'Treść'": empty_content_count,
    "Puste 'Motywy'": empty_motifs_count,
    "Puste 'Gatunek'": empty_genre_count
}

print(analysis_results)

{"Puste 'Treść'": 0, "Puste 'Motywy'": 2327, "Puste 'Gatunek'": 0}
