In [None]:
import requests
import re
from bs4 import BeautifulSoup

# Step 1: Fetch all books
books_url = "https://wolnelektury.pl/api/books/"
response = requests.get(books_url)
books = response.json()

# Step 2: Filter for genre "Wiersz"
poems = [book for book in books if book.get("kinds") == "Liryka"]

# Step 3: Limit to the first 5 poems
test_poems = poems[:5]

# Step 4: Extract poem details
for poem in test_poems:
    title = poem.get("title")
    author = poem.get("author")
    details_url = poem.get("href")  # Detailed info about the poem

    # Fetch detailed info
    details_response = requests.get(details_url)
    details = details_response.json()

    # Step 5: Get the HTML text
    html_url = details.get("html")  # Full text in HTML format
    html_response = requests.get(html_url)
    soup = BeautifulSoup(html_response.content, "html.parser")

    # Extract the poem content (verses are typically in <div> tags)
    poem_body = "\n".join([line.get_text() for line in soup.find_all("div", class_="verse")])
    poem_body_cleaned = re.sub(r"\[.*?\]", "", poem_body)

    # Print the poem details
    print(f"Title: {title}\nAuthor: {author}\nPoem:\n{poem_body_cleaned}\n")

Title: [Ach! rzucić sieci zdarte...]
Author: Tristan Derème
Poem:

Poeta, Kondycja ludzkaAch! rzucić sieci zdarte, wędki i robaki,
I te książki! Być prostym, jak wiejskie chłopaki,
Być nie-smutnym, podrwiwać z mieszczuchów kataru,
W chłodne wieczory brzdąkać, szczycić się gitarą,
Na miłostkach poprzestać, nie wzdychać: niestety,
Są szczęśliwi, gdy grube złożą triolety[1],
Kiedy rymują: sągi-ongi, żonki-dżonki.
Lecz czuć, że życie siecią jest gęstej koronki,
Pod którą drga twarz obca, jej się, blady, boję…
Wokół twej kiści nagiej wiją się powoje.


Title: A co wam śpiewać
Author: Maria Konopnicka
Poem:
A co wam śpiewać, laleczki?
Bo umiem różne piosneczki:
Takie piosneczki i pieśni,
O jakich lalkom się nie śni!
Umiem piosenki z nad łąki,
Tak jak je nucą skowronki,
Kiedy piórkami szaremi
Pod niebo lecą od ziemi,
Nad ziemią lecą i dzwonią,
Nad polem naszem, nad błonią.
Umiem piosenkę jaskółki,
Gdy lata koło rzeczułki,
I wdzięcznym głoskiem coś nuci,
Czy się weseli, czy smuci,
Albo na gnia

In [25]:
import requests
from bs4 import BeautifulSoup
import re
from transformers import pipeline

# Define the Poem class
class Poem:
    def __init__(self, title, author, content):
        self.title = title
        self.author = author
        self.content = content
        self.themes = []

    def add_theme(self, theme):
        """Add a theme to the poem if it isn't already present."""
        if theme not in self.themes:
            self.themes.append(theme)

    def to_dict(self):
        """Convert the poem object to a dictionary."""
        return {
            "Title": self.title,
            "Author": self.author,
            "Content": self.content,
            "Themes": ", ".join(self.themes)
        }

# Initialize the Hugging Face zero-shot classification pipeline with mBERT
classifier = pipeline(
    "zero-shot-classification",
    model="bert-base-multilingual-cased",
    tokenizer="bert-base-multilingual-cased"
)

# Comprehensive list of candidate themes
candidate_labels = [
    "nadzieja", "szczęście", "smutek", "miłość", "natura",
    "wojna", "pokój", "wolność", "cierpienie", "radość",
    "samotność", "przyjaźń", "religia", "czas", "przemijanie",
    "życie", "śmierć", "rodzina", "tęsknota", "marzenia"
]

def extract_poem_text(poem_data):
    """
    Extract the full poem text from poem_data.
    Args:
        poem_data (dict): The JSON object containing poem metadata.
    Returns:
        str: The cleaned text of the poem.
    """
    # Step 1: Get the detailed poem data
    details_url = poem_data.get("href")
    details_response = requests.get(details_url)
    details = details_response.json()

    # Step 2: Get the HTML link for the poem's text
    html_url = details.get("html")
    html_response = requests.get(html_url)

    # Step 3: Parse the HTML content
    soup = BeautifulSoup(html_response.content, "html.parser")
    poem_body = "\n".join([line.get_text() for line in soup.find_all("div", class_="verse")])

    # Step 4: Clean up the poem text
    poem_body_cleaned = re.sub(r"\[.*?\]", "", poem_body)  # Remove annotations in square brackets

    return poem_body_cleaned

def analyze_themes(poem, candidate_labels):
    """
    Analyze themes using zero-shot classification and add themes with score >0.6.
    Args:
        poem (Poem): The Poem object to analyze.
        candidate_labels (list): List of possible themes to classify.
    """
    # Perform zero-shot classification
    result = classifier(poem.content, candidate_labels=candidate_labels, multi_label=True)

    print(f"\nAnalyzing themes for: {poem.title}")
    print("Candidate themes and scores:")
    for label, score in zip(result["labels"], result["scores"]):
        print(f"  {label}: {score:.2f}")

    # Add themes with scores > 0.6 to the poem
    for label, score in zip(result["labels"], result["scores"]):
        if score > 0.6:
            poem.add_theme(label)

# Main script logic
# Step 1: Fetch all books
# books_url = "https://wolnelektury.pl/api/books/"
# response = requests.get(books_url)
# books = response.json()

# Step 2: Filter for genre "Wiersz"
poems_data = [book for book in books if book.get("genre") == "Wiersz"]

# Step 3: Limit to the first 5 poems for testing
test_poems = poems_data[:5]

# Step 4: Process poems and analyze themes
poems = []

for poem_data in test_poems:
    title = poem_data.get("title")
    author = poem_data.get("author")
    content = extract_poem_text(poem_data)

    # Create a Poem object
    poem = Poem(title=title, author=author, content=content)

    # Analyze and add themes
    analyze_themes(poem, candidate_labels)

    poems.append(poem)

# Step 5: Save poems to a CSV file
import csv

def save_poems_to_csv(poems, filename="poems_with_themes.csv"):
    """Save a list of poems to a CSV file."""
    with open(filename, mode="w", encoding="utf-8", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=["Title", "Author", "Content", "Themes"])
        writer.writeheader()
        for poem in poems:
            writer.writerow(poem.to_dict())

save_poems_to_csv(poems)

print(f"Saved {len(poems)} poems with themes to 'poems_with_themes.csv'!")

loading configuration file config.json from cache at C:\Users\karol\.cache\huggingface\hub\models--bert-base-multilingual-cased\snapshots\3f076fdb1ab68d5b2880cb87a0886f315b8146f8\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "voc


Analyzing themes for: [Ach! rzucić sieci zdarte...]
Candidate themes and scores:
  życie: 0.53
  radość: 0.53
  szczęście: 0.53
  śmierć: 0.52
  pokój: 0.52
  miłość: 0.52
  smutek: 0.51
  wojna: 0.51
  przyjaźń: 0.51
  marzenia: 0.51
  nadzieja: 0.50
  czas: 0.50
  tęsknota: 0.50
  cierpienie: 0.50
  rodzina: 0.50
  wolność: 0.50
  natura: 0.50
  religia: 0.50
  przemijanie: 0.50
  samotność: 0.50

Analyzing themes for: A co wam śpiewać
Candidate themes and scores:
  czas: 0.52
  radość: 0.52
  smutek: 0.51
  miłość: 0.51
  życie: 0.51
  śmierć: 0.51
  szczęście: 0.51
  wojna: 0.51
  nadzieja: 0.51
  pokój: 0.51
  przyjaźń: 0.50
  rodzina: 0.50
  marzenia: 0.50
  tęsknota: 0.50
  natura: 0.50
  wolność: 0.50
  religia: 0.50
  cierpienie: 0.50
  przemijanie: 0.50
  samotność: 0.50

Analyzing themes for: Pieśń II, 3 (Aequam memento rebus in arduis...)
Candidate themes and scores:
  przemijanie: 0.55
  radość: 0.55
  wojna: 0.54
  śmierć: 0.54
  czas: 0.53
  smutek: 0.53
  pokój: 0.53
 

In [26]:
import sqlite3

db_name = "poems.db"

def initialize_database():
    """Create a new SQLite database and initialize the poems table."""
    conn = sqlite3.connect(db_name)  # Create or connect to the database
    cursor = conn.cursor()

    # Create the poems table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS poems (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            author TEXT NOT NULL,
            content TEXT NOT NULL,
            themes TEXT
        )
    """)
    conn.commit()  # Save changes
    conn.close()  # Close the connection

# Call the function to initialize the database
initialize_database()
print(f"Database '{db_name}' has been initialized.")

Database 'poems.db' has been initialized.


In [41]:
import requests
from bs4 import BeautifulSoup
import re
import csv
from datetime import datetime
import pdfplumber

# Function to fetch and clean the poem content
def fetch_poem_data(poem_data):
    title = poem_data.get("title")
    author = poem_data.get("author")
    details_url = poem_data.get("href")

    # Fetch detailed info
    details_response = requests.get(details_url)
    details = details_response.json()

    # Get the HTML content of the poem
    html_url = details.get("html")
    if html_url:
        html_response = requests.get(html_url)
        soup = BeautifulSoup(html_response.content, "html.parser")
        poem_body = "\n".join([line.get_text() for line in soup.find_all("div", class_="verse")])
        poem_body_cleaned = re.sub(r"\[.*?\]", "", poem_body)
    else:
        # Fallback to PDF
        pdf_url = details.get("pdf")
        if pdf_url:
            pdf_response = requests.get(pdf_url)
            with open("temp_poem.pdf", "wb") as pdf_file:
                pdf_file.write(pdf_response.content)
            with pdfplumber.open("temp_poem.pdf") as pdf_doc:
                poem_body = ""
                for page in pdf_doc.pages:
                    poem_body += page.extract_text()
            poem_body_cleaned = re.sub(r"\[.*?\]", "", poem_body.strip())
        else:
            # Fallback to TXT
            txt_url = details.get("txt")
            if txt_url:
                print(f"Fetching poem content from TXT for: {details.get('title')}")
                txt_response = requests.get(txt_url)
                poem_body_cleaned = re.sub(r"\[.*?\]", "", txt_response.text.strip())
            else: poem_body_cleaned = None

    # Return cleaned data
    return {
        "title": title,
        "author": author,
        "content": poem_body_cleaned
    }

# Step 1: Fetch all books
# books_url = "https://wolnelektury.pl/api/books/"
# response = requests.get(books_url)
# books = response.json()

# Step 2: Filter for genre "Wiersz"
poems_data = [book for book in books if book.get("genre") == "Wiersz"]
print(len(poems_data))
# Step 3: Limit to the first 5 poems for testing
test_poems = poems_data

# Step 4: Extract data and save to CSV
csv_filename = "poems.csv"
with open(csv_filename, mode="w", encoding="utf-8", newline="") as csvfile:
    fieldnames = ["Title", "Author", "Content"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for poem_data in test_poems:
        poem = fetch_poem_data(poem_data)

        if poem is None or poem["content"] is None:
            print(f"Skipping poem: {poem_data.get('title')} - No content available.")
            continue

        writer.writerow({
            "Title": poem["title"],
            "Author": poem["author"],
            "Content": poem["content"]
        })

print(f"Saved all poems to {csv_filename}.")

4051
Skipping poem: Noc bezsenna (cykl) - No content available.
Saved all poems to poems.csv.


In [None]:
import pandas as pd
from transformers import pipeline
import torch
from torch.cuda.amp import autocast

# Check if CUDA is available
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

# Load the CSV file
csv_file = "poems.csv"  # Replace with your CSV filename
df = pd.read_csv(csv_file)
# test
df = df.iloc[:5]

# Comprehensive candidate labels
candidate_labels = [
    "smutek", "miłość", "natura",
    "wojna", "pokój", "wolność", "cierpienie", "radość",
    "samotność", "przyjaźń", "przemijanie",
    "życie", "śmierć", "tęsknota",
    "dzieciństwo", "ojczyzna", "wiara",
    "macierzyństwo", "poświęcenie", "przemiana",
    "zemsta", "przywiązanie"
]

def truncate_text(text, max_length=512):
    tokens = text.split()  # Split text into tokens (words)
    return " ".join(tokens[:max_length])  # Truncate and rejoin tokens


# Load the model and pipeline with GPU support
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
classifier = pipeline("zero-shot-classification", model=model_name, device=0)  # device=0 for GPU

# Function to classify a batch of texts
def classify_batch(batch_texts, classifier_pipeline, candidate_labels):
    truncated_texts = [truncate_text(text) for text in batch_texts]  # Apply truncation

    with autocast():
        results = classifier_pipeline(batch_texts, candidate_labels=candidate_labels, multi_label=True)
    batch_labels = []
    for result in results:
        # Filter labels with scores > 0.75
        filtered_labels = [
            label for label, score in zip(result["labels"], result["scores"]) if score > 0.75
        ]
        batch_labels.append(", ".join(filtered_labels))
    return batch_labels

# Batch size for RTX 3060 (adjust if needed)
batch_size = 16

# Prepare results storage
classified_poems = []

# Process in batches
for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i+batch_size]  # Get a batch of poems
    batch_texts = batch["Content"].tolist()

    print(f"Classifying batch {i // batch_size + 1} of {len(df) // batch_size + 1}...")

    # Classify the batch
    batch_results = classify_batch(batch_texts, classifier, candidate_labels)

    # Append results to classified_poems
    for idx, poem in batch.iterrows():
        poem_results = {
            "Title": poem["Title"],
            "Author": poem["Author"],
            "mDeBERTa-v3": batch_results[idx - i]  # Map back to batch results
        }
        classified_poems.append(poem_results)

# Convert results to a DataFrame
results_df = pd.DataFrame(classified_poems)

# Save the results to a new CSV file
results_csv = "classification_results_by_model.csv"
results_df.to_csv(results_csv, index=False)

print(f"Filtered classification results saved to {results_csv}.")

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [64]:
results_csv = "classification_results_by_model.csv"
results_df.reset_index(drop=True, inplace=True)
pd.set_option("display.max_colwidth", None)
results_df["mDeBERTa-v3"] = results_df["mDeBERTa-v3"].astype(str)


# results_df.to_csv(results_csv, index=False, encoding="utf-8")
results_df.columns = [str(col) for col in results_df.columns]

results_df.to_dict(orient="records")
results_df = pd.DataFrame(results_df)
results_df.columns = [str(col) for col in results_df.columns]

# Reset the index
results_df.reset_index(drop=True, inplace=True)

# Save to CSV
csv_filename = "classification_results_by_model.csv"
results_df.to_csv(csv_filename, index=False, encoding="utf-8")




AttributeError: 'Index' object has no attribute '_format_native_types'

In [47]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0))


CUDA available: True
Device name: NVIDIA GeForce RTX 3060
