In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import time

# --- Selenium WebDriver bootstrap -------------------------------------------
# Location of the ChromeDriver executable on this machine; adjust for your setup.
chrome_driver_path = "C:/Users/josep/Downloads/chromedriver-win64/chromedriver-win64/chromedriver.exe"
service = Service(chrome_driver_path)

# Headless mode: Chrome runs without opening a visible browser window.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Single shared browser session, used by the scraping function defined below.
driver = webdriver.Chrome(service=service, options=chrome_options)

def scrape_poems_by_poet_selenium(poet_url, poet_name, output_folder):
    """Scrape every poem listed for one poet and save each as a UTF-8 text file.

    Starting at ``poet_url``, the function walks the paginated listing. For
    each ``post-bodycopy`` container on a listing page it either opens the
    poem's own page (when the container has a ``more-link`` anchor) and saves
    the first ``post-bodycopy`` text found there, or saves the text visible in
    the listing when no such link exists. Pagination follows the
    ``nextpostslink`` anchor until none is present.

    Files are written to ``<output_folder>/<poet_name>/`` and are named
    ``<poet_name>_poem_<page>_<index>.txt`` (both numbers are 1-based).

    Uses the module-level ``driver`` WebDriver instance.

    Args:
        poet_url: URL of the first listing page for the poet.
        poet_name: Used for the output sub-folder and the file-name prefix.
        output_folder: Parent directory; the poet's folder is created inside it.

    Returns:
        None. Progress and per-poem errors are reported with ``print``.
    """
    poet_folder = os.path.join(output_folder, poet_name)
    os.makedirs(poet_folder, exist_ok=True)

    page_number = 1

    while True:
        print(f"\nScraping page {page_number}: {poet_url}")
        driver.get(poet_url)

        poem_containers = driver.find_elements(By.CLASS_NAME, 'post-bodycopy')
        print(f"Found {len(poem_containers)} poem containers on page {page_number}.")

        if not poem_containers:
            print("No poems found. Check the HTML structure or class names.")
            break

        # Snapshot everything needed from the listing page *before* navigating
        # anywhere. WebElements become stale once the browser leaves the page,
        # so only plain strings (URLs and text) are kept.
        poems = []
        for container in poem_containers:
            # find_elements returns an empty list instead of raising, so a
            # missing link is not confused with any other failure.
            more_links = container.find_elements(By.CLASS_NAME, 'more-link')
            full_url = more_links[0].get_attribute('href') if more_links else None
            poems.append((full_url, container.text.strip()))

        # Resolve the next listing page now as well: a failure while fetching
        # a poem can no longer leave the browser on the wrong page and end
        # pagination early.
        next_links = driver.find_elements(By.CLASS_NAME, 'nextpostslink')
        next_page_url = next_links[0].get_attribute('href') if next_links else None

        for i, (full_url, snippet) in enumerate(poems, start=1):
            # Per-poem boundary: one failing poem is reported and skipped so
            # the rest of the page is still scraped.
            try:
                if full_url:
                    print(f"Found 'Daugiau' link in poem {i} on page {page_number}. Opening full poem...")
                    driver.get(full_url)
                    # Wait for the poem body rather than assuming it is
                    # already in the DOM right after navigation.
                    body = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'post-bodycopy'))
                    )
                    poem_text = body.text.strip()
                else:
                    print(f"No 'Daugiau' link in poem {i}. Trying to extract visible text.")
                    poem_text = snippet

                file_name = f"{poet_name}_poem_{page_number}_{i}.txt"
                with open(os.path.join(poet_folder, file_name), 'w', encoding='utf-8') as f:
                    f.write(poem_text)

                print(f"Saved full poem {i} on page {page_number}")

            except Exception as e:
                print(f"Error extracting poem {i} on page {page_number}: {e}")

        if not next_page_url:
            print(f"No more pages found. Scraping completed for {poet_name}.")
            break

        poet_url = next_page_url
        page_number += 1
        time.sleep(2)  # A short delay to prevent overwhelming the server

    print(f"Poems by {poet_name} have been saved to {poet_folder}")

# Example usage
poet_name = "Maironis"
poet_url = "https://www.eilerasciai.lt/autoriai/maironis/"  # First listing page for the poet
output_folder = "C:/Users/josep/OneDrive/Desktop/Erdos/poezija/poezija/"  # Parent folder for the poet's sub-folder

try:
    scrape_poems_by_poet_selenium(poet_url, poet_name, output_folder)
finally:
    # Always close the WebDriver, even if scraping raised, so no headless
    # Chrome / chromedriver process is left running in the background.
    driver.quit()


Scraping page 1: https://www.eilerasciai.lt/autoriai/maironis/
Found 10 poem containers on page 1.
Found 'Daugiau' link in poem 1 on page 1. Clicking to expand...
Saved full poem 1 on page 1
Found 'Daugiau' link in poem 2 on page 1. Clicking to expand...
Saved full poem 2 on page 1
Found 'Daugiau' link in poem 3 on page 1. Clicking to expand...
Saved full poem 3 on page 1
Found 'Daugiau' link in poem 4 on page 1. Clicking to expand...
Saved full poem 4 on page 1
Found 'Daugiau' link in poem 5 on page 1. Clicking to expand...
Saved full poem 5 on page 1
Found 'Daugiau' link in poem 6 on page 1. Clicking to expand...
Saved full poem 6 on page 1
Found 'Daugiau' link in poem 7 on page 1. Clicking to expand...
Saved full poem 7 on page 1
Found 'Daugiau' link in poem 8 on page 1. Clicking to expand...
Saved full poem 8 on page 1
Found 'Daugiau' link in poem 9 on page 1. Clicking to expand...
Saved full poem 9 on page 1
Found 'Daugiau' link in poem 10 on page 1. Clicking to expand...
Saved f