In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def scrape_poems(url, timeout=30):
    """
    Scrape poem links, titles, dates, and the first line of the post from the given URL.

    Each ``<article>`` on the page contributes one row, provided it has a
    bookmark link, a published ``<time>`` element, and at least one ``<p>``
    inside its ``entry-content`` div. Articles missing any of these are skipped.

    Args:
        url (str): URL of the webpage to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        DataFrame: Pandas DataFrame with the columns ``title``, ``link``,
        ``date`` and ``first_line``. The columns are present even when no
        poems were found.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    columns = ["title", "link", "date", "first_line"]
    poems = []
    for post in soup.find_all("article"):
        title_element = post.find("a", rel="bookmark")
        # A CSS selector matches the two classes regardless of their order or
        # of additional classes (e.g. "entry-date published updated"); the
        # exact-string form class_="entry-date published" would miss those.
        date_element = post.select_one("time.entry-date.published")
        entry_content = post.find("div", class_="entry-content")
        first_line_element = (
            entry_content.find("p") if entry_content is not None else None
        )

        if title_element is None or date_element is None or first_line_element is None:
            continue

        link = title_element.get("href")
        date = date_element.get("datetime")
        if not link or not date:
            # Skip malformed markup rather than aborting the whole scrape.
            continue

        poems.append(
            {
                "title": title_element.text,
                "link": link,
                "date": date,
                "first_line": first_line_element.text.strip(),
            }
        )

    # Passing the columns keeps the schema (and CSV header) stable on empty results.
    return pd.DataFrame(poems, columns=columns)


# Category listing page whose <article> entries are scraped below.
url = "https://beatinpaths.com/category/poetry/"

# Scrape the listing into a DataFrame and write it to poems.csv in the
# current working directory; index=False omits the row-index column.
df = scrape_poems(url)
df.to_csv("poems.csv", index=False)


In [4]:
def scrape_links(url, timeout=30):
    """
    Scrape post links from the given URL.

    Collects the ``href`` of the bookmark link of every ``<article>`` on the
    page. Articles without a bookmark link, or whose link has no ``href``,
    are skipped.

    Args:
        url (str): URL of the webpage to scrape.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        DataFrame: Pandas DataFrame with a single ``link`` column. The column
        is present even when no links were found.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout requests can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero rows.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = []
    for post in soup.find_all('article'):
        link_element = post.find('a', rel='bookmark')
        if link_element is None:
            continue

        # .get() avoids a KeyError on an anchor that lacks an href attribute.
        link = link_element.get('href')
        if link:
            links.append({'link': link})

    # Passing the columns keeps the schema (and CSV header) stable on empty results.
    return pd.DataFrame(links, columns=['link'])

# Category listing page whose post links are collected below.
url = 'https://beatinpaths.com/category/poetry/'

# Scrape the links into a DataFrame and write it to links.csv in the current
# working directory; index=False omits the row-index column.
# NOTE: this rebinds the notebook-level name `df` to the links frame.
df = scrape_links(url)
df.to_csv('links.csv', index=False)

In [6]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.16.0-py3-none-any.whl.metadata (6.9 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.23.2-py3-none-any.whl.metadata (4.9 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting pysocks!=1.5.7,<2.0,>=1.5.6 (from urllib3[socks]<3,>=1.26->selenium)
  Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)
Downloading selenium-4.16.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m0:01[0m00:01[0m
[?25hDownloading 

In [5]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def scrape_poems(url, driver_path, pause=2):
    """
    Scrape poem links, titles, dates, and the first line of the post from the given URL.

    The listing page is opened in Chrome and scrolled to the bottom repeatedly
    until the document height stops growing, so lazily loaded posts are
    included. Each poem page is then visited to read its first paragraph.

    Args:
        url (str): URL of the webpage to scrape.
        driver_path (str): Path to the WebDriver executable. If empty or
            None, Selenium is left to locate a driver on its own.
        pause (float): Seconds to wait after each scroll / page load so the
            browser has time to render new content.

    Returns:
        DataFrame: Pandas DataFrame with the columns ``title``, ``link``,
        ``date`` and ``first_line``. ``first_line`` is None when the poem
        page has no paragraph inside its ``entry-content`` div. The columns
        are present even when no poems were found.
    """
    # Selenium 4 takes the driver location through a Service object; passing
    # the path positionally is interpreted as `options` and fails.
    service = Service(executable_path=driver_path) if driver_path else Service()
    driver = webdriver.Chrome(service=service)

    columns = ['title', 'link', 'date', 'first_line']
    poems = []
    try:
        driver.get(url)

        # Scroll down to the bottom of the page until no more new poems are loaded
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)  # Wait for the new poems to load

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Snapshot the fully scrolled listing; the loop below works on this
        # parsed copy, so the browser never needs to navigate back to it.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for post in soup.find_all('article'):
            title_element = post.find('a', rel='bookmark')
            # A CSS selector matches the two classes regardless of order or of
            # additional classes (e.g. "entry-date published updated").
            date_element = post.select_one('time.entry-date.published')
            if title_element is None or date_element is None:
                continue

            link = title_element.get('href')
            date = date_element.get('datetime')
            if not link or not date:
                # Skip malformed markup rather than aborting the whole scrape.
                continue

            # Navigate to the poem page and scrape the first line
            driver.get(link)
            time.sleep(pause)  # Wait for the poem page to load
            poem_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Guard against pages without an entry-content div; chaining
            # .find() on None would raise AttributeError.
            content = poem_soup.find('div', class_='entry-content')
            first_line_element = content.find('p') if content is not None else None
            first_line = (
                first_line_element.text.strip() if first_line_element is not None else None
            )

            poems.append({
                'title': title_element.text,
                'link': link,
                'date': date,
                'first_line': first_line,
            })
    finally:
        # Always shut the browser down, even if scraping raised part-way.
        driver.quit()

    # Passing the columns keeps the schema (and CSV header) stable on empty results.
    return pd.DataFrame(poems, columns=columns)

# Path to the WebDriver executable.
# NOTE: this is a placeholder value; replace it with the real location of the
# driver binary on this machine before running the cell.
driver_path = '/path/to/your/driver'

# Category listing page whose posts are scraped below.
url = 'https://beatinpaths.com/category/poetry/'

# Scrape the poems with the browser-driven scraper and write them to
# poems.csv in the current working directory (index=False omits the row
# index). This is the same filename the requests-based cell writes, so any
# existing poems.csv is overwritten.
df = scrape_poems(url, driver_path)
df.to_csv('poems.csv', index=False)

ModuleNotFoundError: No module named 'selenium'