In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from time import sleep
import pandas as pd
import random
import json
import gc
import os

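Selenium is used here because the poem content is **rendered into the HTML by JavaScript in the browser (client-side rendering)** after the initial page load. Beautiful Soup alone therefore can't grab the poem text: it only sees the HTML returned by the server, before that *asynchronous* rendering has happened.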

In [None]:
# Listing-page URL template; `page_number` is filled in by getPoemLink().
showAllPoemUrl = "https://www.poemhunter.com/explore/poems/lang-english/{page_number}/"
# Poem-page URL template; `poem_url` is the site-relative href collected from a listing page.
showPoemUrl = "https://www.poemhunter.com{poem_url}"

def writeFile(fileName, payload, permission="w"):
    """Write ``payload`` plus a trailing newline to ``./{fileName}.txt``.

    Args:
        fileName: File name without the ``.txt`` extension, relative to the
            current working directory.
        payload: Value to write; it is formatted with ``str()`` semantics.
        permission: Mode passed to ``open`` (``"w"`` overwrites, ``"a"`` appends).
    """
    txt_url = f"./{fileName}.txt"
    # Pin UTF-8 so non-ASCII text (poem slugs, error messages) is written the
    # same way on every platform instead of depending on the locale encoding.
    with open(txt_url, permission, encoding="utf-8") as file:
        file.write(f"{payload}\n")

def getPoemLink(minPage=1, maxPage=150, save=True):
    """Collect poem hrefs from the listing pages.

    Pages ``minPage`` .. ``maxPage - 1`` are fetched (``maxPage`` is
    exclusive). On each page, every ``div`` whose class attribute contains all
    of ``phBoxContinue``, ``txtc`` and ``purpleLink`` contributes the ``href``
    of its first ``<a>``.

    Args:
        minPage: First listing page to fetch.
        maxPage: Page number at which to stop (not fetched itself).
        save: When true, the links are also dumped to ``poem_links.json``.

    Returns:
        The list of hrefs in page order. If any page answers with a status
        other than 200, an empty list is returned immediately and nothing is
        saved.

    Raises:
        requests.exceptions.RequestException: on connection errors or when a
            page does not respond within the timeout.
    """
    REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever
    targetClasses = {"phBoxContinue", "txtc", "purpleLink"}

    def hasAllTargetClasses(classValue):
        # The filter is only satisfied by a value that, split on whitespace,
        # contains every target class.
        return bool(classValue) and targetClasses.issubset(classValue.split())

    poemLinks = []
    for pageNumber in range(minPage, maxPage):
        url = showAllPoemUrl.format(page_number=pageNumber)
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Failed to fetch page {pageNumber}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        for div in soup.find_all("div", class_=hasAllTargetClasses):
            anchor = div.find("a")
            if anchor and "href" in anchor.attrs:
                poemLinks.append(anchor["href"])

    if save:
        with open("poem_links.json", "w", encoding="utf-8") as f:
            json.dump(poemLinks, f)
    return poemLinks

def scrapePoem(name="", i=1, start=0, custom=False, customLists=[]):
    """Scrape poem texts with headless Chrome and save them as CSV chunks.

    Links are read from ``poem_links{name}.json``. Each poem page is opened
    with Selenium and the text of the first ``div.phContent.phcText`` element
    is collected. After every 100 links the collected texts are written to
    ``./poem{name}/poem_data{name}_{i}.csv`` and ``i`` is incremented; whatever
    is left at the end goes into one final chunk. A page that has no matching
    element, or that fails on every attempt, contributes no row.

    Args:
        name: Suffix shared by the links file, the output folder and the CSV names.
        i: Number of the first CSV chunk to write.
        start: Index of the first link to scrape (reset to 0 when ``custom``).
        custom: When true, scrape only the links at the indices in ``customLists``.
        customLists: Indices into the link list; used only when ``custom`` is true.

    Returns:
        None. Results are written to disk; progress is printed and also sent
        to ``log`` / ``fail`` through ``writeFile``.
    """
    COBA_LAGI = 3      # attempts per poem before it is recorded as failed
    CHUNK_SIZE = 100   # links per CSV chunk

    # Read the link list before launching Chrome so that a missing or broken
    # links file fails fast without starting a browser.
    with open(f"poem_links{name}.json", "r", encoding="utf-8") as f:
        poemsUrl = json.load(f)

    if custom:
        poemsUrl = [poemsUrl[customList] for customList in customLists]
        start = 0

    os.makedirs(f"./poem{name}", exist_ok=True)

    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.media_stream": 2,
    }
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-software-rasterizer",
        "--ignore-certificate-errors",
        "--ignore-ssl-errors",
        "--disable-setuid-sandbox",
        "--disable-machine-learning-model-download",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("prefs", prefs)
    service = Service(r".\chromedriver.exe")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    data = []
    again = True  # True while something still has to go into a final chunk
    try:
        for idx, poem_url in enumerate(poemsUrl[start:]):
            fetched = False
            for coba in range(COBA_LAGI):
                try:
                    print(f"Scrape {poem_url} index ke-{start+idx}")
                    writeFile("log", f"Scrape {poem_url} index ke-{start+idx}", permission="w")
                    driver.get(showPoemUrl.format(poem_url=poem_url))
                    divs = driver.find_elements(By.CSS_SELECTOR, "div.phContent.phcText")
                    if divs:
                        # Only the first matching element is kept.
                        data.append(divs[0].text)
                    fetched = True
                    break
                except Exception as e:
                    print(f"Coba lagi ke-{coba+1} untuk poem {poem_url}. Error : {e}")
                    sleep(round(random.uniform(3, 7)))
                    if coba+1 == COBA_LAGI:
                        writeFile("fail", f"Scrape {poem_url} index ke-{start+idx}", "a")

            # Sleep briefly, or take a long break after every 100 links.
            # The checkpoint is tied to the link index, not to whether this
            # poem succeeded, so chunk numbers stay aligned with link indices
            # and a failed write is never retried by re-fetching the page.
            if (idx+1) % CHUNK_SIZE == 0:
                durasi = round(random.uniform(120, 240))
                print(f"Sleep checkpoint panjang selama {durasi} detik")
                print("Banyak data adalah : ", len(data))
                pd.Series(data, dtype="object").to_csv(f"./poem{name}/poem_data{name}_{i}.csv", index=False)
                sleep(durasi)
                data = []
                gc.collect()
                i += 1
                again = False
            else:
                if fetched:
                    # Failed poems already slept inside the retry loop.
                    sleep(round(random.uniform(3, 7)))
                again = True

            # Memory cleanup between poems.
            driver.delete_all_cookies()
            driver.execute_script("window.localStorage.clear();")
            driver.execute_script("window.sessionStorage.clear();")
            gc.collect()

        if again:
            print("Banyak data adalah : ", len(data))
            pd.Series(data, dtype="object").to_csv(f"./poem{name}/poem_data{name}_{i}.csv", index=False)
    finally:
        # Always shut the browser down, even when an error escapes the loop,
        # so repeated calls do not pile up Chrome processes.
        driver.quit()
    
def getAllScrape(name='', custom=False, customList=[]):
    """Run ``scrapePoem`` repeatedly, resuming after the last saved CSV chunk.

    The resume position is derived from the highest chunk number found in
    ``./poem{name}`` (files ending in ``csv`` whose name ends in ``_<number>``):
    chunk ``k`` is taken to mean that the first ``k * 100`` links are done.

    Args:
        name: Suffix shared by the links file, the output folder and the CSV names.
        custom: Forwarded to ``scrapePoem``.
        customList: Forwarded to ``scrapePoem`` as ``customLists``.

    Returns:
        None.
    """
    outputDir = f"./poem{name}"
    # A first run has no output folder yet; create it instead of crashing in listdir.
    os.makedirs(outputDir, exist_ok=True)

    while True:
        with open(f"poem_links{name}.json", "r", encoding="utf-8") as f:
            maxLen = len(json.load(f))
        print(maxLen)

        numberFiles = [
            int(csvFile.split('_')[-1].split('.')[0])
            for csvFile in os.listdir(outputDir)
            if csvFile.endswith("csv")
        ]
        lastChunk = max(numberFiles, default=0)
        scraped = lastChunk * 100

        # ">=" rather than "==": the last chunk is usually partial, so the
        # number of links is rarely an exact multiple of 100. With "==" the
        # loop never ended and kept producing empty chunks.
        if scraped >= maxLen:
            break

        try:
            print(f"i={lastChunk+1}, start={scraped}")
            scrapePoem(name, i=lastChunk+1, start=scraped, custom=custom, customLists=customList)
            print("Done")
        except Exception as e:
            # Best effort: report the error and resume from the last saved chunk.
            # NOTE(review): a failure that never goes away makes this loop spin
            # forever — consider capping the number of consecutive failures.
            print(e)
            continue

        if custom:
            # A custom run always scrapes the same fixed list from its start,
            # so repeating it after success would only duplicate the data.
            break

In [None]:
# getAllScrape() resumes by index into poem_links.json, so the link list must
# stay stable between runs: build it only when it does not exist yet.
if not os.path.exists("poem_links.json"):
    getPoemLink()
getAllScrape()

i=41, start=4000
Scrape /poem/archy-s-song-from-charles-the-first-a-widow-bird-sate-mourning-for-her-love/ index ke-4000
Scrape /poem/a-jet-ring-sent/ index ke-4001
Scrape /poem/don-t-drive-me-away/ index ke-4002
Scrape /poem/death-of-a-cockroach/ index ke-4003
Scrape /poem/never-again/ index ke-4004
Scrape /poem/the-guppy/ index ke-4005
Scrape /poem/foster-the-light/ index ke-4006
Scrape /poem/a-certain-kind-of-holy-men/ index ke-4007
Scrape /poem/the-artist/ index ke-4008
Scrape /poem/fire-flowers/ index ke-4009
Scrape /poem/a-valediction-of-weeping-2/ index ke-4010
Scrape /poem/from-the-philosopher-s-stone/ index ke-4011
Scrape /poem/cinema-calendar-of-the-abstract-heart-09/ index ke-4012
Scrape /poem/how-sleep-the-brave/ index ke-4013
Scrape /poem/pear-tree/ index ke-4014
Scrape /poem/the-mad-gardener-s-song-2/ index ke-4015
Scrape /poem/ballad-of-dead-friends/ index ke-4016
Scrape /poem/i-held-a-shelley-manuscript/ index ke-4017
Scrape /poem/1914-iii-the-dead/ index ke-4018
Scrape