In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from supabase import Client, create_client
from datetime import datetime
from dateutil import parser
from time import sleep
import dotenv
import pandas as pd
import random
import json
import gc
import os

# Load variables from a .env file (if one is found) into the process environment.
dotenv.load_dotenv()

# Supabase credentials; each value is None when its variable is not set.
url = os.environ.get('SUPABASE_URL')
key = os.environ.get('SUPABASE_KEY')

# Client shared by the cells below for database inserts.
supabase = create_client(url, key)

Selenium is used here because the poem content is **rendered client-side by JavaScript** after the initial HTML has been delivered. Beautiful Soup alone can't grab the poem data, because the content is loaded *asynchronously* and is not present in the raw HTTP response.

In [2]:
# Host shared by both URL templates; the templates are filled in with str.format().
_POEMHUNTER_ROOT = "https://www.poemhunter.com"

# Listing page of English poems; `page_number` selects the page.
showAllPoemUrl = _POEMHUNTER_ROOT + "/explore/poems/lang-english/{page_number}/"

# Single poem page; `poem_url` is a site-relative href such as "/poem/phenomenal-woman/".
showPoemUrl = _POEMHUNTER_ROOT + "{poem_url}"

def writeFile(fileName, payload, permission="w"):
    """Write ``payload`` followed by a newline to ``./<fileName>.txt``.

    Args:
        fileName: Base name of the target file, without the ``.txt`` extension.
        payload: Value to write; it is rendered through an f-string, so any
            object with a string representation is accepted.
        permission: Text mode handed to ``open``. ``"w"`` (the default)
            overwrites the file, ``"a"`` appends to it.
    """
    txt_url = f"./{fileName}.txt"
    # Pin the encoding so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(txt_url, permission, encoding="utf-8") as file:
        file.write(f"{payload}\n")

def getPoemLink(minPage=1, maxPage=151, save=True, timeout=30):
    """Collect poem hrefs from the English poem listing pages.

    Args:
        minPage: First listing page to fetch (inclusive).
        maxPage: Page number to stop before (exclusive, as in ``range``).
        save: When true, the collected links are also written as JSON to
            ``poem_links.json`` in the working directory.
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up and raises.

    Returns:
        The list of hrefs in page order. If any page answers with a status
        other than 200 the function prints a message and returns an empty
        list immediately; nothing is saved in that case.
    """
    poemLinks = []
    targetClasses = {"phBoxContinue", "txtc", "purpleLink"}

    def hasTargetClasses(classValue):
        # Accept a class string only when it carries every target class.
        return bool(classValue) and targetClasses.issubset(classValue.split())

    for i in range(minPage, maxPage):
        url = showAllPoemUrl.format(page_number=i)
        # Without a timeout a stalled connection would block this loop forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch page {i}")
            return []
        soup = BeautifulSoup(response.text, "html.parser")
        for div in soup.find_all("div", class_=hasTargetClasses):
            # Only the first anchor inside each matching div is used.
            anchor = div.find("a")
            if anchor and "href" in anchor.attrs:
                poemLinks.append(anchor["href"])

    if save:
        with open("poem_links.json", "w", encoding="utf-8") as f:
            json.dump(poemLinks, f)
    return poemLinks

def _buildChromeDriver():
    """Start a headless Chrome session driven by the local ``./chromedriver``."""
    prefs = {
        # 2 = block in Chrome's content settings (images / media stream).
        "profile.managed_default_content_settings.images": 2,
        "profile.managed_default_content_settings.media_stream": 2,
    }
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-software-rasterizer",
        "--ignore-certificate-errors",
        "--ignore-ssl-errors",
        "--disable-setuid-sandbox",
        "--disable-machine-learning-model-download",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("prefs", prefs)
    service = Service(r"./chromedriver")
    return webdriver.Chrome(service=service, options=chrome_options)


def scrapePoem(name="", i=1, start=0, custom=False, customLists=[], stop=3):
    """Scrape poem pages with Selenium and insert each one into Supabase.

    Links are read from ``poem_links{name}.json``. Every poem is attempted up
    to five times; a poem whose attempts all fail is appended to ``fail.txt``.
    The poem currently being processed is written to ``log.txt`` (overwritten
    on each attempt).

    Args:
        name: Suffix inserted into the links file name.
        i: Unused; kept so existing callers that pass it keep working.
        start: Index of the first link to process. Forced to 0 when ``custom``
            is true.
        custom: When true, only the links at the indices listed in
            ``customLists`` are considered.
        customLists: Indices into the loaded link list, used when ``custom``
            is true. The list is only read, never modified.
        stop: Exclusive end index of the slice of links to process. The
            default of 3 processes at most the first three links; pass
            ``None`` to process every link from ``start`` onwards.
    """
    COBA_LAGI = 5  # maximum number of attempts per poem

    # Read the links before launching Chrome so that a missing or broken file
    # cannot leave a browser process behind.
    with open(f"poem_links{name}.json", "r", encoding="utf-8") as f:
        poemsUrl = json.load(f)

    if custom:
        poemsUrl = [poemsUrl[customList] for customList in customLists]
        start = 0

    driver = _buildChromeDriver()
    try:
        for idx, poem_url in enumerate(poemsUrl[start:stop]):
            url = showPoemUrl.format(poem_url=poem_url)
            for coba in range(COBA_LAGI):
                try:
                    print(f"Scrape {poem_url} index ke-{start+idx}")
                    writeFile("log", f"Scrape {poem_url} index ke-{start+idx}", permission="w")
                    driver.get(url)

                    # Indexing [0] raises IndexError when an element is
                    # missing, which sends the poem through the retry path.
                    author = driver.find_elements(By.CSS_SELECTOR, "div.phpdAuthor")[0].text
                    title = driver.find_elements(By.CSS_SELECTOR, "div.phPageDetailsTitle")[0].text
                    poem_data = driver.find_elements(By.CSS_SELECTOR, "div.phContent.phcText")[0].text
                    date_created = driver.find_elements(By.CSS_SELECTOR, "div.phPageDate")[0].text
                    rating = driver.find_elements(By.CSS_SELECTOR, "div.phRate > span.rate")[0].text
                    date_scraped = datetime.now().date().isoformat()

                    data = {
                        'url': url,
                        'author': author,
                        'title': title,
                        'poem': poem_data,
                        'date_created': date_created,
                        'rating': rating,
                        'date_scraped': date_scraped,
                    }

                    respond = supabase.table('poem_data').insert(data).execute()
                except Exception as e:
                    print(f"Coba lagi ke-{coba+1} untuk poem {poem_url}. Error : {e}")
                    sleep(round(random.uniform(5, 10)))
                    # Record the poem as failed only once every attempt has
                    # been used up.
                    if coba + 1 == COBA_LAGI:
                        writeFile("fail", f"Scrape {poem_url} index ke-{start+idx}", "a")
                else:
                    # The row is stored. Everything below runs outside the try
                    # block so an error here cannot trigger a retry and a
                    # duplicate insert.
                    print(f'{respond} - {poem_url} index ke-{start+idx}')

                    # Short pause after each poem, long pause after every
                    # 100th poem of this run.
                    if (idx + 1) % 100 == 0:
                        durasi = round(random.uniform(60, 120))
                        print(f"Sleep checkpoint panjang selama {durasi} detik")
                        sleep(durasi)
                        gc.collect()
                    else:
                        sleep(round(random.uniform(2, 5)))
                    break

            # Reset browser state between poems.
            driver.delete_all_cookies()
            driver.execute_script("window.localStorage.clear();")
            driver.execute_script("window.sessionStorage.clear();")
            gc.collect()
    finally:
        # Always shut Chrome down, even when an error escapes the loop.
        driver.quit()
    
def getAllScrape(name='', custom=False, customList=[]):
    """Resume scraping after the highest-numbered CSV batch, retrying on errors.

    Progress is inferred from the directory ``./poem{name}``: for every file
    name ending in "csv", the integer between the last "_" and the following
    "." is read as a batch number, and each batch is taken to cover 100 links.
    With no CSV files the batch number is 0.

    The function returns without scraping when the number of links in
    ``poem_links{name}.json`` equals exactly ``highest batch * 100``.
    Otherwise it calls ``scrapePoem`` starting at that offset and returns once
    the call finishes. If ``scrapePoem`` raises, the error is printed and the
    whole check is repeated after a short random pause.

    Args:
        name: Suffix shared by the links file and the CSV directory.
        custom: Forwarded to ``scrapePoem``.
        customList: Forwarded to ``scrapePoem`` as ``customLists``; only read.
    """
    while True:
        with open(f"poem_links{name}.json", "r", encoding="utf-8") as f:
            maxLen = len(json.load(f))
        print(maxLen)

        numberFiles = [
            int(csvFile.split('_')[-1].split('.')[0])
            for csvFile in os.listdir(f"./poem{name}")
            if csvFile.endswith("csv")
        ]
        lastBatch = max(numberFiles, default=0)
        gc.collect()

        if maxLen == lastBatch * 100:
            break

        print(f"i={lastBatch+1}, start={lastBatch*100}")
        try:
            scrapePoem(name, i=lastBatch+1, start=lastBatch*100, custom=custom, customLists=customList)
        except Exception as e:
            print(e)
            # Back off before retrying so a persistent failure does not turn
            # this loop into a busy spin.
            sleep(round(random.uniform(5, 10)))
            continue
        print("Done")
        break

In [None]:
# Fetch listing pages 1-110 (maxPage is exclusive) and save the collected hrefs to poem_links.json.
getPoemLink(maxPage=111)

In [4]:
# Load the saved link list into a DataFrame and print its summary to check how many links were collected.
df = pd.read_json('poem_links.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       11000 non-null  object
dtypes: object(1)
memory usage: 86.1+ KB


In [3]:
# Run the scraper with its default arguments: links come from poem_links.json, starting at index 0.
# With the defaults only the first three links are processed.
scrapePoem()

Scrape /poem/phenomenal-woman/ index ke-0
data=[{'id': 3, 'url': 'https://www.poemhunter.com/poem/phenomenal-woman/', 'author': 'Maya Angelou', 'poem': "Pretty women wonder where my secret lies.\nI'm not cute or built to suit a fashion model's size\nBut when I start to tell them,\nThey think I'm telling lies.\nI say,\nIt's in the reach of my arms\nThe span of my hips,\nThe stride of my step,\nThe curl of my lips.\nI'm a woman\nPhenomenally.\nPhenomenal woman,\nThat's me.\n\nI walk into a room\nJust as cool as you please,\nAnd to a man,\nThe fellows stand or\nFall down on their knees.\nThen they swarm around me,\nA hive of honey bees.\nI say,\nIt's the fire in my eyes,\nAnd the flash of my teeth,\nThe swing in my waist,\nAnd the joy in my feet.\nI'm a woman\nPhenomenally.\nPhenomenal woman,\nThat's me.\n\nMen themselves have wondered\nWhat they see in me.\nThey try so much\nBut they can't touch\nMy inner mystery.\nWhen I try to show them\nThey say they still can't see.\nI say,\nIt's in 