# Get listing pages

In [74]:
import os
import re
import time

from datetime import date
from seleniumwire import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

import chromedriver_binary

## Set up browser and driver

In [30]:
def open_browser(headless=False):
    """
    Launch an automated Chrome window with the usual automation tell-tales disabled.

    Args:
        headless: when truthy, Chrome is started with the ``--headless`` switch.

    Returns:
        The ``webdriver.Chrome`` instance created with these options.
    """
    # command-line switches, in the order they are handed to Chrome
    switches = ["start-maximized"]
    if headless:
        switches.append("--headless")
    switches.append('--disable-blink-features=AutomationControlled')

    # experimental options that remove the remaining signs of an automated browser
    stealth_options = (
        ("excludeSwitches", ["enable-automation"]),
        ('useAutomationExtension', False),
    )

    chrome_options = webdriver.ChromeOptions()
    for switch in switches:
        chrome_options.add_argument(switch)
    for option_name, option_value in stealth_options:
        chrome_options.add_experimental_option(option_name, option_value)

    # open the browser with the assembled options
    return webdriver.Chrome(options=chrome_options)

In [124]:
class CPLSeleniumManager:
    """
    Owns the browser session and the output directories used by the scrape.

    Creating an instance makes the output directories, opens a browser via
    ``open_browser`` and loads the archive listing page.
    """

    # first listing page of the archive, ordered by date
    ARCHIVE_URL = 'https://www.creepypasta.com/archive/?_orderby=date'

    def __init__(self):
        self.setup_fetch_dirs()
        # visit the page
        self.driver = self.open_driver()
        self.driver.get(self.ARCHIVE_URL)

    def open_driver(self):
        """
        Shut down the browser this manager already holds (if any) and open a new one.

        Returns:
            The newly opened driver, which is also stored on ``self.driver``.
        """
        # Look the old driver up on the instance: a bare local name is never
        # bound at this point, so nothing would ever get closed.
        existing = getattr(self, "driver", None)
        if existing is None:
            print("no webdrivers open")
        else:
            try:
                # quit() ends the whole session, not just the current window
                existing.quit()
            except Exception as err:
                # best effort: a browser that already died must not block a restart
                print(f"could not close previous webdriver: {err}")
            self.driver = None

        self.driver = open_browser()
        return self.driver

    def setup_fetch_dirs(self):
        """Create the output directories (if missing) and remember their paths."""
        # create dirs for page_source
        # this is where all files will be saved
        self.list_pages_dir = "list-pages/"
        os.makedirs(self.list_pages_dir, exist_ok=True)
        self.story_pages_dir = "story-pages/"
        os.makedirs(self.story_pages_dir, exist_ok=True)
        self.test_dir = "test/"
        os.makedirs(self.test_dir, exist_ok=True)

    def close_popups(self):
        """Run the popup and ad closers twice, pausing 2 seconds after each pass."""
        for _ in range(2):
            CPLSeleniumAdHandler.close_first_popups(self.driver)
            CPLSeleniumAdHandler.close_some_ads(self.driver)
            time.sleep(2)

## Close ads that block target elements

In [125]:
class CPLSeleniumAdHandler:
    """
    Best-effort dismissal of popups and ad overlays that sit on top of the page.

    Every closer tries to find one element by XPath and click it. When that
    fails a short message is printed and processing continues.
    """

    @classmethod
    def _click_if_present(cls, driver, xpath, missing_message):
        """
        Click the element located by ``xpath``.

        Args:
            driver: object exposing Selenium's ``find_element``.
            xpath: XPath of the element to click.
            missing_message: text printed when locating or clicking fails.

        Returns:
            True when the click went through, False otherwise.
        """
        try:
            driver.find_element(By.XPATH, xpath).click()
        except Exception:
            # Deliberately broad but not bare: a missing or covered element is
            # expected, while KeyboardInterrupt/SystemExit must still propagate.
            print(missing_message)
            return False
        return True

    @classmethod
    def close_first_popups(cls, driver):
        """Dismiss the "sign up for updates" slidedown if it is showing."""
        # TODO: accept/reject popup
        # todo: wait until load
        cls._click_if_present(
            driver,
            '//*[@id="onesignal-slidedown-cancel-button"]',
            "no signup popup element on page",
        )

    @classmethod
    def close_some_ads(cls, driver):
        """Try the close controls of the top ad and the two bottom-rail ads, in that order."""
        ad_closers = (
            ('//*[@id="pw-close-btn"]', "no pg top ad element on page"),
            ('//*[@id="pw-oop-bottom_rail"]/div[2]', "no pg btm ad element on page"),
            ('//*[@id="pw-oop-bottom_rail2"]/div[2]', "no pg btm ad 2 element on page"),
        )
        for xpath, missing_message in ad_closers:
            cls._click_if_present(driver, xpath, missing_message)

## Page Nav Functions

In [133]:
class CPLSeleniumPageNav:
    """Pagination and page-saving helpers for the archive listing page."""

    # positional XPaths into the pagination list of the archive page
    NEXT_PAGE_XPATH = '//*[@id="post-40339"]/div/div[1]/div[4]/ul/li[8]/a'
    FIRST_PAGE_XPATH = '//*[@id="post-40339"]/div/div[1]/div[4]/ul/li[1]/a'

    @classmethod
    def _scroll_to(cls, driver, element):
        """Scroll ``element`` into view, then nudge the page back up by 50px."""
        # need to scroll down to click
        # otherwise action will be caught by iframe/ads
        # arguments[0] is the element handed to execute_script, so the element
        # that gets scrolled to is exactly the one that will be clicked
        driver.execute_script("arguments[0].scrollIntoView();", element)
        driver.execute_script("window.scrollBy(0, -50);")

    @classmethod
    def go_to_next_page(cls, driver):
        """
        Click the pagination link for the next page.

        Returns:
            True when ``driver.current_url`` differs after the click, telling
            the caller to use the page and keep looping; False otherwise.
        """
        next_page_element = driver.find_element(By.XPATH, cls.NEXT_PAGE_XPATH)
        cls._scroll_to(driver, next_page_element)

        # checking to see if clicking does anything, or if button is already "active"
        # todo: refactor to exclude already "active" (red) elements earlier?
        current_url = driver.current_url  # before click
        print(current_url)
        next_page_element.click()
        print(driver.current_url)

        return current_url != driver.current_url

    @classmethod
    def reset_to_first_page(cls, driver):
        """Click the first item of the pagination list."""
        first_page_carrots = driver.find_element(By.XPATH, cls.FIRST_PAGE_XPATH)
        # scroll to the link that is about to be clicked, so this also works
        # when the next-page link is not present
        cls._scroll_to(driver, first_page_carrots)
        first_page_carrots.click()

    @classmethod
    def save_current_page(cls, driver, dest_dir):
        """
        Write ``driver.page_source`` to an HTML file inside ``dest_dir``.

        The file is named ``<today>_<query string of the current URL>.html``
        and its path is printed before writing.

        Args:
            driver: object exposing ``current_url`` and ``page_source``.
            dest_dir: existing directory, with or without a trailing slash.
        """
        page_key = driver.current_url.split('?')[-1]
        # a URL without a query string comes through whole; keep its
        # separators out of the file name
        page_key = re.sub(r'[\\/:]', '_', page_key)
        dest = os.path.join(dest_dir, f"{date.today()}_{page_key}.html")
        print(dest)
        # explicit encoding: page source may hold characters the platform
        # default codec cannot encode
        with open(dest, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

In [140]:
# testing: advance the open browser (cplm, created in the "Loop through" section) by one listing page
# prints the URL before and after the click; the returned bool says whether it changed
CPLSeleniumPageNav.go_to_next_page(cplm.driver)

https://www.creepypasta.com/archive/?_orderby=date&_page=2
https://www.creepypasta.com/archive/?_orderby=date&_page=3


True

In [136]:
# testing: click the first item of the pagination list
CPLSeleniumPageNav.reset_to_first_page(cplm.driver)

In [141]:
# testing: write the current page source into the test directory (the destination path is printed)
CPLSeleniumPageNav.save_current_page(cplm.driver, cplm.test_dir)

test/2023-07-25__orderby=date&_page=3.html


# Loop through and save all pages

In [126]:
# create the output dirs, open the browser and load the archive listing page
cplm = CPLSeleniumManager()

no webdrivers open


In [142]:
# two passes over the signup popup and ad close buttons, with a 2 second pause after each
cplm.close_popups()

no signup popup element on page
no pg top ad element on page
no pg btm ad element on page
no pg btm ad 2 element on page
no signup popup element on page
no pg top ad element on page
no pg btm ad element on page
no pg btm ad 2 element on page


In [110]:
#### NEED TO RETEST
def get_all_list_pages(cplmanager, wait_seconds=2):
    """
    Save the current listing page, then every following page, into the list-pages dir.

    Paging stops as soon as ``CPLSeleniumPageNav.go_to_next_page`` reports that
    the click did not lead to a new URL.

    Args:
        cplmanager: object exposing ``driver`` and ``list_pages_dir``.
        wait_seconds: pause after each page change before the source is saved.
    """
    driver = cplmanager.driver
    dest_dir = cplmanager.list_pages_dir

    # run for page 1
    CPLSeleniumPageNav.save_current_page(driver, dest_dir)

    # do the looping now
    # go_to_next_page is defined on CPLSeleniumPageNav, not on the manager
    while CPLSeleniumPageNav.go_to_next_page(driver):
        # todo: do a smarter wait, lol
        time.sleep(wait_seconds)
        CPLSeleniumPageNav.save_current_page(driver, dest_dir)