This notebook does the following in order:

1. Read from the sitelist.csv file, and generate a dataframe of domains
2. Run Selenium to launch Chrome, navigate to the sites one at a time, and take screenshots. The browser scrolls down to the end of each page, one screen at a time, taking a screenshot at each step. If a page does not finish loading within the timeout, that site is skipped and the run moves on to the next one.

In [1]:
import os
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options

In [3]:
# Load the list of sites from the CSV (the first column holds the sites)
sitelist_df = pd.read_csv('./sitelist.csv')

# Build the URL list from the first column:
#   - drop empty cells (a NaN would raise TypeError on string concatenation)
#   - strip stray whitespace around each entry
#   - only add the 'https://' prefix when the entry has no scheme already,
#     so an entry such as 'https://example.com' is not double-prefixed
sites = sitelist_df.iloc[:, 0].dropna().astype(str).str.strip()
urls = [
    site if site.startswith(("http://", "https://")) else "https://" + site
    for site in sites
    if site  # skip cells that were whitespace-only
]

# Create the screenshots directory; exist_ok makes this safe to re-run
os.makedirs("screenshots", exist_ok=True)

# Timestamp helper (used when building screenshot filenames)
def get_timestamp():
    """Return the current local date and time as a ``YYYYMMDD_HHMMSS`` string."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"

In [5]:
# Initialize the base Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")  # Disables GPU acceleration
# chrome_options.add_argument("--headless") # Runs without a window i.e. headless mode

# Window sizes for desktop (1920x1080) and mobile (360x800).
# Chrome's --window-size switch takes "width,height" (comma-separated); the
# "1920x1080" spelling is not the documented format, so the requested size
# would not be applied.
desktop_window_size = "--window-size=1920,1080"
mobile_window_size = "--window-size=360,800"

In [7]:
# Capture screenshots for both desktop and mobile views.
#
# For each device profile a Chrome session is started, every URL in `urls` is
# visited, one screenshot is saved right after load, and then one screenshot
# is saved per viewport-height step while scrolling down the page.

# Timing knobs, in seconds
PAGE_LOAD_TIMEOUT = 30    # driver.get() raises TimeoutException after this long
PAGE_SETTLE_DELAY = 15    # pause after a successful load before capturing
SCROLL_SETTLE_DELAY = 10  # pause after each scroll before capturing

for device in ["desktop", "mobile"]:
    # Build a fresh Options object per device, seeded with the base arguments.
    # Adding to the shared `chrome_options` directly would accumulate state:
    # the mobile pass would also carry the desktop window size, and re-running
    # this cell would leave mobile emulation enabled for the desktop pass.
    device_options = Options()
    for argument in chrome_options.arguments:
        device_options.add_argument(argument)

    # Configure Chrome options based on device type
    if device == "desktop":
        device_options.add_argument(desktop_window_size)
    elif device == "mobile":
        device_options.add_argument(mobile_window_size)
        # Emulate a mobile device
        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 800, "pixelRatio": 3.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 9; Pixel 3 XL) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Mobile Safari/537.36"
        }
        device_options.add_experimental_option("mobileEmulation", mobile_emulation)

    # Initialize the WebDriver with the options
    driver = webdriver.Chrome(options=device_options)

    # try/finally guarantees the browser is closed even if a site raises an
    # unexpected error part-way through the run.
    try:
        driver.set_page_load_timeout(PAGE_LOAD_TIMEOUT)

        for url in urls:
            try:
                driver.get(url)
                time.sleep(PAGE_SETTLE_DELAY)  # Wait for the page to load
            except TimeoutException:
                print(f"Page load timeout for {url}. Moving to the next site.")
                continue  # Move to the next URL if page load times out
            except WebDriverException as exc:
                # Navigation failures other than a timeout (e.g. DNS or
                # connection errors) should skip this site, not abort the run.
                reason = ((exc.msg or "").strip() or "unknown error").splitlines()[0]
                print(f"Could not load {url}: {reason}. Moving to the next site.")
                continue

            # Format the site name and timestamp
            site_name = url.replace('https://', '').replace('/', '_')
            timestamp = get_timestamp()

            # Screenshot taken right after load.
            # NOTE(review): save_screenshot captures the current window, so
            # despite the `_full` suffix this is likely only the first
            # viewport - confirm whether a true full-page capture is wanted.
            full_page_screenshot = f"screenshots/{site_name}_{device}_{timestamp}_full.png"
            driver.save_screenshot(full_page_screenshot)

            # Capture viewable-area screenshots while scrolling.
            # The page height is read once, before scrolling starts.
            scroll_height = driver.execute_script("return document.body.scrollHeight")
            viewable_area_height = driver.execute_script("return window.innerHeight")

            for i in range(0, scroll_height, viewable_area_height):
                driver.execute_script(f"window.scrollTo(0, {i})")
                time.sleep(SCROLL_SETTLE_DELAY)  # Wait for the scroll action to complete
                viewable_screenshot = f"screenshots/{site_name}_{device}_{timestamp}_view_{i}.png"
                driver.save_screenshot(viewable_screenshot)
    finally:
        driver.quit()

Page load timeout for https://ko.yourtripagent.com. Moving to the next site.
Page load timeout for https://laodong.vn. Moving to the next site.
Page load timeout for https://malaysiakini.com. Moving to the next site.
Page load timeout for https://matichon.co.th. Moving to the next site.
Page load timeout for https://philnews.ph. Moving to the next site.
Page load timeout for https://planetware.com. Moving to the next site.
Page load timeout for https://plo.vn. Moving to the next site.
Page load timeout for https://politico.com. Moving to the next site.
Page load timeout for https://politics.com.ph. Moving to the next site.
Page load timeout for https://president.jp. Moving to the next site.
Page load timeout for https://primer.com.ph. Moving to the next site.
Page load timeout for https://princeoftravel.com. Moving to the next site.
Page load timeout for https://qz.com. Moving to the next site.
Page load timeout for https://rdasia.com. Moving to the next site.
Page load timeout for htt