<a href="https://colab.research.google.com/github/jor-mar/HandSpeak-Scraper/blob/main/HandSpeak_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that files written under
# /content/drive/MyDrive/... by later cells land in the user's Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Refresh the apt package index, then install the Chromium WebDriver package.
!apt-get update
!apt-get install -y chromium-chromedriver
# Python packages used by the scraper: selenium, yt-dlp (imported as yt_dlp)
# and opencv-python (imported as cv2).
!pip install selenium yt-dlp opencv-python

In [3]:
import os
import yt_dlp
import cv2
import time
import random
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Selenium ChromeDriver setup for Colab.
# `chrome_options` is a module-level object passed to webdriver.Chrome(...)
# whenever a browser session is created.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run in background (no UI)
# NOTE(review): presumably needed because the Colab runtime runs as root — confirm.
chrome_options.add_argument('--no-sandbox')
# NOTE(review): presumably works around a small /dev/shm in the container — confirm.
chrome_options.add_argument('--disable-dev-shm-usage')
# Explicit path to the browser binary Selenium should launch.
chrome_options.binary_location = "/usr/bin/chromium-browser"

In [4]:
# Step 1: Scrape video URLs and tab text for folder naming
def get_video_links_and_tab_title(url):
    """Load *url* in headless Chrome and collect its video sources.

    Args:
        url: Address of the page to scrape.

    Returns:
        A ``(videos, folder_name)`` tuple. ``videos`` is the list of non-empty
        ``src`` values of the page's ``<video>`` elements. ``folder_name`` is
        the page title with "•" and "ASL Dictionary" removed, stripped and
        lower-cased. If no ``<h1>`` appears within 120 seconds the page is
        treated as not loaded and ``([], "")`` is returned.

    Raises:
        Any Selenium exception other than the handled ``TimeoutException``
        propagates to the caller; the browser is shut down first.
    """
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        try:
            # Wait for a stable element that exists on ALL pages
            WebDriverWait(driver, 120).until(
                EC.presence_of_element_located((By.TAG_NAME, 'h1'))
            )
        except TimeoutException:
            print(f"Page {url} failed to load properly — skipping.")
            return [], ""

        folder_name = (
            driver.title.replace("•", "")
            .replace("ASL Dictionary", "")
            .strip()
            .lower()
        )

        # find_elements returns an empty list (it does not wait or raise)
        # when the page has no <video> tags.
        sources = (
            vid.get_attribute('src')
            for vid in driver.find_elements(By.TAG_NAME, 'video')
        )
        # Drop <video> tags without a usable src so callers never receive
        # None or "" as a download URL.
        videos = [src for src in sources if src]

        return videos, folder_name
    finally:
        # Always shut the browser down, including when an unexpected error
        # propagates; otherwise every failed attempt leaks a Chromium process.
        driver.quit()

# Step 2: Download videos using yt-dlp
def download_video(url, folder, filename):
    """Download *url* with yt-dlp into the ASL folder on the mounted Drive.

    Args:
        url: Video URL to download.
        folder: Sub-folder name, created under ``/content/drive/MyDrive/ASL``.
        filename: File name to write inside that folder.

    Returns:
        The full output path if the file exists after the download, otherwise
        ``None`` (download failed or the file was not produced at that path).
    """
    folder = f"/content/drive/MyDrive/ASL/{folder}"
    os.makedirs(folder, exist_ok=True)
    output_path = os.path.join(folder, filename)

    ydl_opts = {
        # 'outtmpl' is a %-style template in yt-dlp, so a literal percent sign
        # in the path (e.g. from a page title) must be doubled to stay literal.
        'outtmpl': output_path.replace('%', '%%'),
        'format': 'bestvideo+bestaudio/best',
        'postprocessors': [{'key': 'FFmpegVideoConvertor', 'preferedformat': 'mp4'}],
    }

    # Best-effort: one broken video must not abort the whole scrape, so any
    # failure is reported and signalled to the caller with None.
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None

    return output_path if os.path.exists(output_path) else None

# Step 3: Extract frames
def extract_frames(video_path, output_folder):
    """Write every frame of *video_path* as a JPEG into *output_folder*.

    Frames are saved as ``frame_000.jpg``, ``frame_001.jpg``, ... in the order
    they are read. The folder is created if it does not exist.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the frame images.
    """
    os.makedirs(output_folder, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Make the failure visible instead of silently writing no frames.
            print(f"Could not open video for frame extraction: {video_path}")
            return

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            cv2.imwrite(
                os.path.join(output_folder, f"frame_{frame_count:03d}.jpg"),
                frame,
            )
            frame_count += 1
    finally:
        # Release the capture handle even if reading or writing raises.
        cap.release()

# Step 4: Delete video after extraction (if needed)
def delete_video(video_path):
    """Remove the file at *video_path* if it exists and report the deletion.

    Does nothing (and prints nothing) when the path does not exist.
    """
    if not os.path.exists(video_path):
        return

    os.remove(video_path)
    print(f"Deleted video: {video_path}")

In [None]:
def main(start_page=1, end_page=10973, *, delete_video_flag=True, max_attempts=10):
    """Scrape HandSpeak word pages, download their videos and extract frames.

    For every page number in ``start_page..end_page`` (inclusive) the page is
    scraped for video URLs; each video is downloaded, split into frames and,
    optionally, deleted afterwards.

    Args:
        start_page: First page number to process. Pass a later page to resume
            an interrupted run.
        end_page: Last page number to process (inclusive).
        delete_video_flag: Delete each downloaded video once its frames have
            been extracted.
        max_attempts: How many times to try scraping a page before giving up
            and stopping the whole run.
    """
    for page_num in range(start_page, end_page + 1):
        url = f"https://www.handspeak.com/word/{page_num}/"
        print(f"Processing page: {url}")

        for attempt in range(1, max_attempts + 1):
            try:
                videos, folder_name = get_video_links_and_tab_title(url)
                break  # Exit loop if successful
            except Exception as e:
                print(f"Attempt {attempt} failed: {e}")
                time.sleep(2)  # Short delay before retrying
        else:
            # Every attempt failed: stop instead of continuing through the
            # remaining pages.
            print(f"Failed to fetch videos from {url} after {max_attempts} attempts. Stopping program.")
            return

        if not videos:
            print(f"No videos found on {url}")
            continue

        for idx, video_url in enumerate(videos):
            video_file = download_video(video_url, folder_name, f"video_{idx}.mp4")
            if video_file:
                extract_frames(video_file, f"/content/drive/MyDrive/ASL/{folder_name}/video_{idx}")

                # Optionally delete video after extraction
                if delete_video_flag:
                    delete_video(video_file)

            # Random delay to avoid server detection
            time.sleep(random.uniform(2, 5))


if __name__ == '__main__':
    main()