In [13]:
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.request
import time


In [5]:
base_url = "https://www.handspeak.com/word/"
output_folder = "asl_videos"
os.makedirs(output_folder, exist_ok=True)

# Get the index page
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
}

response = requests.get(base_url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

In [9]:
# Step 1: Get all word page links
# Find all <a> tags with href like "/word/1234"
word_links = soup.find_all("a", href=re.compile(r"^/word/\d+"))

In [11]:
len(word_links)

315

In [8]:
# Find all <a> tags with href like "/word/1234"
word_links = soup.find_all("a", href=re.compile(r"^/word/\d+"))

print(f"Found {len(word_links)} links")
for link in word_links[:5]:  # Show a few examples
    print(f"{link.get_text(strip=True)} -> {link['href']}")

Found 315 links
about -> /word/6/
again -> /word/51/
ask -> /word/117/
bad -> /word/150/
boy -> /word/223/


In [24]:
# Step 2: Visit each word page and download the 2nd video
for link in word_links:
    relative_url = link['href']
    full_url = urljoin(base_url, relative_url)

    if full_url in seen:
        continue
    seen.add(full_url)

    try:
        word_page = requests.get(full_url, headers=headers)
        word_soup = BeautifulSoup(word_page.content, 'html.parser')

        # Get word name from <h1> or <title>
        heading = word_soup.find("h1") or word_soup.find("title")
        word_name = heading.get_text(strip=True).lower().replace(" ", "_") if heading else "unknown"

        # Find the second video with class 'v-asl'
        video_tags = word_soup.find_all("video", class_="v-asl")
        if len(video_tags) < 2:
            print(f"❌ Less than 2 .v-asl videos for {word_name} ({full_url})")
            continue

        video_tag = video_tags[1]
        video_url = video_tag.get('src')

        if not video_url:
            print(f"❌ No src found for video on {full_url}")
            continue

        video_url = urljoin(full_url, video_url)
        video_path = os.path.join(output_folder, f"{word_name}.mp4")

        # Download video using spoofed headers
        video_headers = {
            "User-Agent": headers["User-Agent"],
            "Referer": full_url
        }

        video_response = requests.get(video_url, headers=video_headers, stream=True)
        if video_response.status_code == 200:
            with open(video_path, "wb") as f:
                for chunk in video_response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"✅ Downloaded: {word_name}")
        else:
            print(f"❌ Failed to download {word_name} (status code: {video_response.status_code})")

        time.sleep(0.5)  # Be kind to the server

    except Exception as e:
        print(f"⚠️ Error processing {full_url}: {e}")

❌ Less than 2 .v-asl videos for can_in_sign_language (https://www.handspeak.com/word/307/)
✅ Downloaded: come_in_sign_language
✅ Downloaded: deaf_in_sign_language
✅ Downloaded: different_in_sign_language
✅ Downloaded: drink_in_sign_language
✅ Downloaded: eat_in_sign_language
✅ Downloaded: family_in_sign_language
❌ Less than 2 .v-asl videos for feel_in_sign_language (https://www.handspeak.com/word/768/)
✅ Downloaded: few_in_sign_language
❌ Less than 2 .v-asl videos for find_in_sign_language (https://www.handspeak.com/word/2746/)
✅ Downloaded: fine_in_sign_language
✅ Downloaded: finish_in_sign_language
✅ Downloaded: for_in_sign_language
✅ Downloaded: forget_in_sign_language
✅ Downloaded: friend_in_sign_language
❌ Less than 2 .v-asl videos for get_in_sign_language (https://www.handspeak.com/word/901/)
✅ Downloaded: girl_in_sign_language
✅ Downloaded: give_in_sign_language
✅ Downloaded: go,_went_in_sign_language
⚠️ Error processing https://www.handspeak.com/word/926/: [Errno 22] Invalid ar

Download videos missed in initial scrape

In [26]:
# Get already downloaded words (without .mp4)
downloaded_words = {
    os.path.splitext(f)[0] for f in os.listdir(output_folder) if f.endswith(".mp4")
}
print(f"📂 Found {len(downloaded_words)} videos already downloaded.")


📂 Found 181 videos already downloaded.


In [27]:
seen = set()

# Step 2: Process each word page
for link in word_links:
    relative_url = link['href']
    full_url = urljoin(base_url, relative_url)

    if full_url in seen:
        continue
    seen.add(full_url)

    try:
        word_page = requests.get(full_url, headers=headers)
        word_soup = BeautifulSoup(word_page.content, 'html.parser')

        # Extract the word name
        heading = word_soup.find("h1") or word_soup.find("title")
        word_name = heading.get_text(strip=True).lower().replace(" ", "_") if heading else "unknown"

        # Skip if already downloaded
        if word_name in downloaded_words:
            print(f"⏩ Skipping already downloaded: {word_name}")
            continue

        # Find video tags
        video_tags = word_soup.find_all("video", class_="v-asl")

        video_tag = None
        if len(video_tags) >= 2:
            video_tag = video_tags[1]
        elif len(video_tags) == 1:
            video_tag = video_tags[0]
            print(f"⚠️ Only one video for {word_name}, using fallback.")

        if not video_tag:
            print(f"❌ No usable video tag for {word_name}")
            continue

        # Get video URL
        video_url = video_tag.get("src")
        if not video_url:
            print(f"❌ No src attribute for video tag of {word_name}")
            continue

        video_url = urljoin(full_url, video_url)
        video_path = os.path.join(output_folder, f"{word_name}.mp4")

        # Download with spoofed headers
        video_headers = {
            "User-Agent": headers["User-Agent"],
            "Referer": full_url
        }

        video_response = requests.get(video_url, headers=video_headers, stream=True)
        if video_response.status_code == 200:
            with open(video_path, "wb") as f:
                for chunk in video_response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"✅ Downloaded: {word_name}")
        else:
            print(f"❌ Failed to download {word_name} (status code: {video_response.status_code})")

        time.sleep(0.5)

    except Exception as e:
        print(f"⚠️ Error processing {full_url}: {e}")

✅ Downloaded: about_in_sign_language
⚠️ Only one video for again_in_sign_language, using fallback.
✅ Downloaded: again_in_sign_language
✅ Downloaded: ask_in_sign_language
⚠️ Only one video for bad_in_sign_language, using fallback.
✅ Downloaded: bad_in_sign_language
✅ Downloaded: boy_in_sign_language
✅ Downloaded: but_in_sign_language
⚠️ Only one video for buy_in_sign_language, using fallback.
✅ Downloaded: buy_in_sign_language
⚠️ Only one video for can_in_sign_language, using fallback.
✅ Downloaded: can_in_sign_language
⏩ Skipping already downloaded: come_in_sign_language
⏩ Skipping already downloaded: deaf_in_sign_language
⏩ Skipping already downloaded: different_in_sign_language
⏩ Skipping already downloaded: drink_in_sign_language
⏩ Skipping already downloaded: eat_in_sign_language
⏩ Skipping already downloaded: family_in_sign_language
⚠️ Only one video for feel_in_sign_language, using fallback.
✅ Downloaded: feel_in_sign_language
⏩ Skipping already downloaded: few_in_sign_language


In [28]:
video_urls = [
    "https://www.handspeak.com/word/1496/",
    "https://www.handspeak.com/word/120/",
    "https://www.handspeak.com/word/1112/",
    "https://www.handspeak.com/word/1911/",
    "https://www.handspeak.com/word/926/",
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
}

for page_url in video_urls:
    try:
        res = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(res.content, "html.parser")

        # Extract the word from the <h1> or <title> tag
        heading = soup.find("h1") or soup.find("title")
        word_name = heading.get_text(strip=True).lower().replace(" ", "_") if heading else "unknown"

        video_tag = soup.find("video")
        video_url = video_tag.get("src") if video_tag else None

        if not video_url:
            print(f"❌ No video found for {word_name} ({page_url})")
            continue

        video_url = urljoin(page_url, video_url)
        video_path = os.path.join(output_folder, f"{word_name}.mp4")

        video_headers = {
            "User-Agent": headers["User-Agent"],
            "Referer": page_url
        }

        video_res = requests.get(video_url, headers=video_headers, stream=True)
        if video_res.status_code == 200:
            with open(video_path, "wb") as f:
                for chunk in video_res.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"✅ Downloaded: {word_name}")
        else:
            print(f"❌ Failed to download {word_name} (status code: {video_res.status_code})")

    except Exception as e:
        print(f"⚠️ Error downloading from {page_url}: {e}")

⚠️ Error downloading from https://www.handspeak.com/word/1496/: [Errno 22] Invalid argument: 'asl_videos\\"no"_in_sign_language.mp4'
⚠️ Error downloading from https://www.handspeak.com/word/120/: [Errno 2] No such file or directory: 'asl_videos\\asl_/_american_sign_language_/_ameslan.mp4'
⚠️ Error downloading from https://www.handspeak.com/word/1112/: [Errno 22] Invalid argument: 'asl_videos\\"in"_in_sign_language.mp4'
⚠️ Error downloading from https://www.handspeak.com/word/1911/: [Errno 22] Invalid argument: 'asl_videos\\"see"_in_sign_language.mp4'
⚠️ Error downloading from https://www.handspeak.com/word/926/: [Errno 22] Invalid argument: 'asl_videos\\"good"_in_sign_language.mp4'


File Name Clean Up

In [1]:
import os
import re

# ← Update this to your actual folder path
folder = r"asl_videos"

# Compile regexes for both naming patterns
p1 = re.compile(r"^(?P<word>.+)_in_sign_language\.mp4$")
p2 = re.compile(r"^signs_for_(?P<word>.+)\.mp4$")

for fname in os.listdir(folder):
    # Skip anything that isn’t a .mp4
    if not fname.lower().endswith(".mp4"):
        continue

    new_name = None
    if m := p1.match(fname):
        new_name = f"{m.group('word')}.mp4"
    elif m := p2.match(fname):
        new_name = f"{m.group('word')}.mp4"

    # If it matched one of our patterns, rename it
    if new_name and new_name != fname:
        src = os.path.join(folder, fname)
        dst = os.path.join(folder, new_name)
        if os.path.exists(dst):
            print(f"⚠️ Destination exists, skipping: {dst}")
        else:
            os.rename(src, dst)
            print(f"✅ Renamed: {fname} → {new_name}")

✅ Renamed: about_in_sign_language.mp4 → about.mp4
✅ Renamed: accept_in_sign_language.mp4 → accept.mp4
✅ Renamed: afternoon_in_sign_language.mp4 → afternoon.mp4
✅ Renamed: against_in_sign_language.mp4 → against.mp4
✅ Renamed: again_in_sign_language.mp4 → again.mp4
✅ Renamed: agree_in_sign_language.mp4 → agree.mp4
✅ Renamed: allow_in_sign_language.mp4 → allow.mp4
✅ Renamed: all_in_sign_language.mp4 → all.mp4
✅ Renamed: alone_in_sign_language.mp4 → alone.mp4
✅ Renamed: always_in_sign_language.mp4 → always.mp4
✅ Renamed: and_in_sign_language.mp4 → and.mp4
✅ Renamed: angry_in_sign_language.mp4 → angry.mp4
✅ Renamed: animal_in_sign_language.mp4 → animal.mp4
✅ Renamed: answer_in_sign_language.mp4 → answer.mp4
✅ Renamed: anything_in_sign_language.mp4 → anything.mp4
✅ Renamed: any_in_sign_language.mp4 → any.mp4
✅ Renamed: argue,_argument_in_sign_language.mp4 → argue,_argument.mp4
✅ Renamed: arrive_in_sign_language.mp4 → arrive.mp4
✅ Renamed: ask_in_sign_language.mp4 → ask.mp4
✅ Renamed: asl_in_