In [1]:
import os
import json
import time
import sys
import urllib.request
from multiprocessing.dummy import Pool
import random
import logging

logging.basicConfig(filename='download_{}.log'.format(int(time.time())), filemode='w', level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

# Set this to youtube-dl if you want to use youtube-dl.
youtube_downloader = "yt-dlp"

In [2]:
def request_video(url, referer=''):
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent} 
    if referer:
        headers['Referer'] = referer
    request = urllib.request.Request(url, None, headers)
    logging.info('Requesting {}'.format(url))
    response = urllib.request.urlopen(request)
    return response.read()

In [3]:
def save_video(data, saveto):
    with open(saveto, 'wb+') as f:
        f.write(data)
    time.sleep(random.uniform(0.5, 1.5))

In [4]:
def download_youtube(url, dirname, video_id):
    raise NotImplementedError("Urllib cannot deal with YouTube links.")

def download_aslpro(url, dirname, video_id):
    saveto = os.path.join(dirname, '{}.swf'.format(video_id))
    if os.path.exists(saveto):
        logging.info('{} exists at {}'.format(video_id, saveto))
        return
    data = request_video(url, referer='http://www.aslpro.com/cgi-bin/aslpro/aslpro.cgi')
    save_video(data, saveto)

In [5]:
def download_nonyt_videos(indexfile, saveto='raw_videos'):
    content = json.load(open(indexfile))
    if not os.path.exists(saveto):
        os.mkdir(saveto)
    for entry in content:
        gloss = entry['gloss']
        instances = entry['instances']
        for inst in instances:
            video_url = inst['url']
            video_id = inst['video_id']
            logging.info('gloss: {}, video: {}.'.format(gloss, video_id))
            try:
                data = request_video(video_url)
                save_video(data, os.path.join(saveto, '{}.mp4'.format(video_id)))
            except Exception as e:
                logging.error('Failed to download video: {}'.format(video_id))

In [6]:
def check_youtube_dl_version():
    ver = os.popen(f'{youtube_downloader} --version').read()
    assert ver, f"{youtube_downloader} cannot be found in PATH. Please verify your installation."

def download_yt_videos(indexfile, saveto='raw_videos'):
    content = json.load(open(indexfile))
    if not os.path.exists(saveto):
        os.mkdir(saveto)
    for entry in content:
        gloss = entry['gloss']
        instances = entry['instances']
        for inst in instances:
            video_url = inst['url']
            video_id = inst['video_id']
            if 'youtube' not in video_url and 'youtu.be' not in video_url:
                continue
            cmd = f"{youtube_downloader} '{video_url}' -o '{saveto}/%(id)s.%(ext)s'"
            rv = os.system(cmd)
            if not rv:
                logging.info(f'Finished downloading: {video_url}')
            else:
                logging.error(f'Failed downloading: {video_url}')
            time.sleep(random.uniform(1.0, 1.5))

In [None]:
if __name__ == '__main__':
    logging.info('Start downloading non-YouTube videos.')
    download_nonyt_videos('WLASL_v0.3.json')
   

Start downloading non-YouTube videos.
gloss: book, video: 69241.
Requesting http://aslbricks.org/New/ASL-Videos/book.mp4
gloss: book, video: 65225.
Requesting https://aslsignbank.haskins.yale.edu/dictionary/protected_media/glossvideo/ASL/BO/BOOK-418.mp4
Failed to download video: 65225
gloss: book, video: 68011.
Requesting https://www.youtube.com/watch?v=0UsjUE-TXns
gloss: book, video: 68208.
Requesting https://www.youtube.com/watch?v=1QOYOZ3g-aY
gloss: book, video: 68012.
Requesting https://www.youtube.com/watch?v=aGtIHKEdCds
gloss: book, video: 70212.
Requesting https://www.youtube.com/watch?v=hjS0dQDgbjo
gloss: book, video: 70266.
Requesting https://www.youtube.com/watch?v=WGfiiDgrq1I
gloss: book, video: 07085.
Requesting http://www.aslpro.com/main/b/book_english_grammar.swf
Failed to download video: 07085
gloss: book, video: 07086.
Requesting http://www.aslpro.com/main/b/book_geography.swf
Failed to download video: 07086
gloss: book, video: 07087.
Requesting http://www.aslpro.com/ma

/bin/sh: 1: yt-dlp: not found


AssertionError: yt-dlp cannot be found in PATH. Please verify your installation.

In [None]:
check_youtube_dl_version()
logging.info('Start downloading YouTube videos.')
download_yt_videos('WLASL_v0.3.json')

In [None]:
# Get the list of filenames in the 'videos' directory
filenames = set(os.listdir('raw_videos'))

# Load JSON content from the file
with open('WLASL_v0.3.json', 'r') as json_file:
    content = json.load(json_file)

missing_ids = []

# Iterate over each entry and its instances
for entry in content:
    for inst in entry['instances']:
        video_id = inst['video_id']
        # Check if the corresponding .mp4 file is missing
        if video_id + '.mp4' not in filenames:
            missing_ids.append(video_id)

# Save the missing video IDs in 'missing.txt'
with open('missing.txt', 'w') as f:
    f.write('\n'.join(missing_ids))