In [38]:
import json

input_file_path = 'MSASL_train.json'
output_file_path = 'parsed-datasets/06_train.json'
num_vids = 50
max_seconds = 58
num_classes = 6

# Read the original JSON data
with open(input_file_path, 'r') as file:
    data = json.load(file)

limited_entries_by_label = {label: [] for label in range(num_classes)}  # For labels 0 through 5

for entry in data:
    if 0 <= entry["label"] < num_classes and entry["end_time"] < max_seconds: # less than 10 seconds to reduce download time
        if len(limited_entries_by_label[entry["label"]]) < num_vids:
            limited_entries_by_label[entry["label"]].append(entry)

limited_filtered_data = [item for sublist in limited_entries_by_label.values() for item in sublist]

filtered_data = sorted(limited_filtered_data, key=lambda x: x["label"])


# Write the filtered data to a new JSON file
with open(output_file_path, 'w') as outfile:
    json.dump(filtered_data, outfile, indent=4)

print(f"Filtered JSON file created at: {output_file_path}")
print(f"Number of elements in the filtered JSON: {len(filtered_data)}")

Filtered JSON file created at: parsed-datasets/06_train.json
Number of elements in the filtered JSON: 145


In [39]:
label_counts = {label: 0 for label in range(6)}
for entry in filtered_data:
    label_counts[entry["label"]] += 1
for label, count in label_counts.items():
    print(f"Label {label}: {count} elements")

Label 0: 22 elements
Label 1: 25 elements
Label 2: 28 elements
Label 3: 30 elements
Label 4: 23 elements
Label 5: 17 elements


In [21]:
input_file_path = 'MSASL_test.json'
output_file_path = 'parsed-datasets/06_test.json'

# Read the original JSON data
with open(input_file_path, 'r') as file:
    data = json.load(file)

# Filter the data based on the specified criteria
filtered_data = [entry for entry in data if 0 <= entry["label"] <= 5 and entry["end_time"] < 10]

filtered_data = sorted(filtered_data, key=lambda x: x["label"])

# Write the filtered data to a new JSON file
with open(output_file_path, 'w') as outfile:
    json.dump(filtered_data, outfile, indent=4)

print(f"Filtered JSON file created at: {output_file_path}")
print(f"Number of elements in the filtered JSON: {len(filtered_data)}")

Filtered JSON file created at: parsed-datasets/06_test.json
Number of elements in the filtered JSON: 32


In [22]:
label_counts = {label: 0 for label in range(6)}
for entry in filtered_data:
    label_counts[entry["label"]] += 1
for label, count in label_counts.items():
    print(f"Label {label}: {count} elements")

Label 0: 17 elements
Label 1: 3 elements
Label 2: 4 elements
Label 3: 5 elements
Label 4: 1 elements
Label 5: 2 elements


In [23]:
input_file_path = 'MSASL_val.json'
output_file_path = 'parsed-datasets/06_val.json'

# Read the original JSON data
with open(input_file_path, 'r') as file:
    data = json.load(file)

# Filter the data based on the specified criteria
filtered_data = [entry for entry in data if 0 <= entry["label"] <= 5 and entry["end_time"] < 10]

filtered_data = sorted(filtered_data, key=lambda x: x["label"])

# Write the filtered data to a new JSON file
with open(output_file_path, 'w') as outfile:
    json.dump(filtered_data, outfile, indent=4)

print(f"Filtered JSON file created at: {output_file_path}")
print(f"Number of elements in the filtered JSON: {len(filtered_data)}")

Filtered JSON file created at: parsed-datasets/06_val.json
Number of elements in the filtered JSON: 16


In [24]:
label_counts = {label: 0 for label in range(6)}
for entry in filtered_data:
    label_counts[entry["label"]] += 1
for label, count in label_counts.items():
    print(f"Label {label}: {count} elements")

Label 0: 6 elements
Label 1: 2 elements
Label 2: 1 elements
Label 3: 3 elements
Label 4: 2 elements
Label 5: 2 elements


In [41]:
# create .txt video list for use with yt-dlp

from math import floor, ceil

# the file
json_file_path = 'parsed-datasets/06_train.json'  
txt_file_path = 'video_list.txt' 

with open(json_file_path, 'r') as json_file:
    data = json.load(json_file)

with open(txt_file_path, 'w') as txt_file:
    for entry in data:
        # Extracting and formatting the start and end times
        start_minutes, start_seconds = divmod(floor(entry['start_time']), 60)
        end_minutes, end_seconds = divmod(ceil(entry['end_time']), 60)

        # Formatting the line to be written to the txt file
        line = f'{entry["url"]},*{start_minutes}:{start_seconds:02d}-{end_minutes}:{end_seconds:02d}\n'
        
        # Writing the formatted line to the txt file
        txt_file.write(line)

print(f'YouTube links with times have been written to {txt_file_path}.')



YouTube links with times have been written to video_list.txt.


In [51]:
# Links in the MS-ASL dataset have been either made private, or the link does not exist anymore. When downloadingn files, yt-dlp sends an erro rin the log and moves on to the next download. Therefore, in the downloaded video-files directory, I need to look through every video file and find the corresponding match in the 06_train.json file in order to create labels for each. 

import os
import json
import re


with open('parsed-datasets/06_train.json') as f:
    data = json.load(f)

# Extract IDs and their corresponding labels from the JSON URLs
json_ids_labels = {}
url_pattern = re.compile(r"v=([a-zA-Z0-9_-]+)")
for item in data:
    match = url_pattern.search(item["url"])
    if match:
        json_ids_labels[match.group(1)] = item["label"]

video_dir = 'video-files'

video_labels = {}

# Iterate through each file in the video-files directory
for filename in os.listdir(video_dir):
    if filename.endswith(('.mp4', '.mkv', '.webm')):
        file_id = filename.split('-')[-1].rsplit('.', 1)[0]
        if file_id in json_ids_labels:
            video_labels[filename] = json_ids_labels[file_id]
        else:
            print(filename, file_id)

# Output the mapping of video files to labels
for video, label in video_labels.items():
    print(f"{video}: Label {label}")

TEACHER-hJZhwVjk-eo.mkv eo
First 100 Verbs in ASL with captions-thdb-I-H9kE.webm H9kE
How to sign 'Hello'-QB44Vddoi-w.webm w
Basic ASL Vocabulary for Babies-htsdwxJ-fTo.webm fTo
'teacher' in American Sign Language--DZaI_yoNac.mkv DZaI_yoNac
Examples of Baby Sign Language-6UrcyZ-QeiU.webm QeiU
ASL Class 02-08-10--LB4ENHxcIs.webm LB4ENHxcIs
Unit 2 Vocabulary (Part 1) rev. 2018-AkGYEiN8vOY.mp4: Label 2
eat - ASL sign for eat-E4LtjQ3gUO0.webm: Label 3
SWIC Community Ed - Class Two-SC9lyDxbwUE.mkv: Label 5
SN Unit 1 Video Vocab Review-p8OYydc3WQM.mkv: Label 0
ASL： Food אוכל-ga_qPj5JN9c.mkv: Label 3
ASL Lesson 1-DOZJOFHs75s.webm: Label 4
TEACHER-CrUCwJklAUA.webm: Label 2
Next Words： People in ASL with captions-Ni6Uixw5Qmo.mkv: Label 2
How to sign Food - ASL Vocabulary Series-0YnizmAgAfU.mkv: Label 3
NO-[opposite-of-yes]-pkzfT9cYvH0.webm: Label 4
ASL Vocabulary ＂nice＂-hesnczlStrA.mkv: Label 1
Hello-FVjpLa8GqeM.webm: Label 0
ASL Level 1 Unit 2 Vocabulary-vcrTUbE1NoY.mkv: Label 2
How to sign Ea

In [52]:
# TEACHER-hJZhwVjk-eo.mkv eo
# First 100 Verbs in ASL with captions-thdb-I-H9kE.webm H9kE
# How to sign 'Hello'-QB44Vddoi-w.webm w
# Basic ASL Vocabulary for Babies-htsdwxJ-fTo.webm fTo
# 'teacher' in American Sign Language--DZaI_yoNac.mkv DZaI_yoNac
# Examples of Baby Sign Language-6UrcyZ-QeiU.webm QeiU
# ASL Class 02-08-10--LB4ENHxcIs.webm LB4ENHxcIs

video_labels['TEACHER-hJZhwVjk-eo.mkv'] = 2
video_labels['First 100 Verbs in ASL with captions-thdb-I-H9kE.webm'] = 3
video_labels["How to sign 'Hello'-QB44Vddoi-w.webm"] = 0
video_labels['Basic ASL Vocabulary for Babies-htsdwxJ-fTo.webm'] = 3
video_labels["'teacher' in American Sign Language--DZaI_yoNac.mkv"] = 2
video_labels['Examples of Baby Sign Language-6UrcyZ-QeiU.webm'] = 3
video_labels['ASL Class 02-08-10--LB4ENHxcIs.webm'] = 4

In [53]:
len(video_labels)

67

In [54]:
file_path = './video_labels.json'

with open(file_path, 'w') as file:
    json.dump(video_labels, file)