Start by making sure you have the following packages in your environment:

In [None]:
# ! pip install huggingface_hub datasets

In [None]:
# Manifest file to load; it is read below one line at a time, each line parsed as JSON.
MANIFEST_PATH = "../dataset/output/manifests.json"
# Hub repository the dataset is pushed to, written as <user-name>/<dataset-name>.
MY_DATASET = "janaab/supreme-court-speech"

In [None]:
import json

# Load the manifest as JSON Lines: every non-empty line is one segment dict.
segments = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(MANIFEST_PATH, "r", encoding="utf-8") as manifest_file:
    for line in manifest_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of letting json.loads raise on them.
        if not line.strip():
            continue
        segments.append(json.loads(line))

Check out a few stats on the dataset you have right now:

In [None]:
def total_duration(segments):
    """Return the sum of the ``"duration"`` values over all segments.

    Args:
        segments: Iterable of dicts, each providing a ``"duration"`` entry.

    Returns:
        The summed duration, in the same unit as the individual entries
        (``0`` for an empty input).
    """
    durations = (item["duration"] for item in segments)
    return sum(durations)

In [None]:
# Keep only the segments whose "score" is at least THRESHOLD; the rest are dropped.
# NOTE(review): presumably this is a per-segment quality/alignment score - confirm
# against whatever produced the manifest.
THRESHOLD = -2.0
clean_segments = list(filter(lambda seg: seg["score"] >= THRESHOLD, segments))

In [None]:
# Total duration of every loaded segment, in hours (assumes "duration" is in seconds).
total_duration(segments) / 3600

In [None]:
# Total duration that survives the score filter, in hours (assumes "duration" is in seconds).
total_duration(clean_segments) / 3600

## HF Dataset

Create train and test splits, then upload them to the Hugging Face (HF) Hub.

In [None]:
import random

def split_segments_by_duration_and_length(segments, train_ratio=0.7, seed=None):
    """Randomly split segments into a train list and a test list.

    Segments are visited in random order and greedily added to the train
    split as long as doing so keeps BOTH of these budgets:

    * accumulated train duration <= ``train_ratio`` * total duration
    * number of train segments   <= ``int(train_ratio * len(segments))``

    Any segment that would break either budget goes to the test split, so the
    train split can end up somewhat below ``train_ratio`` on both measures.

    Args:
        segments: Sequence of dicts, each with a numeric ``"duration"`` entry.
            The sequence itself is left untouched.
        train_ratio: Upper bound on the fraction of total duration and of
            segment count assigned to the train split.
        seed: Optional seed for a reproducible shuffle. When ``None``, the
            module-level ``random`` generator is used (so ``random.seed``
            still controls the outcome).

    Returns:
        A ``(train_list, test_list)`` tuple; together the two lists contain
        every input segment exactly once.
    """
    # Named so it does not shadow the notebook's total_duration() helper.
    duration_total = sum(segment['duration'] for segment in segments)

    # Budgets the train split must stay within.
    train_duration_threshold = train_ratio * duration_total
    train_length_threshold = int(train_ratio * len(segments))

    # Shuffle a copy: shuffling `segments` directly would silently reorder
    # the caller's list as a side effect of computing the split.
    shuffled = list(segments)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train_list = []
    test_list = []
    accumulated_duration = 0

    for segment in shuffled:
        fits_duration = (
            accumulated_duration + segment['duration'] <= train_duration_threshold
        )
        fits_length = len(train_list) + 1 <= train_length_threshold
        if fits_duration and fits_length:
            train_list.append(segment)
            accumulated_duration += segment['duration']
        else:
            # Over budget on duration or count: this segment goes to test,
            # but later (shorter) segments may still fit into train.
            test_list.append(segment)

    return train_list, test_list


In [None]:
# Split the score-filtered segments; 0.7 is the function's default train_ratio,
# spelled out here so the ratio is visible at the call site.
train, test = split_segments_by_duration_and_length(clean_segments, train_ratio=0.7)

Again, some stats on your dataset:

In [None]:
# Duration of the train split, in hours (assumes "duration" is in seconds).
total_duration(train) / 3600

In [None]:
# Duration of the test split, in hours (assumes "duration" is in seconds).
total_duration(test) / 3600

In [None]:
# Number of segments in the train and test splits, in that order.
print(f"{len(train)} {len(test)}")

In [None]:
from datasets import Dataset, DatasetDict, Audio

def create_dataset(segments):
    """Build a ``Dataset`` with audio, transcript and duration columns.

    Each segment's ``"audio_filepath"``, ``"text"`` and ``"duration"`` entries
    become the ``audio``, ``transcript`` and ``duration`` columns respectively;
    the ``audio`` column is then cast to the ``Audio`` feature type.

    Args:
        segments: Iterable of dicts providing the three keys named above.

    Returns:
        The resulting ``Dataset``.
    """
    columns = {"audio": [], "transcript": [], "duration": []}
    for seg in segments:
        columns["audio"].append(seg["audio_filepath"])
        columns["transcript"].append(seg["text"])
        columns["duration"].append(seg["duration"])

    table = Dataset.from_dict(columns)
    return table.cast_column("audio", Audio())

In [None]:
# Bundle the two splits under the names "train" and "test".
dataset_dict = DatasetDict({
    split_name: create_dataset(split_segments)
    for split_name, split_segments in (("train", train), ("test", test))
})

In [None]:
# Log in to the HF Hub (uncomment the call below if you are not already authenticated)

from huggingface_hub import interpreter_login
# interpreter_login()

In [None]:
# Upload the train/test DatasetDict to the Hub repository named by MY_DATASET.
# NOTE(review): presumably requires being logged in with write access to that
# repository - confirm before running.
dataset_dict.push_to_hub(MY_DATASET)