In [8]:
from datasets import load_dataset
from datasets import get_dataset_config_names, get_dataset_config_info

import pandas as pd
import json

# Explore the Features of the dataset

In [2]:
# List all available configs of the dataset.
#   * Configs are the set of folders the dataset is organized into.
all_configs = get_dataset_config_names(path="iDRAMALab/iDRAMA-rumble-2024")
all_configs

Downloading readme:   0%|          | 0.00/5.42k [00:00<?, ?B/s]

['face_embeddings',
 'representative_images',
 'speaker_diarization',
 'transcripts']

In [5]:
# Fetch the metadata of one specific config.
#   * Config info lets you explore the dataset without loading it, e.g.
#     its features/columns, its size and the number of data points.
config_info = get_dataset_config_info(
    path="iDRAMALab/iDRAMA-rumble-2024",
    config_name="representative_images",
)
config_info

DatasetInfo(description='', citation='', homepage='', license='', features={'image': Image(mode=None, decode=True, id=None), 'channel_name_lower': Value(dtype='string', id=None), 'v_id': Value(dtype='string', id=None), 'filename': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='i_drama-rumble-2024', config_name='representative_images', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=29786708764.246, num_examples=252387, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=23599930820, post_processing_size=None, dataset_size=29786708764.246, size_in_bytes=None)

In [6]:
# Names of the features (columns) declared for the selected config
feature_names = config_info.features.keys()
feature_names

dict_keys(['image', 'channel_name_lower', 'v_id', 'filename'])

# Load Dataset through Huggingface APIs
* HF dataset documentation: [Load from HF-Hub](https://huggingface.co/docs/datasets/en/load_hub)

In [None]:
# Alternative noted by the authors for downloading and loading the complete dataset:
# dataset = load_dataset("iDRAMALab/iDRAMA-rumble-2024")

# Here only the "transcripts" config is loaded
dataset = load_dataset(
    path="iDRAMALab/iDRAMA-rumble-2024",
    name="transcripts",
)

In [4]:
# Convert the "train" split to a pandas DataFrame.
# `Dataset.to_pandas()` is the conversion provided by the datasets library;
# it avoids building the frame by iterating over the split row by row in
# Python, which is what `pd.DataFrame(dataset["train"])` does.
pd_df = dataset["train"].to_pandas()
pd_df

Unnamed: 0,v_id,transcription,named_entities
0,v1006yj,"[{""id"": 1, ""seek"": 2550, ""start"": 0.0, ""end"": ...","[[""Biden"", ""PERSON""], [""Russia"", ""GPE""], [""Rus..."
1,v100aft,"[{""id"": 1, ""seek"": 2942, ""start"": 0.69, ""end"":...","[[""US"", ""GPE""], [""Russia"", ""GPE""], [""Putin"", ""..."
2,v101u17,"[{""id"": 1, ""seek"": 3000, ""start"": 22.53, ""end""...","[[""Lawsplaining the Interwebs"", ""WORK_OF_ART""]..."
3,v10245a,"[{""id"": 1, ""seek"": 2698, ""start"": 0.42, ""end"":...","[[""one"", ""CARDINAL""], [""the week"", ""DATE""], [""..."
4,v102di1,"[{""id"": 1, ""seek"": 2650, ""start"": 4.93, ""end"":...","[[""Bridget Phetasy"", ""PERSON""], [""Bridget Phet..."
...,...,...,...
6730,vzz9ab,"[{""id"": 1, ""seek"": 2878, ""start"": 0.26, ""end"":...","[[""Oklahoma"", ""GPE""], [""Texas"", ""GPE""], [""Okla..."
6731,vzzgm4,"[{""id"": 1, ""seek"": 2926, ""start"": 0.0, ""end"": ...","[[""florida"", ""GPE""], [""the democrat party"", ""O..."
6732,vzzk8h,"[{""id"": 1, ""seek"": 2936, ""start"": 0.0, ""end"": ...","[[""Today"", ""DATE""], [""the Matt Wall Show"", ""WO..."
6733,v16uipm,"[{""id"": 1, ""seek"": 2862, ""start"": 0.0, ""end"": ...","[[""Texas"", ""GPE""], [""Uvalde"", ""PRODUCT""], [""th..."


In [None]:
# The `transcription` column holds JSON-encoded strings; decode the first one
first_transcript = pd_df["transcription"].iloc[0]
json.loads(first_transcript)

## Stream the Dataset

* HF dataset streaming: [Stream from HF-Hub](https://huggingface.co/docs/datasets/en/stream)

In [5]:
# With streaming=True the "transcripts" config is exposed as an iterator
dataset = load_dataset(
    path="iDRAMALab/iDRAMA-rumble-2024",
    name="transcripts",
    streaming=True,
)

In [None]:
# Peek at the first streamed sample only (nothing is printed if the split is empty)
first_row = next(iter(dataset["train"]), None)
if first_row is not None:
    print(first_row)

#### Stream with selected columns

* When you stream the dataset, you can select columns to build your dataset.

In [None]:
# Stream the "transcripts" config again, restricted to a subset of columns
dataset = load_dataset(
    path="iDRAMALab/iDRAMA-rumble-2024",
    name="transcripts",
    columns=["v_id", "transcription"],  # keep only the `v_id` and `transcription` columns
    streaming=True,
)

# Show the first streamed sample (nothing is printed if the split is empty)
first_sample = next(iter(dataset["train"]), None)
if first_sample is not None:
    print(first_sample)

# Materialize every streamed sample (selected columns only) into a DataFrame
train_stream = dataset["train"]
pd_df = pd.DataFrame(list(train_stream))
pd_df

# Save the data locally instead of in HF-dataset format
* We recommend saving the data in standard formats like CSV or JSON instead of pickle objects.
* However, as the dataset is large, the compressed `parquet` format is recommended.

In [None]:
# Prerequisite: the dataset has already been loaded and converted to the
# pandas DataFrame `pd_df`.

# Standard serialized format: newline-delimited JSON, one record per line
pd_df.to_json(path_or_buf="filename.ndjson", orient="records", lines=True)

# Parquet file compressed with zstd
pd_df.to_parquet(path="filename.parquet", compression="zstd")