In [None]:
from datasets import load_dataset
from datasets import get_dataset_config_names, get_dataset_config_info

import pandas as pd

# Explore the Features of the dataset

In [9]:
'''
    >> Get all available configs
        * Configs are the set of folders (subsets) into which the dataset is organized.
'''
all_configs = get_dataset_config_names("iDRAMALab/iDRAMA-scored-2024")
all_configs

['comments-2020',
 'comments-2021',
 'comments-2022',
 'comments-2023',
 'submissions-2020-to-2023']

In [30]:
'''
    >> Inspect a single config
        * The config info describes the dataset without loading it: its features (columns),
          its size, and its number of data points.
'''
config_info = get_dataset_config_info(
    "iDRAMALab/iDRAMA-scored-2024",
    config_name="comments-2020",
)
config_info

DatasetInfo(description='', citation='', homepage='', license='', features={'uuid': Value(dtype='string', id=None), 'score': Value(dtype='int64', id=None), 'created': Value(dtype='int64', id=None), 'score_up': Value(dtype='int64', id=None), 'community': Value(dtype='string', id=None), 'is_deleted': Value(dtype='bool', id=None), 'score_down': Value(dtype='int64', id=None), 'raw_content': Value(dtype='string', id=None), 'is_moderator': Value(dtype='bool', id=None), 'date': Value(dtype='string', id=None), 'author': Value(dtype='string', id=None), 'embedding': Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)}, post_processed=None, supervised_keys=None, task_templates=None, builder_name='parquet', dataset_name='i_drama-scored-2024', config_name='comments-2020', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=31938791495, num_examples=12774203, shard_lengths=[200000, 191000, 193000, 191000, 191388, 197000, 198000, 196000, 204388, 201000, 200000, 202000,

In [31]:
# List the column names of this config (the keys of its `features` mapping)
config_info.features.keys()

dict_keys(['uuid', 'score', 'created', 'score_up', 'community', 'is_deleted', 'score_down', 'raw_content', 'is_moderator', 'date', 'author', 'embedding'])

# Load Dataset through Huggingface APIs
* HF dataset documentation: [Load from HF-Hub](https://huggingface.co/docs/datasets/en/load_hub)

In [11]:
# Alternative: download & load the complete dataset
# dataset = load_dataset("iDRAMALab/iDRAMA-scored-2024")

# Here: load only one specific config
dataset = load_dataset(
    "iDRAMALab/iDRAMA-scored-2024",
    name="comments-2020",
)

Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/12774203 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/64 [00:00<?, ?it/s]

In [12]:
# Convert the "train" split of the loaded dataset to a pandas DataFrame.
# NOTE: the config info for comments-2020 reports 12,774,203 examples (~32 GB),
# so converting the whole split needs a correspondingly large amount of memory.
pd_df = dataset["train"].to_pandas()
pd_df

Unnamed: 0,uuid,score,created,score_up,community,is_deleted,score_down,raw_content,is_moderator,date,author,embedding
0,FMlZuZ8b,2,1577836858453,2,TheDonald,False,0,"Queue the, ""...And he's a terrorist."", meme.",False,2020-01-01,6201a88cb31263f1ad073d11a161d28d,"[-0.011283473, -0.008606288, -0.039070208, 0.0..."
1,FMlZuZ8c,2,1577836894284,2,TheDonald,False,0,Most of the commenters there are not Americans...,False,2020-01-01,4d685104b75832a93d380a7682e49434,"[-0.024856854, -0.008211391, -0.028705617, 0.0..."
2,FMlZuZ8d,9,1577836900803,9,TheDonald,False,0,The fact that users felt like they didn't know...,False,2020-01-01,a0be10e61102803d94757de6247a8a44,"[-0.06430399, -0.030431006, -0.02697326, 0.000..."
3,FMlZuZ8e,15,1577836907939,15,TheDonald,False,0,Time's up. Deferred means just that.,False,2020-01-01,fe2df53932c26e4fb38e76c5e12c54b2,"[-0.0023614091, -0.03463356, -0.04153254, -0.0..."
4,FMlZuZ8f,1,1577836958731,1,TheDonald,True,0,,False,2020-01-01,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12774198,4Drwos6fl2X,1,1609459191157,1,TheDonald,False,0,"Traitors, traitors everywhere",False,2020-12-31,9f9f398733b92d29c12b66424c1ab7a5,"[-0.044996005, -0.03565693, -0.025443707, 0.02..."
12774199,4Drwos6fl2Y,3,1609459192443,3,TheDonald,False,0,I've seen that it's okay that he did that (you...,False,2020-12-31,f83775d14e08c39e611dcb74ef3e6577,"[-0.022224393, 0.0008413438, -0.011335765, 0.0..."
12774200,4Drwos6fl2Z,1,1609459197195,2,TheDonald,False,1,100% divorce only by abuse and or proven adult...,False,2020-12-31,f7a65e082ff19112af0ce39b1da24929,"[0.019342886, -0.021640928, 0.006174896, 0.037..."
12774201,4Drwos6fl2a,1,1609459198394,1,TheDonald,True,0,,False,2020-12-31,,


## Stream the Dataset

* HF dataset streaming: [Stream from HF-Hub](https://huggingface.co/docs/datasets/en/stream)

In [13]:
# Open the config in streaming mode so that `dataset["train"]` can be consumed as an iterator.
# NOTE: this rebinds `dataset`, which previously held the fully loaded comments-2020 config.
dataset = load_dataset("iDRAMALab/iDRAMA-scored-2024", name="comments-2020", streaming=True)

In [None]:
# Iterate over the streamed samples; print the first one and stop
for row in dataset["train"]:
    print(row)
    break

#### Stream with selected columns

* When you stream the dataset, you can select columns to build your dataset.

In [3]:
# Open the 2021 comments as a stream, restricted to a subset of the columns
dataset = load_dataset(
    "iDRAMALab/iDRAMA-scored-2024",
    name="comments-2021",
    streaming=True,
    columns=["uuid", "raw_content", "embedding"],  # keep only the id, text and embedding columns
)

# Peek at the first streamed sample, then stop
for val in dataset["train"]:
    print(val)
    break

# To collect every row of the selected columns into a DataFrame:
# pd_df = pd.DataFrame([val for val in dataset["train"]])
# pd_df

Downloading readme:   0%|          | 0.00/11.8k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

{'uuid': '4Drwos6fl2c', 'raw_content': 'MUST decertify Biden electors and Certify Trump electors', 'embedding': None}


# Save the data locally instead of in HF-dataset format
* We recommend saving the data in standard formats such as CSV or JSON instead of pickle objects.
* However, as the dataset is large, the compressed `parquet` format is recommended.

In [None]:
'''
Assumes the dataset has already been loaded and converted to a pandas DataFrame -- `pd_df`
'''
# Newline-delimited JSON: one record per line
pd_df.to_json("filename.ndjson", lines=True, orient="records")

# Parquet file compressed with zstd
pd_df.to_parquet("filename.parquet", compression="zstd")