In [1]:
from typing import get_args

from IPython.display import clear_output
import pandas as pd
from tqdm.auto import tqdm

from pretrain_on_test import load_classification_data_from_hf, HuggingFaceDatasetNames

In [2]:
# Extract the type arguments of HuggingFaceDatasetNames as a tuple.
# NOTE(review): presumably a typing.Literal of dataset-name strings — confirm
# against its definition in pretrain_on_test.
hf_dataset_names = get_args(HuggingFaceDatasetNames)

In [3]:
# How many dataset names were extracted above.
len(hf_dataset_names)

20

In [4]:
def describe(df: pd.DataFrame, random_state: int | None = None):
    random_obs = df.sample(n=1, random_state=random_state)
    return {
        "lengths": df["text"].str.len(),
        "num_classes": len(df["label"].unique()),
        "example": {
            "text": random_obs["text"].iloc[0],
            "label": random_obs["label"].iloc[0],
        },
    }

In [5]:
# Load every dataset and collect its summary, in the same order as
# hf_dataset_names. The list is rebuilt from scratch so re-running this cell
# is idempotent.
descriptions = []
_progress_bar = tqdm(enumerate(hf_dataset_names), total=len(hf_dataset_names))
for i, dataset in _progress_bar:
    # Replace the previous cell output with a plain-text rendering of the bar.
    clear_output(wait=True)
    print(_progress_bar)
    # The dataset's index is passed as random_state, which describe forwards
    # to DataFrame.sample, so each dataset gets its own fixed seed.
    descriptions.append(
        describe(load_classification_data_from_hf(dataset), random_state=i)
    )

# The print above runs *before* each load, so without this final refresh the
# saved output would stay at the state before the last dataset (e.g. 19/20).
clear_output(wait=True)
print(_progress_bar)

 95%|█████████▌| 19/20 [03:21<00:29, 29.05s/it]


In [6]:
# Maximum number of characters of the example text kept in the summary table.
max_text_length_displayed = 50

# One record per dataset. strict=True makes a length mismatch between the
# names and the collected descriptions (e.g. after an interrupted loading
# cell) raise ValueError instead of silently dropping the trailing datasets.
records = [
    {
        "dataset": dataset_name,
        "number of classes": summary["num_classes"],
        "example text": summary["example"]["text"][:max_text_length_displayed],
    }
    for dataset_name, summary in zip(hf_dataset_names, descriptions, strict=True)
]

In [7]:
# Tabulate the records: one row per record, columns from the record keys.
df = pd.DataFrame(records)

In [8]:
# Show the summary table.
df

Unnamed: 0,dataset,number of classes,example text
0,ag_news,4,First class to the moon London - British airli...
1,SetFit/amazon_counterfactual_en,2,I don't know why X-Mini doesn't update their p...
2,app_reviews,5,watshpp
3,christinacdl/clickbait_notclickbait_dataset,2,"France, Italy, UK sending military advisers to..."
4,climate_fever,4,The peer-reviewed study by two scientists and ...
5,aladar/craigslist_bargains,6,Buyer: That's too much for me can you go lower...
6,emo,4,i hate everyone why so they talk about me
7,dair-ai/emotion,6,i feel increasingly energetic and comfortable ...
8,SetFit/enron_spam,2,largest collection of porn mo \ / ies ever - x...
9,financial_phrasebank,3,The MET is located in the Central Business Dis...


In [9]:
# Largest class count among the summarized datasets.
df["number of classes"].max()

18