**Description**: Brief descriptions of the classification tasks used in this experiment.

**Estimated runtime**: ~10 min because the datasets must first be downloaded locally;
~5 min if they have already been downloaded

In [1]:
from typing import get_args

from IPython.display import clear_output
import pandas as pd
from tqdm.auto import tqdm

from pretrain_on_test import load_classification_data_from_hf, HuggingFaceDatasetNames

In [2]:
# Dataset names are the type arguments of HuggingFaceDatasetNames, extracted
# with get_args. The tuple is variable-length, hence `tuple[str, ...]`
# (`tuple[str]` would mean a tuple of exactly one string).
hf_dataset_names: tuple[str, ...] = get_args(HuggingFaceDatasetNames)

In [3]:
# How many dataset names were extracted above.
len(hf_dataset_names)

20

In [4]:
def describe(df: pd.DataFrame, random_state: int | None = None):
    random_obs = df.sample(n=1, random_state=random_state)
    return {
        "lengths": df["text"].str.len(),
        "num_classes": len(df["label"].unique()),
        "example": {
            "text": random_obs["text"].iloc[0],
            "label": random_obs["label"].iloc[0],
        },
    }

In [5]:
# Load and summarize every dataset. The dataset's position in
# `hf_dataset_names` is reused as the sampling seed, so the example row picked
# for each dataset is reproducible.
descriptions = []
_progress_bar = tqdm(enumerate(hf_dataset_names), total=len(hf_dataset_names))
for i, dataset in _progress_bar:
    # Replace whatever the previous iteration wrote to the cell output
    # (presumably download logs from the loader) with one current progress line.
    clear_output(wait=True)
    print(_progress_bar)
    descriptions.append(
        describe(load_classification_data_from_hf(dataset), random_state=i)
    )
# The bar is printed *before* each dataset is processed, so without this final
# refresh the saved output stops at (N-1)/N even though every dataset finished.
clear_output(wait=True)
print(_progress_bar)

 95%|█████████▌| 19/20 [03:33<00:28, 28.40s/it]


In [6]:
# Build one table row per dataset, with cells pre-formatted for a LaTeX table.
max_text_length_displayed = 30  # characters of the example text kept in the table
records = []
# strict=True: raise instead of silently dropping datasets if `descriptions`
# is shorter than `hf_dataset_names` (e.g. the loading cell was interrupted).
for dataset, description in zip(hf_dataset_names, descriptions, strict=True):
    dataset_link = f"https://huggingface.co/datasets/{dataset}"
    # Underscores must be escaped in LaTeX text. The backslash is written
    # explicitly as "\\_" because "\_" is an invalid Python escape sequence.
    dataset_tex = "\\texttt{" + dataset.replace("_", "\\_") + "}"
    records.append(
        {
            "HuggingFace dataset": f"\\href{{{dataset_link}}}{{{dataset_tex}}}",
            "Number of classes": description["num_classes"],
            # .tolist() yields plain Python ints so the tuple renders as
            # "(196, 266)" rather than with NumPy scalar reprs.
            "Text length (25, 75) percentiles": tuple(
                description["lengths"].describe()[["25%", "75%"]].astype(int).tolist()
            ),
            "Example text (truncated)": description["example"]["text"][:max_text_length_displayed],
        }
    )

In [7]:
# Assemble the per-dataset records into a single table (one row per dataset).
df = pd.DataFrame(records)

In [8]:
# Render the summary table.
df

Unnamed: 0,HuggingFace dataset,Number of classes,"Text length (25, 75) percentiles",Example text (truncated)
0,\href{https://huggingface.co/datasets/ag_news}...,4,"(196, 266)",First class to the moon London
1,\href{https://huggingface.co/datasets/SetFit/a...,2,"(60, 125)",I don't know why X-Mini doesn'
2,\href{https://huggingface.co/datasets/app_revi...,5,"(10, 77)",watshpp
3,\href{https://huggingface.co/datasets/christin...,2,"(46, 69)","France, Italy, UK sending mili"
4,\href{https://huggingface.co/datasets/climate_...,4,"(80, 156)",The peer-reviewed study by two
5,\href{https://huggingface.co/datasets/aladar/c...,6,"(346, 713)",Buyer: That's too much for me
6,\href{https://huggingface.co/datasets/emo}{\te...,4,"(44, 83)",i hate everyone why so they ta
7,\href{https://huggingface.co/datasets/dair-ai/...,6,"(53, 129)",i feel increasingly energetic
8,\href{https://huggingface.co/datasets/SetFit/e...,2,"(342, 1553)",largest collection of porn mo
9,\href{https://huggingface.co/datasets/financia...,3,"(79, 157)",The MET is located in the Cent


In [9]:
# Show the first row's full LaTeX cell. print() is used instead of a bare
# expression so the string appears verbatim (single backslashes, no quotes)
# rather than as its repr.
print(df["HuggingFace dataset"].iloc[0])

\href{https://huggingface.co/datasets/ag_news}{\texttt{ag\_news}}


In [10]:
# Largest number of classes found in any of the datasets.
df["Number of classes"].max()

18

In [11]:
# df.to_csv("dataset_descriptions.csv", index=False)