**Description**: Brief descriptions of the classification tasks used in this experiment.
The table is meant to be sent to this [LaTeX table
generator](https://www.tablesgenerator.com/latex_tables).

**Estimated runtime**: ~10 min on the first run because the datasets must be downloaded
locally; ~5 min if they have already been downloaded.

In [1]:
from typing import get_args

from IPython.display import clear_output
import pandas as pd
from tqdm.auto import tqdm

from pretrain_on_test import load_classification_data_from_hf, HuggingFaceDatasetNames

In [2]:
# get_args returns the type arguments of HuggingFaceDatasetNames as a tuple, so
# the annotation is a variable-length tuple of strings.
# NOTE(review): presumably HuggingFaceDatasetNames is a typing.Literal of
# dataset names — confirm against pretrain_on_test.
hf_dataset_names: tuple[str, ...] = get_args(HuggingFaceDatasetNames)

In [3]:
# Number of datasets that will be described below.
len(hf_dataset_names)

25

In [4]:
def describe(df: pd.DataFrame, random_state: int | None = None):
    random_obs = df.sample(n=1, random_state=random_state)
    return {
        "lengths": df["text"].str.len(),
        "num_classes": len(df["label"].unique()),
        "example": {
            "text": random_obs["text"].iloc[0],
            "label": random_obs["label"].iloc[0],
        },
    }

In [5]:
# Load each dataset and summarize it with describe(). The enumeration index is
# reused as the sampling seed, so every dataset gets its own fixed seed for
# picking the example row.
descriptions = []
_progress_bar = tqdm(enumerate(hf_dataset_names), total=len(hf_dataset_names))
for i, dataset in _progress_bar:
    # Wipe the cell output and re-print the progress bar so it stays visible.
    # NOTE(review): presumably this hides logs emitted while loading — confirm.
    # The bar is printed before the iteration's work is done, so the last
    # printed state shows one dataset fewer than the total.
    clear_output(wait=True)
    print(_progress_bar)
    descriptions.append(
        describe(load_classification_data_from_hf(dataset), random_state=i)
    )

 96%|█████████▌| 24/25 [04:29<00:27, 27.23s/it]


In [6]:
# Maps each HuggingFace dataset name to the citation key that is later wrapped
# in \citet{...}. None means no citation is emitted for that dataset (the
# "Author(s)" cell is left empty).
# NOTE(review): keys presumably must match entries in the paper's .bib file —
# confirm.
dataset_to_citation = {
    "ag_news": "zhang2015character",
    "SetFit/amazon_counterfactual_en": "oneill-etal-2021-wish",
    "app_reviews": "grano2017android",
    "blog_authorship_corpus": "schler2006effects",
    "christinacdl/clickbait_notclickbait_dataset": None,
    "climate_fever": "diggelmann2020climatefever",
    "aladar/craigslist_bargains": "he2018decoupling",
    "disaster_response_messages": None,
    "emo": "chatterjee-etal-2019-semeval",
    "dair-ai/emotion": "saravia-etal-2018-carer",
    "SetFit/enron_spam": "metsis2006spam",
    "financial_phrasebank": "Malo2014GoodDO",
    "classla/FRENK-hate-en": "ljubešić2019frenk",
    "hyperpartisan_news_detection": "kiesel-etal-2019-semeval",
    "limit": "manotas-etal-2020-limit",
    "AmazonScience/massive": "fitzgerald2022massive",
    "movie_rationales": "deyoung-etal-2020-eraser",
    "mteb/mtop_domain": "muennighoff-etal-2023-mteb",
    "ccdv/patent-classification": "sharma-etal-2019-bigpatent",
    "rotten_tomatoes": "Pang+Lee:05a",
    "silicone": "chapuis-etal-2020-hierarchical",
    "trec": "wang-etal-2007-jeopardy",
    "tweets_hate_speech_detection": "sharma2019",
    "yahoo_answers_topics": "zhang2018",
    "yelp_review_full": "zhang2015character",
}

In [7]:
# The citation mapping must cover exactly the datasets used in the experiment;
# on failure, list the names that appear on only one side.
assert set(dataset_to_citation) == set(hf_dataset_names), (
    "dataset_to_citation and hf_dataset_names are out of sync: "
    f"{sorted(set(dataset_to_citation) ^ set(hf_dataset_names))}"
)

In [8]:
# Maximum number of characters of the example text kept in the table.
max_text_length_displayed = 30
records = []
# Build one table row per dataset. strict=True makes a length mismatch between
# the dataset names and their descriptions (e.g. an interrupted loading loop)
# raise instead of silently dropping rows.
for dataset, description in zip(hf_dataset_names, descriptions, strict=True):
    dataset_link = f"https://huggingface.co/datasets/{dataset}"
    # Escape underscores for LaTeX. The backslash is written as "\\" because
    # "\_" is an invalid escape sequence in a regular string literal; the
    # resulting text is the same.
    dataset_tex = "\\texttt{" + dataset.replace("_", "\\_") + "}"
    name_and_link = f"\\href{{{dataset_link}}}{{{dataset_tex}}}"
    citation = dataset_to_citation[dataset]
    if citation is None:
        # No citation available: leave the author cell empty.
        citation_tex = ""
    else:
        citation_tex = f"\\citet{{{citation}}}"
    records.append(
        {
            "HuggingFace dataset": name_and_link,
            "Author(s)": citation_tex,
            "Number of classes": description["num_classes"],
            # .tolist() converts to built-in ints so the tuple renders as
            # "(196, 266)" regardless of how NumPy scalars are repr'd.
            "Text length (25, 75) percentiles": tuple(
                description["lengths"].describe()[["25%", "75%"]].astype(int).tolist()
            ),
            "Example text (truncated)": description["example"]["text"][:max_text_length_displayed],
        }
    )

In [9]:
# One row per dataset; the columns are the keys of each record dict.
df = pd.DataFrame(records)

In [10]:
# Display the assembled table for visual inspection.
df

Unnamed: 0,HuggingFace dataset,Author(s),Number of classes,"Text length (25, 75) percentiles",Example text (truncated)
0,\href{https://huggingface.co/datasets/ag_news}...,\citet{zhang2015character},4,"(196, 266)",First class to the moon London
1,\href{https://huggingface.co/datasets/SetFit/a...,\citet{oneill-etal-2021-wish},2,"(60, 125)",I don't know why X-Mini doesn'
2,\href{https://huggingface.co/datasets/app_revi...,\citet{grano2017android},5,"(10, 77)",watshpp
3,\href{https://huggingface.co/datasets/blog_aut...,\citet{schler2006effects},2,"(92, 556)","""Why did you date him again? W"
4,\href{https://huggingface.co/datasets/christin...,,2,"(46, 69)",South China Sea: Beijing says
5,\href{https://huggingface.co/datasets/climate_...,\citet{diggelmann2020climatefever},4,"(80, 156)",'Our harmless emissions of tri
6,\href{https://huggingface.co/datasets/aladar/c...,\citet{he2018decoupling},6,"(346, 713)",Buyer: I am very interested in
7,\href{https://huggingface.co/datasets/disaster...,,3,"(74, 178)","A week later, another thunders"
8,\href{https://huggingface.co/datasets/emo}{\te...,\citet{chatterjee-etal-2019-semeval},4,"(44, 83)",get lost lost with you sure go
9,\href{https://huggingface.co/datasets/dair-ai/...,\citet{saravia-etal-2018-carer},6,"(53, 129)",i just remember feeling really


In [11]:
# print() shows the raw string (unlike its repr), so the LaTeX backslashes in
# the first dataset's \href cell can be checked as-is.
print(df["HuggingFace dataset"].iloc[0])

\href{https://huggingface.co/datasets/ag_news}{\texttt{ag\_news}}


In [12]:
# Same check for the first dataset's \citet cell.
print(df["Author(s)"].iloc[0])

\citet{zhang2015character}


In [13]:
# Smallest and largest number of classes across the datasets in the table.
df["Number of classes"].min(), df["Number of classes"].max()

(2, 18)

In [14]:
# Uncomment to export the table as a CSV file (e.g. for the LaTeX table generator).
# df.to_csv("dataset_descriptions.csv", index=False)