In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Subset

In [2]:
# Load the biotech event-classification corpus from the Hugging Face Hub.
# The installed `datasets` release warns that this dataset needs
# `trust_remote_code=True` and that the flag will become mandatory, so it is
# passed explicitly. NOTE(review): this runs the dataset's loading script
# locally; confirm the repository is trusted before re-running.
dataset = load_dataset(
    'knowledgator/events_classification_biotech',
    trust_remote_code=True,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Distinct primary labels (first entry of each row's `all_labels`) in the
# training split. The labels are sorted so the class -> id mapping is the same
# on every kernel restart; iterating a plain set of strings is not ordered
# consistently between interpreter runs.
# Assumes the labels are mutually comparable (e.g. all strings) - TODO confirm.
classes = sorted({row_labels[0] for row_labels in dataset['train']["all_labels"]})

# Map each class name to its position in the one-hot target vector.
class2id = {class_: index for index, class_ in enumerate(classes)}

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [14]:
def preprocess_function(example):
    """Tokenize one record and attach its one-hot target vector.

    The title and content are joined into a single string, tokenized with the
    notebook-level ``tokenizer`` (truncated/padded to 512 tokens), and the
    first entry of ``example["all_labels"]`` is encoded as a one-hot list of
    floats using the notebook-level ``class2id`` mapping.

    Args:
        example: A mapping with ``title``, ``content`` and ``all_labels`` keys.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry holding the
        one-hot target (length ``len(classes)``).

    Raises:
        KeyError: If the first label is not present in ``class2id``.
    """
    document = f"{example['title']}.\n{example['content']}"

    # Build the one-hot target: all zeros except the primary label's slot.
    one_hot = [0.0] * len(classes)
    one_hot[class2id[example["all_labels"][0]]] = 1.0

    encoded = tokenizer(
        document,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    encoded["labels"] = one_hot
    return encoded

In [15]:
# Run the preprocessing over every split, then expose only the model inputs
# and the one-hot targets as torch tensors (other columns stay in the dataset
# but are not returned when indexing).
tokenized_dataset = dataset.map(preprocess_function)
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "token_type_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/2759 [00:00<?, ? examples/s]

{'title': "Sarah Polley's Book Recommendations", 'content': 'Drive Your Plow Over the Bones of The Dead\nby Olga Tokarczuk. I am an incredibly slow reader, but the tone and specificity of the world she creates in this book was something I couldnt leave behind until it was done. Also: All We Sawby Anne Michaels, Fight Nightby Miriam Toews, and The Summer Before the Darkby Doris Lessing.\nId like turned into a Netflix show:\nby Amia Srinivasan. One of the most brain-shattering books Ive ever read. Her thinking is so electrically rigorous and fearless. (I double DARE them to make this into a Netflix show!)\n...I last bought:\n. I rediscovered her poetry lately, and I feel like I dont want to read anything else for a while. She owns desire and submerged things.\n...has the greatest ending:\nby J.D. Salinger. The last page always leaves me breathless. The intimacy and truth of that final page is so arresting and almost painful to read.\nshould be on every college syllabus:\nby Anton Piatigo

NameError: name 'gas' is not defined

In [6]:
# Display the processed splits to check that the tokenizer columns and the
# `labels` column were added alongside the original fields.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2759
    })
    test: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 381
    })
})

In [7]:
# Per-class example counts for the training split: every row of `labels` is a
# one-hot vector, so summing over the row axis counts the rows per class.
n_labels = (
    tokenized_dataset["train"]["labels"]
    .sum(dim=0)
)

In [8]:
def top_n_indices(n_labels, n):
    """Return the indices of the ``n`` largest entries of ``n_labels``.

    Indices are returned in descending order of value. The input tensor is
    left untouched: selection happens on a private copy, so the function can
    be called repeatedly with the same tensor and give the same answer.

    Args:
        n_labels: 1-D tensor of per-class counts. Values are expected to be
            non-negative, because already-selected entries are masked with 0.
        n: Number of indices to return.

    Returns:
        A list of ``n`` Python ints.
    """
    # Work on a copy; zeroing the caller's tensor would corrupt the counts
    # for any later use and make re-running the calling cell non-idempotent.
    remaining = n_labels.clone()
    top_indices = []
    for _ in range(n):
        i = remaining.argmax().item()
        remaining[i] = 0.  # mask the winner so the next argmax finds the runner-up
        top_indices.append(i)
    return top_indices


In [9]:
# Ids of the two most frequent classes. A copy of the counts is passed because
# the selection routine zeroes entries as it picks them; handing it the live
# `n_labels` tensor would destroy the counts and make this cell return
# different classes when re-run.
top_indices = top_n_indices(n_labels.clone(), 2)

In [11]:
# Row positions of training examples whose class is one of the selected ones.
# The per-row class ids are converted to plain Python ints first, so the
# membership test is an ordinary int-in-set lookup rather than a comparison
# of 0-d tensors against a list.
top_index_set = set(top_indices)
train_class_ids = tokenized_dataset['train']['labels'].argmax(dim=1).tolist()
indices = [
    row for row, class_id in enumerate(train_class_ids)
    if class_id in top_index_set
]

In [12]:
# View of the training split limited to the rows selected in `indices`.
subset = Subset(
    dataset=tokenized_dataset["train"],
    indices=indices,
)

In [13]:
# Number of training examples kept in the subset.
len(subset)

829