In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Subset
import pandas as pd
import torch
import random

In [52]:
n_classes = 2
batch_size = 2
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
unlabel_rate = 0.99

In [57]:
train_df = pd.read_csv("../data/train.csv", index_col=0)
test_df = pd.read_csv("../data/test.csv", index_col=0)

df = pd.concat([train_df, test_df], axis=0)
df = df.dropna(subset=["Title", "Content"])
df = pd.concat([df, pd.get_dummies(df["Label 1"], dtype=int)], axis=1)
df = df.drop(["Target Organization", "Label 2", "Label 3", "Label 4", "Label 5"], axis=1)
df["input"] = df["Title"].str.cat(df["Content"], sep=".\n")

xy_ids = tokenizer.batch_encode_plus(
    list(df["input"]), truncation=True, max_length=512, padding="max_length", return_tensors="pt"
)

In [58]:
labels = torch.tensor(df.drop(["Title", "Content", "Label 1", "input"], axis=1).values)
top_indices = torch.argsort(labels.sum(dim=0), descending=True)[: n_classes]
indices = [i for i, _ in enumerate(labels.argmax(dim=1)) if _ in top_indices]
xy_ids["labels"] = labels

for k in xy_ids.keys():
    xy_ids[k] = xy_ids[k][indices]

if n_classes == 2:
    xy_ids["labels"] = xy_ids["labels"][:, top_indices[0]]
    xy_ids["labels"][xy_ids["labels"] == 0] = -1
else:
    xy_ids["labels"] = xy_ids["labels"][:, top_indices]

train_xy_ids = {k: v[: int(len(xy_ids['labels'])*0.9)] for k, v in xy_ids.items()}
test_xy_ids = {k: v[int(len(xy_ids['labels'])*0.9) :] for k, v in xy_ids.items()}

dataset = {"train": train_xy_ids, "test": test_xy_ids}

unlabel_indices = [False for _ in range(int(len(train_xy_ids['labels']) * 0.01))] + [
    True for _ in range(len(train_xy_ids['labels']) - int(len(train_xy_ids['labels']) * 0.01))
]
random.shuffle(unlabel_indices)
if n_classes == 2:
    dataset["train"]["labels"][unlabel_indices] = torch.zeros([sum(unlabel_indices)], dtype=int)
else:
    dataset["train"]["labels"][unlabel_indices] = torch.zeros([sum(unlabel_indices), n_classes], dtype=int)


In [59]:
dataset['test']

{'input_ids': tensor([[  101,  4824,  1996,  ...,  2065,  2017,   102],
         [  101,  5264,  4311,  ...,     0,     0,     0],
         [  101,  6492,  3444,  ...,     0,     0,     0],
         ...,
         [  101, 10927,  3367,  ...,     0,     0,     0],
         [  101, 20248, 12618,  ...,     0,     0,     0],
         [  101,  2006,  1996,  ...,  3460,  1012,   102]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': tensor([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
         -1,  1,  1,  1,  1,  1,  1,  

In [61]:
p_ratio = dataset['train']['labels'].sum(dim=0) / len(dataset['train']['labels'])

In [63]:
p_ratio.item()

0.006818181835114956

In [47]:
unlabel_indices[-10:]

[True, True, True, True, True, True, True, True, True, True]