In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import Subset
import pandas as pd
import torch
import random

In [31]:
def data_preprocess(train_df, test_df, tokenizer, n_classes, unlabel_rate, seed=None):
    """Tokenize the articles and build a partially-labelled train/test split.

    The two frames are concatenated, rows missing ``Title`` or ``Content`` are
    dropped, and ``Label 1`` is one-hot encoded. Only rows labelled with one of
    the ``n_classes`` most frequent classes are kept. The first 90% of the kept
    rows (in their existing order, no shuffling) form the train split and the
    remainder the test split. A random ``unlabel_rate`` fraction of the train
    labels is then overwritten with zeros to mark those rows as unlabelled.

    Args:
        train_df: DataFrame containing the columns ``Title``, ``Content``,
            ``Target Organization`` and ``Label 1`` .. ``Label 5``.
        test_df: DataFrame with the same columns as ``train_df``.
        tokenizer: Object exposing ``batch_encode_plus``; it is expected to
            return a mapping of name -> tensor with one row per input text.
        n_classes: Number of most frequent ``Label 1`` classes to keep.
        unlabel_rate: Fraction in ``[0, 1]`` of train rows whose labels are
            replaced by zeros.
        seed: Optional seed for choosing the unlabelled rows. ``None`` (the
            default) draws from the global ``random`` state.

    Returns:
        A ``(dataset, p_ratio)`` tuple. ``dataset`` is
        ``{"train": {...}, "test": {...}}``, each holding the tokenizer
        outputs plus a ``"labels"`` tensor with one column per kept class
        (ordered from most to least frequent). When ``n_classes == 2`` the
        labelled rows use ``1`` / ``-1``; otherwise they use ``1`` / ``0``.
        Unlabelled train rows are all zeros. ``p_ratio`` is the per-class
        fraction of train rows whose label is still ``1`` after masking.

    Raises:
        ValueError: If ``unlabel_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= unlabel_rate <= 1:
        raise ValueError(f"unlabel_rate must be in [0, 1], got {unlabel_rate!r}")

    df = pd.concat([train_df, test_df], axis=0)
    df = df.dropna(subset=["Title", "Content"])
    df = pd.concat([df, pd.get_dummies(df["Label 1"], dtype=int)], axis=1)
    df = df.drop(["Target Organization", "Label 2", "Label 3", "Label 4", "Label 5"], axis=1)
    df["input"] = df["Title"].str.cat(df["Content"], sep=".\n")

    encoded = tokenizer.batch_encode_plus(
        list(df["input"]), truncation=True, max_length=512, padding="max_length", return_tensors="pt"
    )

    # Every column left after these drops is treated as a one-hot label column.
    labels = torch.tensor(df.drop(["Title", "Content", "Label 1", "input"], axis=1).values)
    top_indices = torch.argsort(labels.sum(dim=0), descending=True)[:n_classes]
    top_labels = labels[:, top_indices]

    # Keep a row only if it is positive for one of the top classes. Filtering
    # on argmax instead would treat an all-zero row (missing "Label 1") as
    # class 0, because argmax of an all-zero row is 0.
    keep = top_labels.sum(dim=1) > 0

    features = {name: values[keep] for name, values in encoded.items()}
    features["labels"] = top_labels[keep]

    if n_classes == 2:
        # Binary setting: negatives become -1 so that 0 can mean "unlabelled".
        features["labels"][features["labels"] == 0] = -1

    # Ordered 90/10 split; rows are not shuffled beforehand.
    split_at = int(len(features["labels"]) * 0.9)
    train_split = {name: values[:split_at] for name, values in features.items()}
    test_split = {name: values[split_at:] for name, values in features.items()}
    dataset = {"train": train_split, "test": test_split}

    n_train = len(train_split["labels"])
    n_unlabelled = int(n_train * unlabel_rate)
    unlabel_mask = [False] * (n_train - n_unlabelled) + [True] * n_unlabelled
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(unlabel_mask)

    # Scalar assignment zeroes whole rows regardless of how many label
    # columns survived the top-class selection.
    train_split["labels"][torch.tensor(unlabel_mask, dtype=torch.bool)] = 0

    p_ratio = (train_split["labels"] == 1).sum(dim=0) / len(train_split["labels"])

    return dataset, p_ratio

In [34]:
# Experiment settings, named so they can be tuned in one place.
N_CLASSES = 2  # number of most frequent "Label 1" classes to keep
UNLABEL_RATE = 0.99  # fraction of training rows whose labels are hidden
SEED = 42  # fixes which training rows get unlabelled

train_df = pd.read_csv("../data/train.csv", index_col=0)
test_df = pd.read_csv("../data/test.csv", index_col=0)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# data_preprocess chooses the unlabelled rows via the global `random` state,
# so seed it here to make the split (and p_ratio) reproducible on re-run.
random.seed(SEED)
dataset, p_ratio = data_preprocess(train_df, test_df, tokenizer, N_CLASSES, UNLABEL_RATE)

In [35]:
# Per-class fraction of training rows whose label equals 1, as returned by
# data_preprocess (computed after the unlabelled rows have been zeroed).
p_ratio

tensor([0.0057, 0.0045])

In [5]:
import os
import pandas as pd

In [32]:
# NOTE(review): '../resul' looks like a truncated '../results.csv' (the path
# checked in the next cell) — confirm whether this probe is still needed.
os.path.isfile('../resul')

False

In [33]:
# Resume from the saved results table when one exists; otherwise start empty.
results_path = "../results.csv"
if not os.path.isfile(results_path):
    results = pd.DataFrame()
else:
    print("File exists")
    results = pd.read_csv(results_path)

File exists


In [34]:
# One experiment record; the key order determines the column order.
result = {
    "n_classes": 3,
    "use_multi_loss": True,
    "unlabel_rate": 0.99,
    "epoch": 4,
    "accuracy": 0.323472893,
}

# Append the record as a new row and renumber the index from 0.
new_row = pd.DataFrame([result])
results = pd.concat([results, new_row], ignore_index=True)

In [35]:
# Display the accumulated results table, including the row appended above.
results

Unnamed: 0,n_classes,use_multi_loss,unlabel_rate,epoch,accuracy
0,3,True,0.99,4,0.323473
1,3,True,0.99,4,0.323473
