In [1]:
import pandas as pd
from datasets import Dataset
import warnings
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import os
import shutil
# Set WANDB_DISABLED in the process environment before any training starts.
# Presumably this keeps the Hugging Face Trainer from attempting Weights &
# Biases logging -- confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above, so API drift will not be visible in the cell output.
warnings.filterwarnings("ignore")


In [None]:
# Checkpoint identifier handed to `from_pretrained` for both the tokenizer and
# the classification model inside `flow`.
model_name = "kiddothe2b/hierarchical-transformer-base-4096"


In [None]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation and no padding.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=...,
            padding=...)``.
        examples: Mapping that provides the raw texts under the ``"text"`` key.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    # Padding is switched off here; `flow` pads per batch through its collator.
    encode_options = {"truncation": True, "padding": False}
    return tokenizer(texts, **encode_options)


In [None]:
def flow(df, name, epoch=5, num_labels=7):
    """Fine-tune ``model_name`` as a sequence classifier on ``df`` and save it.

    The frame is converted to a ``datasets.Dataset``, tokenized with
    ``preprocess_function``, split 80/20 (``seed=42``) into train and
    evaluation partitions, and trained with ``Trainer``. Evaluation and
    checkpointing both happen once per epoch, and
    ``load_best_model_at_end=True`` is set, so the model written at the end is
    the checkpoint the Trainer selected as best.

    Args:
        df: pandas DataFrame to train on. ``preprocess_function`` reads its
            ``"text"`` column. Presumably it also carries an integer
            ``"label"`` column consumed by the Trainer -- confirm against the
            calling cell.
        name: Run identifier; checkpoints go to ``./results/{name}`` and the
            final model to ``./model/{name}``.
        epoch: Number of training epochs.
        num_labels: Number of classes for the classification head. Defaults to
            7, the value that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        None. The function is used for its side effects (files on disk).
    """
    data = Dataset.from_pandas(df)
    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to be executed; only use it with checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, trust_remote_code=True)

    tokenized_data = data.map(
        lambda batch: preprocess_function(tokenizer, batch), batched=True)
    # Fixed seed keeps the train/eval partition identical across runs.
    tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=42)
    # Tokenization leaves sequences unpadded; the collator pads each batch.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=64,
        num_train_epochs=epoch,
        weight_decay=0.01,
        push_to_hub=False,
        evaluation_strategy='epoch',
        save_strategy="epoch",
        load_best_model_at_end=True
    )

    def compute_metrics(pred):
        # Predicted class = index of the largest score along the last axis.
        return {
            'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model(f'./model/{name}')


In [None]:
# Read the training CSV from the Kaggle input mount into `df`.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/zero-shot-hierarchical/hierarchical_train.csv",
)


In [None]:
# Keep only the text and the "label.1" target, expose the target under the
# name "label", and discard every row that has a missing value in either column.
df = (
    df[["text", "label.1"]]
    .rename(columns={"label.1": "label"})
    .dropna()
)


In [None]:
# Every entry of this directory names one training run; the entry name is
# passed to `flow` as `name` and therefore selects its output directories.
input_dir = "/kaggle/input/hierarchical"
# `os.listdir` is the real function (`os.list_dir` raises AttributeError).
# Its result order is arbitrary, so sort to make the run order reproducible.
data_types = sorted(os.listdir(input_dir))

print(f"Different data count: {len(data_types)}")
for data_type in data_types:
    print(data_type)
    # NOTE(review): the same `df` is trained on for every entry; only the
    # output name changes. Confirm that per-entry data loading is not intended.
    flow(df, data_type, 10)
