In [1]:
import pandas as pd
from datasets import Dataset
import warnings
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import os
import shutil
# Set before any Trainer is built so the flag is already in the environment.
# NOTE(review): presumably this turns off the Weights & Biases integration — confirm.
os.environ["WANDB_DISABLED"] = "true"
# Silences every warning category for the rest of the session, including
# deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")


In [None]:
# Checkpoint identifier handed to both AutoTokenizer.from_pretrained and
# AutoModelForSequenceClassification.from_pretrained inside flow().
model_name = "bert-base-uncased"


In [None]:
# Class index -> class name. The position in the tuple is the class index.
id2label = dict(enumerate((
    "evaluation",
    "introduction",
    "pretrained_model",
    "requirements",
    "results",
    "training",
)))

# Class name -> class index, derived from id2label so the two mappings
# always stay exact inverses of each other.
label2id = {label: index for index, label in id2label.items()}


In [None]:
def preprocess_function(tokenizer, examples):
    """Tokenize the ``"text"`` entry of ``examples`` with truncation enabled.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping that holds the texts under the ``"text"`` key.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


In [None]:
def _load_labelled_frame(data_dir):
    """Read one CSV and return a frame holding only ``text`` and ``label``.

    ``text`` is the newline-joined, non-empty values of ``parent_header``
    (when that column exists), ``header`` and ``content``. ``label`` is the
    integer id looked up in the module-level ``label2id``.

    Raises:
        KeyError: if ``header``, ``content`` or ``label`` is missing, or if a
            label value is not a key of ``label2id``.
    """
    df = pd.read_csv(data_dir, index_col=0)

    # Only ``parent_header`` is optional; the other two must be present.
    text_columns = ["header", "content"]
    if "parent_header" in df.columns:
        text_columns.insert(0, "parent_header")

    # Fill NaN in *every* text column (not just parent_header) and coerce to
    # str: a NaN is truthy, so it would otherwise survive the emptiness
    # filter and make str.join raise TypeError. Assigning a new frame also
    # avoids the chained ``inplace=True`` fillna, which does not reliably
    # write back to the parent DataFrame.
    parts = df[text_columns].fillna("").astype(str)
    texts = [
        "\n".join(piece for piece in row if piece)
        for row in parts.itertuples(index=False, name=None)
    ]

    # Same exception type as a raw dict lookup, but naming the offending
    # labels and file up front.
    unknown = set(df["label"]) - set(label2id)
    if unknown:
        raise KeyError(
            f"Unknown label(s) in {data_dir}: {sorted(map(str, unknown))}")
    labels = [label2id[value] for value in df["label"]]

    # A fresh frame gets a clean RangeIndex, so no index column is carried
    # into the Dataset.
    return pd.DataFrame({"text": texts, "label": labels})


def flow(data_dir, name, seed=42):
    """Fine-tune ``model_name`` on one labelled CSV and save the result.

    Args:
        data_dir: Path of the CSV file to read (first column is the index).
        name: Run name; checkpoints go to ``./results/{name}`` and the final
            model to ``./model/{name}``.
        seed: Seed for the 80/20 train/test split and for the Trainer, so a
            rerun evaluates on the same held-out rows.

    Raises:
        KeyError: propagated from the CSV preparation step (missing column or
            unknown label).
    """
    data = Dataset.from_pandas(_load_labelled_frame(data_dir))

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        # Derived from the mapping so the head size cannot drift from it.
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

    tokenized_data = data.map(
        lambda batch: preprocess_function(tokenizer, batch), batched=True)
    tokenized_data = tokenized_data.train_test_split(test_size=0.2, seed=seed)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=False,
        evaluation_strategy='epoch',
        save_strategy="epoch",
        load_best_model_at_end=True,
        seed=seed,
    )

    def _compute_accuracy(pred):
        # Predicted class = index of the largest value along the last axis.
        return {
            'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=_compute_accuracy,
    )

    trainer.train()
    trainer.save_model(f'./model/{name}')


In [None]:
input_dir = "/kaggle/input/classification"
# Sorted so the datasets are processed in the same order on every run;
# os.listdir makes no ordering guarantee.
data_types = sorted(os.listdir(input_dir))

print(f"Different data count: {len(data_types)}")
for data_type in data_types:
    # Join against input_dir: the previous code passed the builtin `dir`
    # function here, which makes os.path.join raise TypeError.
    flow(os.path.join(input_dir, data_type), data_type)
