In [None]:
import os
import sys
sys.path.append(os.getcwd() + '/..')
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

In [None]:
# Keep the tokenizers library from using its own parallelism.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Data locations -------------------------------------------------------
train_data_path = './../../data/sub/ebay_us_nonleaf-20240523-1349-train.csv'
test_data_path = './../../data/sub/ebay_us_nonleaf-20240523-1349-eval.csv'

# --- Model ----------------------------------------------------------------
model_checkpoint = 'YOUR_MODEL_CHECKPOINT'
# Last path component of the checkpoint id; used to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Optimisation hyper-parameters ----------------------------------------
train_batch_size = 16
eval_batch_size = 16
num_train_epochs = 4
lr = 5e-5
lr_schedule='linear'

# --- Reproducibility: seed the NumPy and torch global RNGs -----------------
np.random.seed(114514)
torch.manual_seed(114514)

In [None]:
# Tokenizer for the chosen checkpoint, with its maximum length set to 72.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=72)
# Collator built on that tokenizer with padding enabled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Sequence-classification model loaded from the same checkpoint, then moved
# to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model = model.to(device)

# Evaluation metrics, loaded in the order precision, recall, f1, accuracy.
precision_score, recall_score, f1_score, acc_score = (
    evaluate.load(metric_id)
    for metric_id in ('precision', 'recall', 'f1', 'accuracy')
)

In [None]:
def tokenize(examples,return_tensors=None):
    """Tokenize (Subclass, Superclass) text pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with "Subclass" and "Superclass" entries; they are
            passed to the tokenizer as the first and second sequence.
        return_tensors: Forwarded unchanged to the tokenizer (``None`` by default).

    Returns:
        Whatever the tokenizer call returns.
    """
    # truncation=True makes the tokenizer enforce its configured maximum
    # length; without it over-long pairs are only warned about, not shortened.
    return tokenizer(
        examples["Subclass"],
        examples["Superclass"],
        truncation=True,
        return_tensors=return_tensors,
    )

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy, precision, recall and F1.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        A dict merging the results of the four module-level metric objects
        (accuracy first, then precision, recall, F1), with every value
        rounded to 6 decimal places.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    merged = {}
    # Same evaluation order as the merged result: later metrics would
    # overwrite earlier ones if they ever shared a key.
    for metric in (acc_score, precision_score, recall_score, f1_score):
        merged.update(metric.compute(predictions=predicted, references=labels))

    return {name: round(value, 6) for name, value in merged.items()}

In [None]:
# Read the train / eval splits from CSV.
train_data = pd.read_csv(train_data_path)
eval_data = pd.read_csv(test_data_path)

# Wrap each DataFrame as a Dataset and tokenize it in batches.
train_dataset = Dataset.from_pandas(train_data).map(tokenize, batched=True)
eval_dataset = Dataset.from_pandas(eval_data).map(tokenize, batched=True)

# Expose the model inputs as torch tensors. "token_type_ids" is only produced
# by some tokenizers (BERT-style ones do, e.g. RoBERTa / DistilBERT do not), so
# it is requested only when the tokenized dataset actually has that column;
# the other three columns stay mandatory so a missing one still fails loudly.
for tokenized_split in (train_dataset, eval_dataset):
    tensor_columns = [
        column
        for column in ("input_ids", "attention_mask", "token_type_ids", "labels")
        if column != "token_type_ids" or column in tokenized_split.column_names
    ]
    tokenized_split.set_format(type="torch", columns=tensor_columns)

In [None]:
# Minute-resolution timestamp used to tag this run's output directory.
now = datetime.now()
timestr = now.strftime('%Y%m%d-%H%M')

args = TrainingArguments(
    output_dir=f"YOUR_OUTPUT_DIR/{model_name}-{timestr}",
    # Optimisation settings taken from the configuration cell.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=lr,
    lr_scheduler_type=lr_schedule,
    weight_decay=0.01,
    # Evaluate, log and checkpoint on the same per-epoch cadence.
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
)

# Bundle the model, arguments, datasets, collator and metric callback.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer built above and keep what train() returns.
training_outputs = trainer.train()
# Refresh the timestamp now that training has finished. Neither name is used
# again in this cell — presumably later cells read them; verify before removing.
now = datetime.now()
timestr = now.strftime('%Y%m%d-%H%M')
# Bare expression so the notebook displays the training result.
training_outputs 