In [1]:
import os
import sys
sys.path.append(os.getcwd() + '/..')
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from ellement.transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

In [2]:
# --- Data and model locations ------------------------------------------------
train_data_path = './../../data/sub/ebay_us_nonleaf-20240523-1349-train.csv'
test_data_path = './../../data/sub/ebay_us_nonleaf-20240523-1349-eval.csv'
model_checkpoint = 'mms://core-ai-nlp/eBERT2-base-cased'
# Short model name: everything after the last "/" of the checkpoint URI.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Training hyper-parameters -----------------------------------------------
train_batch_size = eval_batch_size = 16
num_train_epochs = 4
lr = 5e-5
lr_schedule = 'linear'

# --- Reproducibility ---------------------------------------------------------
# Seed NumPy's and torch's global random number generators with the same value.
np.random.seed(114514)
torch.manual_seed(114514)

# Environment flag consumed by the tokenizers library; set to 'false' here.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
# Load the checkpoint with a sequence-classification head, then move it to the
# device chosen in the configuration cell.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model = model.to(device)

# Tokenizer configured with model_max_length=72; the collator pads each batch.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, model_max_length=72)
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

# Metric objects used by compute_metrics, loaded in the order
# precision -> recall -> f1 -> accuracy.
precision_score, recall_score, f1_score, acc_score = (
    evaluate.load(metric_id)
    for metric_id in ('precision', 'recall', 'f1', 'accuracy')
)

Some weights of MegatronBertForSequenceClassification were not initialized from the model checkpoint at /data/ebay-slc-a100/data/jingcshi/.cache/ellement/core-ai-nlp/eBERT2-base-cased/1 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def tokenize(examples, return_tensors=None):
    """Tokenize (Subclass, Superclass) text pairs with the notebook's tokenizer.

    Args:
        examples: Mapping with "Subclass" and "Superclass" entries. Each entry
            is passed straight to the tokenizer, so it may be a single string
            or (when called through a batched ``Dataset.map``) a list of strings.
        return_tensors: Forwarded unchanged to the tokenizer. ``None`` (the
            default) leaves the tokenizer's default output type; ``'pt'`` is
            used elsewhere in this notebook to get torch tensors.

    Returns:
        Whatever the tokenizer returns for the pair, with "Subclass" as the
        first sequence and "Superclass" as the second.
    """
    # The tokenizer is created with model_max_length=72, but that limit is only
    # applied when truncation is requested. Without it, over-long pairs would
    # be encoded at full length instead of being cut to the configured maximum.
    return tokenizer(
        examples["Subclass"],
        examples["Superclass"],
        truncation=True,
        return_tensors=return_tensors,
    )

In [5]:
def compute_metrics(eval_pred):
    """Score model predictions with accuracy, precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        dict: The results of the four metrics merged into one mapping (in the
        order accuracy, precision, recall, F1), each value rounded to six
        decimal places.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Merge the per-metric result dicts one after another; a later metric
    # would overwrite an earlier one on a key clash, as with dict unpacking.
    merged = {}
    for metric in (acc_score, precision_score, recall_score, f1_score):
        merged.update(
            metric.compute(predictions=predicted_classes, references=labels)
        )

    rounded = {}
    for key, value in merged.items():
        rounded[key] = round(value, 6)
    return rounded

In [6]:
def _as_torch_dataset(frame):
    """Turn a DataFrame into a tokenized Dataset formatted as torch tensors."""
    dataset = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # set_format works in place: select the model inputs plus the "labels"
    # column and return them as torch tensors.
    dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
    )
    return dataset


# Keep the raw CSV contents around as DataFrames.
train_data = pd.read_csv(train_data_path)
eval_data = pd.read_csv(test_data_path)

# Tokenize train first, then eval (same order as the progress bars below).
train_dataset = _as_torch_dataset(train_data)
eval_dataset = _as_torch_dataset(eval_data)

Map:   0%|          | 0/12384 [00:00<?, ? examples/s]

Map:   0%|          | 0/1377 [00:00<?, ? examples/s]

In [7]:
# Timestamp of when this cell ran. It is not interpolated into output_dir,
# which is a fixed path.
now = datetime.now()
timestr = now.strftime('%Y%m%d-%H%M')

# NOTE(review): no seed is passed to TrainingArguments, so the trainer
# presumably uses its own default rather than the 114514 set in the config
# cell -- TODO confirm.
args = TrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir="/data/ebay-slc-a100/data/jingcshi/ICON_models/sub/ebert2-nonleaf-tuned",
    save_total_limit=5,
    # Optimisation settings taken from the configuration cell.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=lr,
    lr_scheduler_type=lr_schedule,
    weight_decay=0.01,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy='epoch',
)

trainer = Trainer(
    model,
    args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [8]:
# Fine-tune the model; the per-epoch metrics table is rendered while this runs.
training_outputs = trainer.train()

# Record when training finished.
now = datetime.now()
timestr = f"{now:%Y%m%d-%H%M}"

# Bare last expression so the notebook displays the TrainOutput summary.
training_outputs

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4008,0.206351,0.933188,0.869215,0.941176,0.903766
2,0.1762,0.192239,0.94626,0.881188,0.969499,0.923237
3,0.1095,0.191514,0.957879,0.913402,0.965142,0.938559
4,0.062,0.187536,0.959332,0.910387,0.973856,0.941053


TrainOutput(global_step=3096, training_loss=0.18711510493157754, metrics={'train_runtime': 123.2375, 'train_samples_per_second': 401.956, 'train_steps_per_second': 25.122, 'total_flos': 520622778573120.0, 'train_loss': 0.18711510493157754, 'epoch': 4.0})

In [10]:
# Sanity check: score one hand-written (Subclass, Superclass) pair with the
# fine-tuned model.
example = {'Subclass':'Men\'s Vintage T-Shirts', 'Superclass':'Men\'s T-Shirts'}
inputs = tokenize(example, return_tensors='pt').to(device)

# Inference only: put the model in eval mode so the result does not depend on
# the mode training left it in, and skip autograd bookkeeping for the forward
# pass.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the class axis (last dim) so each row is normalised on its own;
# squeeze then drops the singleton batch axis for this single example.
predictions = torch.softmax(logits.cpu(), dim=-1).squeeze().numpy()
predictions

array([0.00558775, 0.99441224], dtype=float32)