In [8]:
import os
import sys
sys.path.append(os.getcwd() + '/..')
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

In [9]:
# --- Data and checkpoint locations ------------------------------------------
train_data_path = './../../data/sub/train.csv'
test_data_path = './../../data/sub/test.csv'
model_checkpoint = "/data2T/jingchuan/untuned/ebert/base/"
# The checkpoint path ends with "/", so split("/")[-1] would be an empty
# string. Normalise the path first so the name is its last directory ("base").
model_name = os.path.basename(os.path.normpath(model_checkpoint))
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Hyper-parameters -------------------------------------------------------
train_batch_size = 64
eval_batch_size = 64
num_train_epochs = 3
lr = 1.5e-5
lr_schedule = 'linear'

# --- Reproducibility --------------------------------------------------------
# One seed value shared by the NumPy and PyTorch global generators.
seed = 114514
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f6c4dec7990>

In [10]:
# Load the sequence-classification model from the checkpoint and move it to
# the selected device.
model = BertForSequenceClassification.from_pretrained(model_checkpoint)
model = model.to(device)

# Tokenizer from the same checkpoint; the collator pads each batch.
# NOTE(review): model_max_length is only enforced when the tokenizer is called
# with truncation enabled - confirm that callers do so.
tokenizer = BertTokenizer.from_pretrained(model_checkpoint, model_max_length=72)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

# Metric objects used by compute_metrics, loaded in the same order as before.
precision_score, recall_score, f1_score, acc_score = (
    evaluate.load(metric_id)
    for metric_id in ('precision', 'recall', 'f1', 'accuracy')
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /data2T/jingchuan/untuned/ebert/base/ and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'EBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [11]:
def tokenize(examples, return_tensors=None):
    """Encode (Subclass, Superclass) text pairs with the notebook's tokenizer.

    Args:
        examples: Mapping with "Subclass" and "Superclass" entries. Each entry
            is passed straight to the tokenizer as the first and second
            sequence, so it may be a single string or a batch of strings.
        return_tensors: Forwarded to the tokenizer (e.g. 'pt' for PyTorch
            tensors); None leaves the tokenizer's default output type.

    Returns:
        Whatever the tokenizer returns for the encoded pair(s).
    """
    # Truncation must be requested explicitly. Without it the tokenizer's
    # configured maximum length is not enforced and over-long pairs pass
    # through at full length.
    return tokenizer(
        examples["Subclass"],
        examples["Superclass"],
        truncation=True,
        return_tensors=return_tensors,
    )

In [12]:
def compute_metrics(eval_pred):
    """Score predictions with the accuracy, precision, recall and F1 metrics.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class is the argmax
            of the logits over the last axis.

    Returns:
        A dict merging the result dicts of the four metric objects (in the
        order accuracy, precision, recall, F1), each value rounded to six
        decimal places.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    merged = {}
    for metric in (acc_score, precision_score, recall_score, f1_score):
        merged.update(metric.compute(predictions=predicted, references=labels))

    return {name: round(value, 6) for name, value in merged.items()}

In [13]:
# Columns exposed to the model as torch tensors. "labels" is not produced by
# tokenize, so it is presumably a column of the CSV files - verify.
tensor_columns = ["input_ids", "attention_mask", "token_type_ids", "labels"]

# Training split: CSV -> pandas -> Dataset -> tokenized pairs.
train_data = pd.read_csv(train_data_path)
train_dataset = Dataset.from_pandas(train_data).map(tokenize, batched=True)
train_dataset.set_format(type="torch", columns=tensor_columns)

# Evaluation split, processed the same way from the test CSV.
eval_data = pd.read_csv(test_data_path)
eval_dataset = Dataset.from_pandas(eval_data).map(tokenize, batched=True)
eval_dataset.set_format(type="torch", columns=tensor_columns)

Map:   0%|          | 0/242229 [00:00<?, ? examples/s]

Map:   0%|          | 0/12750 [00:00<?, ? examples/s]

In [14]:
# Minute-resolution timestamp that names this run's checkpoint directory.
now = datetime.now()
timestr = now.strftime('%Y%m%d-%H%M')

# NOTE(review): TrainingArguments has its own `seed` argument, which is not
# set here - confirm whether it should match the seed used in the
# configuration cell.
args = TrainingArguments(
    # Checkpoints: written once per epoch, keeping at most five.
    output_dir=f"/data2T/jingchuan/tuned/sub/{timestr}",
    save_strategy='epoch',
    save_total_limit=5,
    # Optimisation settings taken from the configuration cell.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    learning_rate=lr,
    lr_scheduler_type=lr_schedule,
    weight_decay=0.01,
    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    logging_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [15]:
# Run fine-tuning; the return value is kept so it can be displayed below.
training_outputs = trainer.train()
# Refresh the timestamp once training has finished.
# NOTE(review): `now` / `timestr` are not read again in the visible cells -
# confirm whether this is still needed.
now = datetime.now()
timestr = now.strftime('%Y%m%d-%H%M')
# Last expression of the cell: display the training output.
training_outputs



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.105,0.047041,0.983216,0.973265,0.976471,0.974865
2,0.0416,0.049538,0.986039,0.967616,0.991294,0.979312
3,0.0259,0.042683,0.987922,0.973198,0.991059,0.982047




TrainOutput(global_step=5679, training_loss=0.05752038527467045, metrics={'train_runtime': 7737.9876, 'train_samples_per_second': 93.912, 'train_steps_per_second': 0.734, 'total_flos': 1.042452990223728e+16, 'train_loss': 0.05752038527467045, 'epoch': 3.0})

In [31]:
# Sanity check on one hand-written (Subclass, Superclass) pair.
example = {'Subclass': 'Vintage Clothing', 'Superclass': "Women's Clothing, Shoes & Accessories"}
inputs = tokenize(example, return_tensors='pt').to(device)

# Put the model in inference mode (disables dropout) and skip autograd
# bookkeeping, so the forward pass is deterministic and builds no graph.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Softmax over the class dimension, then drop the batch dimension of size 1 to
# get one probability per class.
predictions = torch.softmax(logits, dim=-1).squeeze(0).cpu()
predictions

tensor([4.7969e-04, 9.9952e-01])

In [27]:
# Save the model and the tokenizer side by side in a single directory.
sota_dir = '/data2T/jingchuan/tuned/sub/bertsubs-sota'
model.save_pretrained(sota_dir)
tokenizer.save_pretrained(sota_dir)

('/data2T/jingchuan/tuned/sub/bertsubs-sota/tokenizer_config.json',
 '/data2T/jingchuan/tuned/sub/bertsubs-sota/special_tokens_map.json',
 '/data2T/jingchuan/tuned/sub/bertsubs-sota/vocab.txt',
 '/data2T/jingchuan/tuned/sub/bertsubs-sota/added_tokens.json')

In [26]:
# Shell escape: print the kernel's working directory. The data paths in the
# configuration cell are relative, so they resolve against this directory.
! pwd

/home/jingchuan/Taxonomy_Completion-main/experiments/training
