In [None]:
# Install the Python packages and system tools this notebook relies on (Colab-style shell commands).
# NOTE(review): none of the packages are version-pinned, so a later re-run may resolve
# different releases than the ones shown in the captured output of this cell.
!pip install transformers
# aptitude is installed first because the next line uses it to install MeCab and its build tools.
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3
!pip install datasets
!pip install fugashi unidic-lite
!pip install onnxruntime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 14.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 60.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 82.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remo

In [None]:
!git clone https://kami9811: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@github.com/kami9811/multi_emotion_datasets.git

%cd multi_emotion_datasets

Cloning into 'multi_emotion_datasets'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 12 (delta 4), reused 12 (delta 4), pack-reused 0[K
Unpacking objects: 100% (12/12), done.
/content/multi_emotion_datasets


In [None]:
! mkdir -p results
! mkdir -p logs

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Pick the pickled dataset variant to load; the other file names are kept for quick switching.
# picklename = 'wrime_zero-one.pickle'
picklename = 'wrime_zero-one_avg.pickle'
# picklename = 'wrime_zero-one_avg_all.pickle'

# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
raw_records = pd.read_pickle(picklename)
df = pd.DataFrame.from_dict(raw_records)

# Spread the per-row `label` list over one column per emotion.
LABEL_COLUMNS = ['joy','sadness', 'anticipation', 'surprise', 'anger', 'fear', 'disgust', 'trust']
label_rows = df["label"].tolist()
df[LABEL_COLUMNS] = pd.DataFrame(label_rows, index=df.index)

# Split the dataset: 80% for training, the remaining 20% halved into validation and test.
train_df, val_test_df = train_test_split(df, train_size=0.8, random_state=32)
val_df, test_df = train_test_split(val_test_df, train_size=0.5, random_state=32)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Report the size of every split.
for split_title, split_frame in (
    ("FULL", df),
    ("TRAIN", train_df),
    ("TEST", test_df),
    ("VALIDATION", val_df),
):
    print(f"{split_title} Dataset: {len(split_frame)}")

# Only the raw sentence and its label vector are carried over into the Hugging Face datasets.
columns = ["sentence", "label"]
dataset = DatasetDict({
    split_key: Dataset.from_pandas(frame[columns])
    for split_key, frame in (
        ("train", train_df),
        ("test", test_df),
        ("validation", val_df),
    )
})

FULL Dataset: 43200
TRAIN Dataset: 34560
TEST Dataset: 4320
VALIDATION Dataset: 4320


In [None]:
import transformers
import torch

# Hyperparameters shared by the tokenization, training and inference cells.
# MAX_TOKEN_COUNT = 128 # maximum number of tokens
MAX_TOKEN_COUNT = 256 # maximum number of tokens
TRAIN_BATCH_SIZE = 32 # batch size during training
VALID_BATCH_SIZE = 32 # batch size during validation
EPOCHS = 5
LEARNING_RATE = 1e-05

# Identifier of the pretrained checkpoint that the tokenizer and the model are loaded from.
MODEL_CHECKPOINT = "cl-tohoku/bert-base-japanese-v2"

In [None]:
# Load the tokenizer that belongs to MODEL_CHECKPOINT.
# NOTE(review): use_fast=True is requested here, but the tokenizer repr printed in the ONNX
# conversion output of this notebook reports is_fast=False, so a slow tokenizer appears to be
# what is actually loaded for this checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``sentence`` entry of ``examples`` for the model.

    The text is truncated and padded so that every encoded sequence has exactly
    ``MAX_TOKEN_COUNT`` tokens. Returns whatever the module-level ``tokenizer``
    produces for that input.
    """
    sentences = examples['sentence']
    encoded = tokenizer(
        sentences,
        truncation=True,
        max_length=MAX_TOKEN_COUNT,
        padding="max_length",
    )
    return encoded

In [None]:
# Tokenize every split of the dataset with preprocess_function.
# batch_size here is the number of rows handed to preprocess_function per call; it reuses
# TRAIN_BATCH_SIZE as its value but is a separate setting from the Trainer's batch size.
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=TRAIN_BATCH_SIZE)

  0%|          | 0/1080 [00:00<?, ?ba/s]

  0%|          | 0/135 [00:00<?, ?ba/s]

  0%|          | 0/135 [00:00<?, ?ba/s]

In [None]:
# Size the classification head from the label list instead of repeating the literal 8,
# so the head cannot silently drift out of sync with LABEL_COLUMNS.
num_labels = len(LABEL_COLUMNS)

# Registering the label names in the config means the saved model (and anything exported
# from it) reports emotion names rather than the generic LABEL_0 ... LABEL_7 placeholders.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=dict(enumerate(LABEL_COLUMNS)),
    label2id={label_name: label_index for label_index, label_name in enumerate(LABEL_COLUMNS)},
)

Downloading:   0%|          | 0.00/447M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

In [None]:
from transformers import TrainingArguments
# Training configuration that is handed to the Trainer created further down.
training_args = TrainingArguments(
    output_dir = "./results",                        # directory where checkpoints are saved
    learning_rate=LEARNING_RATE,                     # learning rate
    per_device_train_batch_size=TRAIN_BATCH_SIZE,    # batch size during training
    per_device_eval_batch_size=VALID_BATCH_SIZE,     # batch size during evaluation
    num_train_epochs=EPOCHS, # 1,                    # number of epochs
    weight_decay=0.01,                               # weight decay (a setting that curbs overfitting)
    load_best_model_at_end=True,                     # load the best model once training has finished
    metric_for_best_model="accuracy",                # metric used to pick the best model (the `accuracy` key of compute_metrics)
    evaluation_strategy = "steps",                   # how often to evaluate; "epoch" would evaluate after every epoch
    logging_steps=500,                               # log every 500 optimization steps
    logging_dir='./logs'
)

In [None]:
import numpy as np
from sklearn.metrics import (
    label_ranking_average_precision_score, 
    roc_auc_score, 
    accuracy_score,
    average_precision_score,
    f1_score
)

def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Args:
        x: A scalar or array-like of real numbers (logits).

    Returns:
        A NumPy float for scalar input, otherwise a float64 array with the same
        shape as ``x`` holding the element-wise sigmoid.

    The naive formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever applied
    to non-positive numbers, so it cannot overflow.
    """
    values = np.asarray(x, dtype=np.float64)
    decay = np.exp(-np.abs(values))  # always within (0, 1]
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function, written overflow-free.
    result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Hand back a plain scalar when a scalar was passed in.
    return result[()] if result.ndim == 0 else result


# `sigmoid` is already element-wise over arrays, so the per-element Python loop of
# np.vectorize is unnecessary; the name is kept as an alias for existing callers.
sigmoid_v = sigmoid

In [None]:
def compute_metrics(eval_pred):
    """Compute the multi-label evaluation metrics reported during training.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``labels`` is
            indexed along ``axis=1``, so it has to be at least two-dimensional.

    Returns:
        A dict with the keys ``lrap_score``, ``roc_score``, ``full_accuracy``
        (exact match of the whole label row), ``accuracy`` (mean per-row share
        of matching labels), ``f1_score`` and ``average_precision``.
    """
    logits, labels = eval_pred
    predictions = sigmoid_v(logits)  # map the logits into [0, 1] with the sigmoid function
    predicted_mask = predictions > 0.5  # threshold the scores into hard label decisions
    # Share of labels that match the target, computed per row.
    per_row_hits = np.sum(predicted_mask == labels, axis=1) / np.shape(labels)[1]
    return {
        "lrap_score": label_ranking_average_precision_score(labels, predictions),
        "roc_score": roc_auc_score(labels, predictions),
        "full_accuracy": accuracy_score(labels, predicted_mask),
        "accuracy": np.average(per_row_hits),
        "f1_score": f1_score(labels, predicted_mask, average='samples'),
        "average_precision": average_precision_score(labels, predictions),
    }

In [None]:
class MultilabelTrainer(transformers.Trainer):
    """Trainer variant that scores the logits with ``BCEWithLogitsLoss``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the binary cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping; its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases, which pass this keyword to
                ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Read the label count from the trainer's own model reference.
        label_count = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss expects floating-point targets of the same shape as the logits.
        loss = loss_fct(logits.view(-1, label_count),
                        labels.float().view(-1, label_count))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Early-stopping callback with a patience of 5; the monitored metric is the one configured
# in training_args (metric_for_best_model).
early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience=5)

# Wire the model, data splits, tokenizer and metric function into the multi-label trainer.
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    callbacks=[early_stopping],
)

In [None]:
# Train
# Run fine-tuning with the trainer configured above; returns a TrainOutput summary.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 34560
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 5400


Step,Training Loss,Validation Loss,Lrap Score,Roc Score,Full Accuracy,Accuracy,F1 Score,Average Precision
500,0.2186,0.174922,0.893929,0.789503,0.614815,0.942159,0.080787,0.277189
1000,0.1644,0.157989,0.919373,0.834824,0.634028,0.945573,0.101829,0.37919
1500,0.1462,0.148771,0.929568,0.859929,0.641204,0.946817,0.129205,0.421865
2000,0.1385,0.144611,0.933915,0.873237,0.648611,0.948351,0.177953,0.441249
2500,0.1277,0.144262,0.933803,0.87553,0.650463,0.948553,0.167485,0.445858
3000,0.1224,0.142866,0.936465,0.880052,0.648148,0.948409,0.177431,0.453373
3500,0.1148,0.143502,0.937019,0.880821,0.646528,0.947569,0.192076,0.455835
4000,0.1102,0.143963,0.937109,0.883156,0.645602,0.948061,0.204035,0.459441
4500,0.1049,0.14341,0.938226,0.883866,0.653009,0.949219,0.186829,0.459531
5000,0.1013,0.143672,0.938788,0.884787,0.649074,0.948438,0.195856,0.46204


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4320
  Batch size = 32
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this 

TrainOutput(global_step=5400, training_loss=0.1324028672112359, metrics={'train_runtime': 8054.4364, 'train_samples_per_second': 21.454, 'train_steps_per_second': 0.67, 'total_flos': 2.27340198346752e+16, 'train_loss': 0.1324028672112359, 'epoch': 5.0})

In [None]:
# Test
# Evaluate on the "test" split, which is not passed to the trainer as train or eval data.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence. If sentence are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4320
  Batch size = 32


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'eval_loss': 0.145767942070961,
 'eval_lrap_score': 0.9332762437536756,
 'eval_roc_score': 0.8805275808755313,
 'eval_full_accuracy': 0.6405092592592593,
 'eval_accuracy': 0.9471354166666667,
 'eval_f1_score': 0.17171296296296296,
 'eval_average_precision': 0.4325676667226465,
 'eval_runtime': 71.0676,
 'eval_samples_per_second': 60.787,
 'eval_steps_per_second': 1.9,
 'epoch': 5.0}

In [None]:
from datetime import datetime
# Output folder for the final model; the timestamp (YYYYmmddHHMMSS) gives every run its own directory.
best_model_path = f"./results/best-models-{datetime.now().strftime('%Y%m%d%H%M%S')}/"

In [None]:
# Save Model
# Create the target folder, then write the model configuration and weights into it.
! mkdir -p {best_model_path}

model.save_pretrained(best_model_path)

Configuration saved in ./results/best-models-20221022020414/config.json
Model weights saved in ./results/best-models-20221022020414/pytorch_model.bin


In [None]:
# Move the model to the CPU and save it again into the same folder, overwriting the files
# written by the previous save.
model.to('cpu').save_pretrained(best_model_path)

Configuration saved in ./results/best-models-20221022020414/config.json
Model weights saved in ./results/best-models-20221022020414/pytorch_model.bin


In [None]:
# Load Model
# Reload the saved model from disk onto the CPU and switch it to evaluation mode.
model = transformers.AutoModelForSequenceClassification.from_pretrained(best_model_path).to('cpu')
model.eval()
# Label name for each output index; infer() pairs the i-th score with LABEL_COLUMNS[i].
LABEL_COLUMNS = ['joy','sadness', 'anticipation', 'surprise', 'anger', 'fear', 'disgust', 'trust']

# Predict Function
def infer(text):
    """Score ``text`` against every emotion label with the PyTorch model.

    Args:
        text: The input passed to the module-level ``tokenizer``.

    Returns:
        A list with one ``{"label": <name>, "scores": <float>}`` dict per entry
        of ``LABEL_COLUMNS``, where the score is the sigmoid of the matching
        logit of the first sequence in the batch.
    """
    tokenized_text = tokenizer(
        text,
        padding="max_length",
        max_length=MAX_TOKEN_COUNT,
        truncation=True,
        return_tensors="pt"
    )
    # Inference only: disable gradient tracking so no autograd graph is built or kept
    # in memory for each call.
    with torch.no_grad():
        classification_logits = model(**tokenized_text)[0]
    result = torch.sigmoid(classification_logits)[0]
    return [
        {"label": label_name, "scores": score.item()}
        for label_name, score in zip(LABEL_COLUMNS, result)
    ]

loading configuration file ./results/best-models-20221022020414/config.json
Model config BertConfig {
  "_name_or_path": "./results/best-models-20221022020414/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tokenizer_class": "Bert

In [None]:
from pathlib import Path
from transformers.convert_graph_to_onnx import convert, optimize, quantize

ModuleNotFoundError: ignored

In [None]:
# Export the model saved by THIS run. The folder name used to be hard-coded with the
# timestamp of one particular run, so a fresh run would export a stale or missing folder.
model_path = str(Path(best_model_path).resolve())

# Resolved against the current working directory, which is the cloned repository
# (this notebook changes into it with %cd right after cloning).
output_path = str(Path("onnx/conversion/model.onnx").resolve())

# https://github.com/huggingface/transformers/blob/master/src/transformers/convert_graph_to_onnx.py#L32-L42
convert(pipeline_name="sentiment-analysis", framework="pt", model=model_path, tokenizer=tokenizer, output=Path(output_path), opset=12)

# optimized_output = optimize(Path(output_path))
# quantize(optimized_output)

loading configuration file /content/multi_emotion_datasets/results/best-models-20221022020414/config.json
Model config BertConfig {
  "_name_or_path": "/content/multi_emotion_datasets/results/best-models-20221022020414",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "posit

ONNX opset version set to: 12
Loading pipeline (model: /content/multi_emotion_datasets/results/best-models-20221022020414, tokenizer: PreTrainedTokenizer(name_or_path='cl-tohoku/bert-base-japanese-v2', vocab_size=32768, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}))


All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at /content/multi_emotion_datasets/results/best-models-20221022020414.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


Using framework PyTorch: 1.12.1+cu113
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
position_ids is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']


In [None]:
import os
# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.
# NOTE(review): in the visible cells the only code that reads this flag is the commented-out
# Windows example below, so the flag currently has no effect.
add_cuda_path = False

# For Linux, see https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#environment-setup
# Below is example for Windows
# if add_cuda_path:
#     cuda_dir = 'D:/NVidia/CUDA/v11.0/bin'
#     cudnn_dir = 'D:/NVidia/CUDA/v11.0/bin'
#     if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):
#         raise ValueError("Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.")
#     else:
#         if cuda_dir == cudnn_dir:
#             os.environ["PATH"] = cuda_dir + ';' + os.environ["PATH"]
#         else:
#             os.environ["PATH"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ["PATH"]

In [None]:
import psutil
import onnxruntime
import numpy

# assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()
device_name = 'cpu'

sess_options = onnxruntime.SessionOptions()

# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.
# Note that this will increase session creation time so enable it for debugging only.
# sess_options.optimized_model_filepath = "./model.onnx"

# Please change the value according to best setting in Performance Test Tool result.
# sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)

# Pass the options object to the session (it used to be created but never used, so any
# setting enabled above would have been ignored) and name the execution provider
# explicitly so the session matches device_name regardless of the installed build.
session = onnxruntime.InferenceSession(
    output_path,
    sess_options=sess_options,
    providers=['CPUExecutionProvider'],
)

# latency = []
# for i in range(total_samples):
#     data = dataset[i]
#     ort_inputs = {
#         'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),
#         'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),
#         'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()
#     }
#     start = time.time()
#     ort_outputs = session.run(None, ort_inputs)
#     latency.append(time.time() - start)

In [None]:
text = "よろしくお願いします！一緒に頑張りましょう"

# Tokenize exactly as for training (fixed-length padding), but return NumPy arrays,
# which is the input format the ONNX Runtime session is fed with.
eval_data = tokenizer(
    text,
    truncation=True,
    max_length=MAX_TOKEN_COUNT,
    padding="max_length",
    return_tensors="np",
)

# Run the inference session created from the ONNX-format model; None requests all outputs.
result_ = session.run(None, input_feed=dict(eval_data))

# First output, first row of the batch: one logit per emotion label.
first_row_logits = result_[0][0].tolist()
# Turn each logit into a [0, 1] score with the sigmoid function and key it by label name.
r = {
    label_name: (1.0 / (1.0 + np.exp(-logit)))
    for label_name, logit in zip(LABEL_COLUMNS, first_row_logits)
}

In [None]:
# Display the per-emotion scores computed from the ONNX model output.
r

{'joy': 0.0037869287958787303,
 'sadness': 0.282583237192597,
 'anticipation': 0.009742174719288055,
 'surprise': 0.013730459849224013,
 'anger': 0.02705808395921563,
 'fear': 0.6450797084461645,
 'disgust': 0.35239159495264594,
 'trust': 0.007315111800065529}