In [1]:
# !pip install transformers
!pip install transformers[torch]
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3
!pip install datasets
!pip install fugashi unidic-lite
!pip install onnxruntime
!pip install --upgrade onnx
!pip install accelerate~=0.21.0

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
aptitude is already the newest version (0.8.13-3ubuntu1).
swig is already the newest version (4.0.2-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
mecab is already installed at the requested version (0.996-14build9)
libmecab-dev is already installed at the requested version (0.996-14build9)
mecab-ipadic-utf8 is already installed at the requested version (2.7.0-20070801+main-3)
git is already installed at the requested version (1:2.34.1-1ubuntu1.10)
make is already installed at the requested version (4.3-4.1build1)
curl is already installed at the requested version (7.81.0-1ubuntu1.15)
xz-utils is already installed at the requested version (5.2.5-2ubuntu1)
file is already installed at the requested version (1:5.41-3ubuntu0.1)
mecab is already installed at the requested version (0.996-14build9)
libmecab-dev is already installed at the requested version (0.996-14bui

In [2]:
!git clone https://kami9811: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@github.com/kami9811/multi_emotion_datasets.git

%cd multi_emotion_datasets

fatal: destination path 'multi_emotion_datasets' already exists and is not an empty directory.
/content/multi_emotion_datasets


In [None]:
!git clone https://github.com/kami9811/multi_emotion_datasets.git

In [3]:
! mkdir -p results
! mkdir -p logs

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# picklename = 'wrime_zero-one.pickle'
picklename = 'wrime_zero-one_avg.pickle'
# picklename = 'wrime_zero-one_avg_all.pickle'

df = pd.DataFrame.from_dict(pd.read_pickle(picklename))

# separate labels
LABEL_COLUMNS = ['joy','sadness', 'anticipation', 'surprise', 'anger', 'fear', 'disgust', 'trust']
df[LABEL_COLUMNS] = pd.DataFrame(df.label.tolist(), index= df.index)

# データセットの分割
train_df, val_test_df = train_test_split(df, train_size=0.8, random_state=32)
val_df, test_df = train_test_split(val_test_df, train_size=0.5, random_state=32)
train_df, val_df, test_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True), test_df.reset_index(drop=True)

print("FULL Dataset: {}".format(len(df)))
print("TRAIN Dataset: {}".format(len(train_df)))
print("TEST Dataset: {}".format(len(test_df)))
print("VALIDATION Dataset: {}".format(len(val_df)))

columns = ["sentence", "label"]
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df[columns]),
    "test": Dataset.from_pandas(test_df[columns]),
    "validation": Dataset.from_pandas(val_df[columns])
})

FULL Dataset: 43200
TRAIN Dataset: 34560
TEST Dataset: 4320
VALIDATION Dataset: 4320


In [5]:
import transformers
import torch

# MAX_TOKEN_COUNT = 128 # 最大トークン数
MAX_TOKEN_COUNT = 256 # 最大トークン数
TRAIN_BATCH_SIZE = 32 # train時のbatch数
VALID_BATCH_SIZE = 32 # validation時のbatch数
EPOCHS = 5
LEARNING_RATE = 1e-05

MODEL_CHECKPOINT = "cl-tohoku/bert-base-japanese-v2"

In [6]:
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def preprocess_function(examples):
    # return tokenizer(examples['sentence'], truncation=True, max_length=MAX_TOKEN_COUNT)
    return tokenizer(
        examples['sentence'],
        padding="max_length",
        max_length=MAX_TOKEN_COUNT,
        truncation=True,
    )

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, batch_size=TRAIN_BATCH_SIZE)



Map:   0%|          | 0/34560 [00:00<?, ? examples/s]

Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

Map:   0%|          | 0/4320 [00:00<?, ? examples/s]

In [9]:
num_labels = 8
model = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=num_labels)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir = "./results",                        # チェックポイント保存先ディレクトリ
    learning_rate=LEARNING_RATE,                     # 学習率
    per_device_train_batch_size=TRAIN_BATCH_SIZE,    # training時のバッチ数
    per_device_eval_batch_size=VALID_BATCH_SIZE,     # evaluation時のバッチ数
    num_train_epochs=EPOCHS, # 1,                    # epoch数
    weight_decay=0.01,                               # weight decayの設定（過学習を抑える設定）
    load_best_model_at_end=True,                     # 学習後に最適のモデルを読み込む
    metric_for_best_model="accuracy",                # 最適なモデルを選択する際の評価基準
    evaluation_strategy = "steps",                   # 評価の実行頻度 "epoch"にすると毎epoch後に実行
    logging_steps=500,
    logging_dir='./logs'
)

In [11]:
import numpy as np
from sklearn.metrics import (
    label_ranking_average_precision_score,
    roc_auc_score,
    accuracy_score,
    average_precision_score,
    f1_score
)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
sigmoid_v = np.vectorize(sigmoid)

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid_v(predictions)  # sigmoid関数を使ってlogitを[0,1]に変換する
    # print(predictions)
    return dict(
        lrap_score=label_ranking_average_precision_score(labels, predictions),
        roc_score = roc_auc_score(labels, predictions),
        full_accuracy = accuracy_score(labels, predictions>0.5),
        accuracy = np.average(
            np.sum((predictions > 0.5) == labels, axis=1) / np.shape(labels)[1]
        ),
        f1_score = f1_score(labels, predictions > 0.5, average='samples'),
        average_precision = average_precision_score(labels, predictions)
    )

In [13]:
class MultilabelTrainer(transformers.Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [14]:
trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=5)]
)

In [15]:
# Train
trainer.train()

Step,Training Loss,Validation Loss,Lrap Score,Roc Score,Full Accuracy,Accuracy,F1 Score,Average Precision
500,0.2126,0.173053,0.897416,0.795458,0.615972,0.942853,0.058526,0.287437
1000,0.1653,0.15699,0.919033,0.853479,0.63125,0.945052,0.097454,0.394179
1500,0.1449,0.149078,0.925601,0.864498,0.64375,0.947483,0.153511,0.420143
2000,0.1396,0.145904,0.933474,0.871828,0.640741,0.94702,0.188503,0.43735
2500,0.1275,0.146261,0.933523,0.871035,0.645602,0.947569,0.167052,0.453637
3000,0.1215,0.142849,0.938024,0.878086,0.647685,0.948351,0.18591,0.457848
3500,0.1168,0.144489,0.937791,0.878461,0.640741,0.947338,0.195787,0.453037
4000,0.1084,0.143653,0.937768,0.879221,0.640972,0.947454,0.202924,0.461752
4500,0.1063,0.143724,0.938591,0.880031,0.644213,0.947743,0.195085,0.464841
5000,0.1032,0.14477,0.938054,0.880608,0.643519,0.947541,0.198756,0.463231


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


TrainOutput(global_step=5400, training_loss=0.132063335136131, metrics={'train_runtime': 7998.8249, 'train_samples_per_second': 21.603, 'train_steps_per_second': 0.675, 'total_flos': 2.27340198346752e+16, 'train_loss': 0.132063335136131, 'epoch': 5.0})

In [16]:
# Test
trainer.evaluate(eval_dataset=tokenized_dataset["test"])

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'eval_loss': 0.1453091949224472,
 'eval_lrap_score': 0.9293333425191077,
 'eval_roc_score': 0.8825726111075256,
 'eval_full_accuracy': 0.6407407407407407,
 'eval_accuracy': 0.947511574074074,
 'eval_f1_score': 0.1789891975308642,
 'eval_average_precision': 0.42510685055700603,
 'eval_runtime': 67.8239,
 'eval_samples_per_second': 63.694,
 'eval_steps_per_second': 1.99,
 'epoch': 5.0}

In [15]:
from datetime import datetime
date = datetime.now().strftime('%Y%m%d%H%M%S')
# best_model_path = f"./results/best-models-{date}/"
best_model_path = "./results/best-models-20240222154729"

In [17]:
# Save Model
!mkdir -p {best_model_path}

In [None]:
model.save_pretrained(best_model_path)

In [17]:
model.to('cpu').save_pretrained(best_model_path)

In [None]:
!git add .
!git commit -m "Added model"
!git push origin HEAD

^C

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@2cc74461023d.(none)')
remote: Invalid username or password.
fatal: Authentication failed for 'https://kami9811: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@github.com/kami9811/multi_emotion_datasets.git/'


In [18]:
# Load Model
model = transformers.AutoModelForSequenceClassification.from_pretrained(best_model_path).to('cpu')
model.eval()
LABEL_COLUMNS = ['joy','sadness', 'anticipation', 'surprise', 'anger', 'fear', 'disgust', 'trust']

# Predict Function
def infer(text):
    # tokenized_text = tokenizer.encode_plus(text, return_tensors="pt")
    tokenized_text = tokenizer(
        text,
        padding="max_length",
        max_length=MAX_TOKEN_COUNT,
        truncation=True,
        return_tensors="pt"
    )
    classification_logits = model(**tokenized_text)[0]
    result = torch.sigmoid(classification_logits)[0]
    return [{"label": LABEL_COLUMNS[i], "scores": b.item()} for i, b in enumerate(result)]

In [19]:
# text = "3歳の子供がまだ起きてます😫 夫と二人でカウントダウン予定が寝かしつけー。 幼児で、こんな夜中におきてるおうちありますか？"
text = "何がつらいか分からないけどもうとにかく しんどくなる時ないですか？   働いてるより専業主婦やってこのコロナ禍過ごしてる 方が体調崩してる気がする"
token = tokenizer.encode_plus(text, return_tensors="pt")
token

{'input_ids': tensor([[    2,  1154,   862,   886, 12500,   861, 19206, 11148, 18578, 12431,
         32506, 13950,  6796,  6156, 11164,  2754, 11148, 12461,   861,    45,
         14868, 15125, 11159, 26565, 21795, 12538,   888, 11156, 17232,  3931,
         15077, 15125,  2706,   862, 18722, 23021, 15125,  3139,   862, 11137,
             3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
token.values()

dict_values([tensor([[    2,  1154,   862,   886, 12500,   861, 19206, 11148, 18578, 12431,
         32506, 13950,  6796,  6156, 11164,  2754, 11148, 12461,   861,    45,
         14868, 15125, 11159, 26565, 21795, 12538,   888, 11156, 17232,  3931,
         15077, 15125,  2706,   862, 18722, 23021, 15125,  3139,   862, 11137,
             3]]), tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])])

In [21]:
result = model(**token)
result

SequenceClassifierOutput(loss=None, logits=tensor([[-5.2288, -0.9075, -4.3565, -3.9626, -4.0556,  0.8492, -1.8024, -5.2674]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [22]:
infer("3歳の子供がまだ起きてます😫 夫と二人でカウントダウン予定が寝かしつけー。 幼児で、こんな夜中におきてるおうちありますか？")

[{'label': 'joy', 'scores': 0.007525517605245113},
 {'label': 'sadness', 'scores': 0.02284095622599125},
 {'label': 'anticipation', 'scores': 0.013108786195516586},
 {'label': 'surprise', 'scores': 0.12006685137748718},
 {'label': 'anger', 'scores': 0.007231175433844328},
 {'label': 'fear', 'scores': 0.16505961120128632},
 {'label': 'disgust', 'scores': 0.01943563111126423},
 {'label': 'trust', 'scores': 0.0017516353400424123}]

In [23]:
infer("あけましておめでとうございます┏○ﾍﾟｺｯ")

[{'label': 'joy', 'scores': 0.1116100549697876},
 {'label': 'sadness', 'scores': 0.041458889842033386},
 {'label': 'anticipation', 'scores': 0.016502587124705315},
 {'label': 'surprise', 'scores': 0.012950001284480095},
 {'label': 'anger', 'scores': 0.001998380059376359},
 {'label': 'fear', 'scores': 0.00556955486536026},
 {'label': 'disgust', 'scores': 0.008367715403437614},
 {'label': 'trust', 'scores': 0.001949393772520125}]

In [28]:
infer("もうすぐで3歳になる娘にキッズカメラを誕生日プレゼントしようかなと思っています🌸  お聞きしたいのですか、ハピカムt3 をお持ちの方いらっしゃいますか？？💓 色々と調べてこちらかいいかな？と第一候補に上がっています。  使い勝手など、教えていただきたいです🎀🌸🌻")

[{'label': 'joy', 'scores': 0.025021303445100784},
 {'label': 'sadness', 'scores': 0.011531459167599678},
 {'label': 'anticipation', 'scores': 0.8565406799316406},
 {'label': 'surprise', 'scores': 0.012321189045906067},
 {'label': 'anger', 'scores': 0.006971614435315132},
 {'label': 'fear', 'scores': 0.014531666412949562},
 {'label': 'disgust', 'scores': 0.00812917947769165},
 {'label': 'trust', 'scores': 0.008547312580049038}]

In [29]:
infer("何がつらいか分からないけどもうとにかく しんどくなる時ないですか？   働いてるより専業主婦やってこのコロナ禍過ごしてる 方が体調崩してる気がする")

[{'label': 'joy', 'scores': 0.005331307649612427},
 {'label': 'sadness', 'scores': 0.2875157296657562},
 {'label': 'anticipation', 'scores': 0.012660914100706577},
 {'label': 'surprise', 'scores': 0.018659114837646484},
 {'label': 'anger', 'scores': 0.017029354348778725},
 {'label': 'fear', 'scores': 0.7004057168960571},
 {'label': 'disgust', 'scores': 0.14156441390514374},
 {'label': 'trust', 'scores': 0.005130612291395664}]

In [31]:
infer("いきなり夜に来て前にケンカしたことでまだ怒ってて子供の前で1時間ほど怒鳴り散らしてきました😓  殴られても仕方ない 殴りたくなる顔してる 今までの女はこんなんじゃなかった 最低な女だね 離婚してもいいと思ってる 離婚しないのは行く場所がないからでしょ？  などなど言われました...😇  旦那は味方せず  ほんっとに疲れました 思い出しただけでイライラします  私の親が今までの殴られたことに対して慰謝料取って離婚しろって言ったことに、慰謝料なんて取れる訳ない立件されない と、言われましたけど取れますよね？？  旦那が浮気して女を乗せた車が嫌で新車買い替えたことにたいして  そんなことで買い替えるな。って。 そんなことでってなに？？  女乗せるくらいいいでしょ！って言われました、 じゃあお前の息子は私が男乗せたら気が狂うようにキレるよな？？？ 同じことしてええんか？？怒るよな？ 息子は浮気していい、私はダメ？はあん？  人の親の悪口を親子揃って言って  もうやり場のない怒りがすごいです")

[{'label': 'joy', 'scores': 0.03772152587771416},
 {'label': 'sadness', 'scores': 0.13115018606185913},
 {'label': 'anticipation', 'scores': 0.03895532339811325},
 {'label': 'surprise', 'scores': 0.09338989108800888},
 {'label': 'anger', 'scores': 0.7319818139076233},
 {'label': 'fear', 'scores': 0.15851303935050964},
 {'label': 'disgust', 'scores': 0.8724462985992432},
 {'label': 'trust', 'scores': 0.029044685885310173}]

In [25]:
from pathlib import Path
from transformers.convert_graph_to_onnx import convert, optimize, quantize

In [27]:
# 変換元のモデルがあるディレクトリを指定します
# model_path = f'/content/multi_emotion_datasets/results/best-models-{date}'
model_path = '/content/multi_emotion_datasets/results/best-models-20240222154729'

output_path = "/content/multi_emotion_datasets/onnx/conversion/model.onnx"

# 実はこちらはこちらでpipelineとしてtoken-classificationがサポート外なんですが、変換できちゃうのでそのままいきます
# https://github.com/huggingface/transformers/blob/master/src/transformers/convert_graph_to_onnx.py#L32-L42
convert(pipeline_name="sentiment-analysis", framework="pt", model=model_path, tokenizer=tokenizer, output=Path(output_path), opset=12)
# ダミーの入力でモデルを実行し、エクスポートする
# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# with torch.no_grad():
#     symbolic_names = {0: 'batch_size', 1: 'max_seq_len'}
#     torch.onnx.export(model,                                             # モデル
#                       args=(inputs['input_ids'],                         # モデルの入力（ここではダミーの入力を使用）
#                             inputs['attention_mask']),                   # attention_maskも含める
#                       f=output_path,                                     # 出力されるONNXファイルのパス
#                       opset_version=12,                                  # 使用するONNXのバージョン
#                       do_constant_folding=True,                          # 定数畳み込みの最適化を行うかどうか
#                       input_names=['input_ids', 'attention_mask'],       # 入力テンソルの名前
#                       output_names=['outputs'],                          # 出力テンソルの名前
#                       dynamic_axes={'input_ids': symbolic_names,         # バッチサイズに動的軸を使用
#                                      'attention_mask': symbolic_names,   # attention_maskにも動的軸を使用
#                                      'outputs': symbolic_names})

# これはオマケで、さらにoptimize/quantizeすると速くなるので試します
# optimized_output = optimize(Path(output_path))
# quantize(optimized_output)

ONNX opset version set to: 12
Loading pipeline (model: /content/multi_emotion_datasets/results/best-models-20240222154729, tokenizer: BertJapaneseTokenizer(name_or_path='cl-tohoku/bert-base-japanese-v2', vocab_size=32768, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_w

In [29]:
!zip -r /content/multi_emotion_datasets/onnx/conversion.zip /content/multi_emotion_datasets/onnx/conversion

updating: content/multi_emotion_datasets/onnx/conversion/ (stored 0%)
  adding: content/multi_emotion_datasets/onnx/conversion/model.onnx (deflated 7%)


In [30]:
# dummy_input = torch.randn(1, 224, device='cpu').type(torch.LongTensor)
# dummy_input = torch.randn((1, 3, 224, 224))
text = "3歳の子供がまだ起きてます😫 夫と二人でカウントダウン予定が寝かしつけー。 幼児で、こんな夜中におきてるおうちありますか？"
# inputs = tokenizer.encode_plus(text, return_tensors="pt")
inputs = tokenizer(
    text,
    padding="max_length",
    max_length=MAX_TOKEN_COUNT,
    truncation=True,
    return_tensors="pt"
)
# print(dummy_input)
onnx_path =  "./model.onnx"
# torch.onnx.export(model, dummy_input, onnx_path, verbose=True)
torch.onnx.export(
  model,                                            # model being run
  args=tuple(inputs.values()),                      # model input (or a tuple for multiple inputs)
  f=onnx_path,                              # where to save the model (can be a file or file-like object)
  verbose=True,
  # opset_version=opset_version,                      # the ONNX version to export the model to
  # do_constant_folding=True,                         # whether to execute constant folding for optimization
  input_names=['input_ids',                         # the model's input names
    'token_type_ids',
    'attention_mask'
  ],
  # output_names=['start', 'end'],                    # the model's output names
  # dynamic_axes={'input_ids': symbolic_names,        # variable length axes
  #               'input_mask' : symbolic_names,
  #               'segment_ids' : symbolic_names,
  #               'start' : symbolic_names,
  #               'end' : symbolic_names}
)

KeyboardInterrupt: 

In [None]:
inputs

{'input_ids': tensor([[    2,    33,  3099,   896, 11924,   862, 13116, 13010,   888, 12343,
             1,  1849,   890, 12804,   889, 31577, 11849,   862,  2014, 12441,
         12406,  1026,   829, 19308,   889,   828, 19622, 26019,   893, 17849,
         15125,   860, 11512, 11175, 12343,   861,    45,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [31]:
import os
# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.
add_cuda_path = False

# For Linux, see https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#environment-setup
# Below is example for Windows
# if add_cuda_path:
#     cuda_dir = 'D:/NVidia/CUDA/v11.0/bin'
#     cudnn_dir = 'D:/NVidia/CUDA/v11.0/bin'
#     if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):
#         raise ValueError("Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.")
#     else:
#         if cuda_dir == cudnn_dir:
#             os.environ["PATH"] = cuda_dir + ';' + os.environ["PATH"]
#         else:
#             os.environ["PATH"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ["PATH"]

In [32]:
import psutil
import onnxruntime
import numpy

# assert 'CUDAExecutionProvider' in onnxruntime.get_available_providers()
device_name = 'cpu'

sess_options = onnxruntime.SessionOptions()

# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.
# Note that this will increase session creation time so enable it for debugging only.
# sess_options.optimized_model_filepath = "./model.onnx"

# Please change the value according to best setting in Performance Test Tool result.
# sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)

# session = onnxruntime.InferenceSession(export_model_path, sess_options)
# session = onnxruntime.InferenceSession("./model.onnx")
session = onnxruntime.InferenceSession(output_path)

# latency = []
# for i in range(total_samples):
#     data = dataset[i]
#     ort_inputs = {
#         'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),
#         'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),
#         'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()
#     }
#     start = time.time()
#     ort_outputs = session.run(None, ort_inputs)
#     latency.append(time.time() - start)

In [33]:
text = "何がつらいか分からないけどもうとにかく しんどくなる時ないですか？   働いてるより専業主婦やってこのコロナ禍過ごしてる 方が体調崩してる気がする"
session.run(None, tuple(tokenizer.encode_plus(text, return_tensors="pt").values()))

AttributeError: 'tuple' object has no attribute 'keys'

In [34]:
# text = "何がつらいか分からないけどもうとにかく しんどくなる時ないですか？   働いてるより専業主婦やってこのコロナ禍過ごしてる 方が体調崩してる気がする"
# text = "3歳の子供がまだ起きてます😫 夫と二人でカウントダウン予定が寝かしつけー。 幼児で、こんな夜中におきてるおうちありますか？"
text = "何がつらいか分からないけどもうとにかく しんどくなる時ないですか？   働いてるより専業主婦やってこのコロナ禍過ごしてる 方が体調崩してる気がする"
# eval_dataset = [tokenizer(raw["sentence1"],
#                               padding=True,
#                               max_length=128,
#                               truncation=True,
#                               return_tensors="np") for raw in raw_datasets]
eval_data = tokenizer(
    text,
    padding="max_length",
    max_length=MAX_TOKEN_COUNT,
    truncation=True,
    return_tensors="np"
)
# ONNX形式のモデルから推論用モデルを作成
# session = InferenceSession(args.onnx_path)

# result = session.run(output_names=["last_hidden_state"], input_feed=dict(eval_dataset[0]))
# result_ = session.run(output_names=["last_hidden_state"], input_feed=dict(eval_dataset[1]))
result_ = session.run(None, input_feed=dict(eval_data))
r = {
    LABEL_COLUMNS[i]: (1.0 / (1.0 + np.exp(-b)))
    for i, b in enumerate(result_[0][0].tolist())
}

In [35]:
r

{'joy': 0.005331292435993417,
 'sadness': 0.28751617428412735,
 'anticipation': 0.01266092558048564,
 'surprise': 0.018659087140092415,
 'anger': 0.01702935399461363,
 'fear': 0.7004051278930683,
 'disgust': 0.14156438154587764,
 'trust': 0.005130605073044438}

In [None]:
d = tokenizer(
    text,
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors="np"
)
d

{'input_ids': array([[    2,    33,  3099,   896, 11924,   862, 13116, 13010,   888,
        12343,     1,  1849,   890, 12804,   889, 31577, 11849,   862,
         2014, 12441, 12406,  1026,   829, 19308,   889,   828, 19622,
        26019,   893, 17849, 15125,   860, 11512, 11175, 12343,   861,
           45,     3,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,

In [None]:
d["input_ids"].reshape(1, MAX_TOKEN_COUNT)

ValueError: ignored

In [None]:
MAX_TOKEN_COUNT

512

In [None]:
result_

[array([[-4.3698835, -3.7723987, -5.5445747, -3.4371116, -6.4516773,
         -4.2010937, -4.703767 , -6.6990643]], dtype=float32)]

In [None]:
eval_data

{'input_ids': array([[    2,  1154,   862,   886, 12500,   861, 19206, 11148, 18578,
        12431, 32506, 13950,  6796,  6156, 11164,  2754, 11148, 12461,
          861,    45, 14868, 15125, 11159, 26565, 21795, 12538,   888,
        11156, 17232,  3931, 15077, 15125,  2706,   862, 18722, 23021,
        15125,  3139,   862, 11137,     3]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
tuple(tokenizer.encode_plus(text, return_tensors="pt").values())

(tensor([[    2,  1154,   862,   886, 12500,   861, 19206, 11148, 18578, 12431,
          32506, 13950,  6796,  6156, 11164,  2754, 11148, 12461,   861,    45,
          14868, 15125, 11159, 26565, 21795, 12538,   888, 11156, 17232,  3931,
          15077, 15125,  2706,   862, 18722, 23021, 15125,  3139,   862, 11137,
              3]]),
 tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]))