In [1]:
#===============================================================================================
# onnx 모델 혹은 sbert 모델을 sts 말뭉치로 테스트 하는 예시
# - sentence_transformer 에 EmbeddingSimilarityEvaluator 를 참조항
# - 참고 : https://github.com/UKPLab/sentence-transformers/blob/957c87b3b4cabb96049e9991c7b77624736188af/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py#L5
#
#
#===============================================================================================

import numpy as np
import pandas as pd
import torch
import os
import torch.nn.functional as F
import sys
from transformers import AutoTokenizer
sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging
logger = mlogging(loggername="sbert-optimized", logfilename="sbert-optimized")
seed_everything(111)

# sbert 모델 경로
smodel_path = "bongsoo/sentencebert_v1.2"

# onnx 모델 경로 
onnxmodel_path = "../../data11/model/onnx/sentencebert_v1.2-onnx"

# onnx 모델일때,  모델출력 embedding 값을 어떻게 만들어서 비교할지 type 설정값
# 0=2차원으로 reshape 해서 비교, 1=평균값으로 비교 
embed_type = 1  

# 평가 sts 형태의 test 파일 
test_file_type = 2  # 0이면 korsts, 1이면 kluests 파일, 2이면 통합

logfilepath:sbert-optimized_2022-08-26.log


In [2]:
#============================================================
# 양자화 모델 불러옴
# => 양자화 모델은 model.eval() 하면 에러남.
#============================================================

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(onnxmodel_path)

# 문장임베딩이면, ORTModelForFeatureExtraction 호출
onnxmodel = ORTModelForFeatureExtraction.from_pretrained(onnxmodel_path)
print(onnxmodel)

<optimum.onnxruntime.modeling_ort.ORTModelForFeatureExtraction object at 0x7f6da03f5f40>


In [3]:
#============================================================
#sentence bert 원래 모델 로딩 
#============================================================
from sentence_transformers import SentenceTransformer, util

# device='cpu'로 함
smodel = SentenceTransformer(smodel_path, device='cpu')
print(smodel)



SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [5]:
sentence1 = []
sentence2 = []
scores = [] 
    
#============================================================
# korsts 로딩
#============================================================
if test_file_type == 0 or test_file_type == 2:
    
    test_file1 = '../../data11/korpora/korsts/tune_test.tsv'

    with open(test_file1, 'rt', encoding='utf-8') as fIn1:
        lines = fIn1.readlines()
        for line in lines:
            s1, s2, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0

            sentence1.append(s1)
            sentence2.append(s2)
            scores.append(score)

    print(f'{test_file1}-len: {len(sentence1)}')
    print(f's1: {sentence1[0]}')
    print(f's2: {sentence2[0]}')
    print(f'scores: {scores[0]}')

#============================================================
# kluests 로딩
#============================================================
if test_file_type == 1 or test_file_type == 2:
    
    test_file2 = '../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'
    import json

    with open(test_file2, "r") as fIn2:
        data = json.load(fIn2)
        for el in data:
            s1 = el["sentence1"]
            s2 = el["sentence2"]
            score = el["labels"]['label']

            sentence1.append(s1)
            sentence2.append(s2)
            scores.append(score)

    print(f'{test_file2}-len: {len(sentence1)}')
    print(f's1: {sentence1[-1]}')
    print(f's2: {sentence2[-1]}')
    print(f'scores: {scores[-1]}')

../../data11/korpora/korsts/tune_test.tsv-len: 1379
s1: 한 소녀가 머리를 스타일링하고 있다.
s2: 한 소녀가 머리를 빗고 있다.
scores: 0.5
../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json-len: 1898
s1: 여느 포르투갈의 비앤비와 같이 엘리베이터는 없습니다.
s2: 포르투의 거의 모든 숙박 시설은 엘리베이터는 없습니다.
scores: 2.9


In [6]:
#============================================================
# tokenize 처리 
# ** 멀티로 한번에 tokenizer 할때는 반드시 padding=True 해야 함.(그래야 최대 길이 token에 맞춰서 padding 됨)
#============================================================
# sentence1 + sentence2를 묶어서 tokenizer 처리함
corpus = sentence1 + sentence2
print(len(corpus))

corpus_inputs = tokenizer(corpus, 
                 add_special_tokens=True, 
                 truncation=True, 
                 padding=True,   
                 max_length=128, 
                 return_tensors="pt")

print(corpus_inputs)
print(corpus_inputs['input_ids'])
print(f'type:{type(corpus_inputs)}')
print(corpus_inputs['input_ids'].size())

3796
{'input_ids': tensor([[   101,   9954, 121260,  ...,      0,      0,      0],
        [   101,   9954, 120936,  ...,      0,      0,      0],
        [   101,   9954, 100006,  ...,      0,      0,      0],
        ...,
        [   101, 120851, 119737,  ...,      0,      0,      0],
        [   101, 119559, 135617,  ...,      0,      0,      0],
        [   101, 123002,  10459,  ...,      0,      0,      0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
tensor([[   101,   9954, 121260,  ...,      0,      0,      0],
        [   101,   9954, 120936,  ...,      0,      0,      0],
        [   101,   9954, 100006,  ...,      0,      0,      0],
        ...,
        [   101, 120851, 119737,  ...,      0,      0,      0],
        [   101, 119559, 135617,  ...,      0,      0,      0],
        [   

In [11]:
import time
start = time.time()

#============================================================
# onnx 모델 임베딩 구하기
#============================================================
outputs = onnxmodel(**corpus_inputs)
embedding = outputs.last_hidden_state
print(f'embed_len:{embedding.shape}')

# 구한 embeding 값을 sentence1, sentence2 로 나눔.
embed_len = len(embedding)//2
print(embed_len)
tembed1 = embedding[0:embed_len]
tembed2 = embedding[embed_len:]

print(tembed1.shape)
print(tembed2.shape)

if embed_type == 0:
    logger.info(f'*2D reshape embedding 사용')
        
    # 3차원을 -> 2차원으로  reshape 시키고, numpy()로 만듬
    # - 예: [1379, 52, 768] -> (1379, 39936)
    embedlist1 = tembed1.reshape(embed_len, -1).numpy()
    embedlist2 = tembed2.reshape(embed_len, -1).numpy() 

    print(embedlist1.shape)
    print(embedlist1.shape)
    
elif embed_type == 1:
    logger.info(f'*평균 embedding 사용')
    
    # 아래는 1차원 평균값으로 만드는 예시임
    # => paired_cosine_distances 를 사용하려면, >= 2D numpy 배열로 만들어야 함.
    # => embed1,2는 3차원->1차원 평균값으로 만듬
    embedlist1 = []
    for idx,embedding in enumerate(tembed1): # enumerate는 index, value 값이 리턴됨
        embed = torch.mean(embedding, dim=0).numpy()
        embedlist1.append(embed)

    embedlist2 = []
    for idx,embedding in enumerate(tembed2): # enumerate는 index, value 값이 리턴됨
        embed = torch.mean(embedding, dim=0).numpy()
        embedlist2.append(embed)

    print(embedlist1[0].shape)
    print(embedlist2[0].shape)


# sklearn 을 이용하여 cosine_scores를 구함
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances

cosine_scores = 1 - (paired_cosine_distances(embedlist1, embedlist2))

manhattan_distances = -paired_manhattan_distances(embedlist1, embedlist2)

euclidean_distances = -paired_euclidean_distances(embedlist1, embedlist2)

dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embedlist1, embedlist2)]
        
print(type(cosine_scores))
print(len(cosine_scores))
print(cosine_scores[0])

# pearson 과 spearman 평균을 구함.
# => 실제 sts문장들 scores와 모델에서 구한 cosine_scores를 비교하여 Acc 평균 값들을 구함
from scipy.stats import pearsonr, spearmanr

eval_pearson_cosine, _ = pearsonr(scores, cosine_scores)
eval_spearman_cosine, _ = spearmanr(scores, cosine_scores)

eval_pearson_manhattan, _ = pearsonr(scores, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(scores, manhattan_distances)

eval_pearson_euclidean, _ = pearsonr(scores, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(scores, euclidean_distances)

eval_pearson_dot, _ = pearsonr(scores, dot_products)
eval_spearman_dot, _ = spearmanr(scores, dot_products)
        
logger.info(f'---------------------------------------------------------')
logger.info(f'onnx모델: {onnxmodel_path}')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_cosine: {eval_pearson_cosine} ===')
logger.info(f'=== spearman_cosine: {eval_spearman_cosine} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_manhattan: {eval_pearson_manhattan} ===')
logger.info(f'=== spearman_manhattan: {eval_spearman_manhattan} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_euclidean: {eval_pearson_euclidean} ===')
logger.info(f'=== spearman_euclidean: {eval_spearman_euclidean} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_dot: {eval_pearson_dot} ===')
logger.info(f'=== spearman_dot: {eval_spearman_dot} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info(f'-END-\n')


2022-08-26 13:22:35,184 - sbert-optimized - INFO - *평균 embedding 사용


embed_len:torch.Size([3796, 54, 768])
1898
torch.Size([1898, 54, 768])
torch.Size([1898, 54, 768])
(768,)
(768,)
<class 'numpy.ndarray'>
1898
0.851942


2022-08-26 13:22:35,390 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:22:35,390 - sbert-optimized - INFO - onnx모델: ../../data11/model/onnx/sentencebert_v1.2-onnx
2022-08-26 13:22:35,391 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:22:35,392 - sbert-optimized - INFO - === pearson_cosine: 0.41840956189397926 ===
2022-08-26 13:22:35,392 - sbert-optimized - INFO - === spearman_cosine: 0.6203559496512039 ===
2022-08-26 13:22:35,393 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:22:35,393 - sbert-optimized - INFO - === pearson_manhattan: 0.4272547480861082 ===
2022-08-26 13:22:35,393 - sbert-optimized - INFO - === spearman_manhattan: 0.6402805941144337 ===
2022-08-26 13:22:35,394 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:22:35,394 - sbert-optimized - INFO - === pearson_euclidean:

In [13]:
import time
start = time.time()

#============================================================
# sbert 모델 임베딩 구하기
#============================================================
embeddings1 = smodel.encode(sentence1, batch_size = len(sentence1), convert_to_numpy=True)
embeddings2 = smodel.encode(sentence2, batch_size = len(sentence2), convert_to_numpy=True)

print(embeddings1.shape)
print(embeddings2.shape)

# sklearn 을 이용하여 cosine_scores를 구함
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances

cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2))

manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2)

euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2)

dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)]

print(type(cosine_scores))
print(len(cosine_scores))
print(cosine_scores[0])

# pearson 과 spearman 평균을 구함.
# => 실제 sts문장들 scores와 모델에서 구한 cosine_scores를 비교하여 Acc 평균 값들을 구함
from scipy.stats import pearsonr, spearmanr

eval_pearson_cosine, _ = pearsonr(scores, cosine_scores)
eval_spearman_cosine, _ = spearmanr(scores, cosine_scores)

eval_pearson_manhattan, _ = pearsonr(scores, manhattan_distances)
eval_spearman_manhattan, _ = spearmanr(scores, manhattan_distances)

eval_pearson_euclidean, _ = pearsonr(scores, euclidean_distances)
eval_spearman_euclidean, _ = spearmanr(scores, euclidean_distances)

eval_pearson_dot, _ = pearsonr(scores, dot_products)
eval_spearman_dot, _ = spearmanr(scores, dot_products)

logger.info(f'---------------------------------------------------------')
logger.info(f'sbert모델: {smodel_path}')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_cosine: {eval_pearson_cosine} ===')
logger.info(f'=== spearman_cosine: {eval_spearman_cosine} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_manhattan: {eval_pearson_manhattan} ===')
logger.info(f'=== spearman_manhattan: {eval_spearman_manhattan} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_euclidean: {eval_pearson_euclidean} ===')
logger.info(f'=== spearman_euclidean: {eval_spearman_euclidean} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== pearson_dot: {eval_pearson_dot} ===')
logger.info(f'=== spearman_dot: {eval_spearman_dot} ===')
logger.info(f'---------------------------------------------------------')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info(f'-END-\n')

2022-08-26 13:27:37,447 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:27:37,449 - sbert-optimized - INFO - sbert모델: bongsoo/sentencebert_v1.2
2022-08-26 13:27:37,450 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:27:37,451 - sbert-optimized - INFO - === pearson_cosine: 0.42790362083793543 ===
2022-08-26 13:27:37,452 - sbert-optimized - INFO - === spearman_cosine: 0.6305919207416886 ===
2022-08-26 13:27:37,452 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:27:37,453 - sbert-optimized - INFO - === pearson_manhattan: 0.4538038691905121 ===
2022-08-26 13:27:37,454 - sbert-optimized - INFO - === spearman_manhattan: 0.67585103663608 ===
2022-08-26 13:27:37,455 - sbert-optimized - INFO - ---------------------------------------------------------
2022-08-26 13:27:37,456 - sbert-optimized - INFO - === pearson_euclidean: 0.4548321135918387 ==

(1898, 768)
(1898, 768)
<class 'numpy.ndarray'>
1898
0.7331225
