In [1]:
import sys
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging

# Create the notebook logger. The helper reports the dated log file it writes to
# (the captured output shows "../../log/s-bert-test_<date>.log").
logger = mlogging(loggername="s-bert-test", logfilename="../../log/s-bert-test")
# Select the compute device; this value is later passed to model.to(device).
# NOTE(review): GPU_info is a project helper — presumably it also prints the CUDA details; confirm in myutils.
device = GPU_info()
# Fix the random seed to 111 for reproducibility.
# NOTE(review): presumably seeds the Python / NumPy / torch RNGs — verify in myutils.
seed_everything(111)

logfilepath:../../log/s-bert-test_2022-06-16.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
import os

# Path of the S-BERT model to evaluate (alternatives kept for quick switching).
#smodel_path = "../model/sbert/sbert-distilbert-base-multilingual-cased-nli"
#smodel_path = "../model/sbert/distiluse-base-multilingual-cased-v2"
smodel_path = "../../data11/model/albert/albert-ts-2022-06-16-1"

#smodel_path = "distilbert-base-nli-mean-tokens"

# Directory where the evaluator writes its results file
# (similarity_evaluation_<name>_results.csv). The model directory itself is used.
# NOTE(review): when smodel_path is a hub model name instead of a local directory,
# makedirs creates an empty local directory with that name — confirm this does not
# interfere with loading the model.
output_path = smodel_path
os.makedirs(output_path, exist_ok=True)

# Format of the STS-style test file: 0 -> .tsv, 1 -> .json
test_file_type = 0

if test_file_type == 0:
    # TSV file
    test_file = '../../data11/korpora/korsts/tune_test.tsv'
elif test_file_type == 1:
    # JSON file
    test_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'
else:
    # Fail here with a clear message instead of a NameError on `test_file` later.
    raise ValueError(
        f"Unsupported test_file_type: {test_file_type!r} (expected 0 for .tsv or 1 for .json)"
    )

# Batch size used when encoding sentences during evaluation
# (the name is kept because a later cell reads it).
train_batch_size = 32

In [3]:

test_samples = []

if test_file_type == 0:
    # TSV file (e.g. korsts/tune_test.tsv): one "sentence1<TAB>sentence2<TAB>score"
    # record per line.
    with open(test_file, 'rt', encoding='utf-8') as fIn:
        for line in fIn:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing on tuple unpacking.
                continue
            s1, s2, score = line.rstrip('\r\n').split('\t')
            # Scores are rescaled by 1/5 (presumably 0-5 gold scores -> 0-1).
            test_samples.append(InputExample(texts=[s1, s2], label=float(score) / 5.0))

elif test_file_type == 1:
    # JSON file (e.g. klue-sts/klue-sts-v1.1_dev.json): a list of records with
    # "sentence1", "sentence2" and labels.label.
    import json

    # Explicit UTF-8 so Korean text is decoded correctly regardless of the
    # platform's default encoding.
    with open(test_file, "r", encoding='utf-8') as fIn:
        data = json.load(fIn)

    for el in data:
        # KLUE-STS labels are on the 0-5 scale (per the KLUE dataset description);
        # rescale like the TSV branch so both loaders yield labels in the same
        # range. Correlation-based metrics are unaffected by this scaling.
        score = float(el["labels"]['label']) / 5.0
        test_samples.append(
            InputExample(texts=[el["sentence1"], el["sentence2"]], label=score)
        )

if not test_samples:
    # Surface a clear error here rather than an obscure failure in the evaluator.
    raise ValueError(
        f"No test samples loaded (test_file_type={test_file_type!r}); "
        "check the test file settings."
    )

print(test_samples[0:3])

[<sentence_transformers.readers.InputExample.InputExample object at 0x7f0012125a90>, <sentence_transformers.readers.InputExample.InputExample object at 0x7f0012125af0>, <sentence_transformers.readers.InputExample.InputExample object at 0x7f0012125b50>]


In [4]:
# ----------------------------------------------------------------------------
# Load the Sentence-BERT model.
#  - A trained and saved S-BERT model is loaded so its performance can be evaluated.
#  - The evaluation results are recorded in the file
#    'similarity_evaluation_korstr_tune_test_results.csv' under the model path.
#  - To choose the model download folder, pass cache_folder=<path>.
# ----------------------------------------------------------------------------

logger.info("\n")
logger.info("======================TEST===================")

# Variant with an explicit download folder:
#model = SentenceTransformer(smodel_path, cache_folder=output_path)
model = SentenceTransformer(smodel_path)
model.to(device)

# Log the module layout of the loaded model.
logger.info(str(model))

2022-06-17 08:23:14,820 - s-bert-test - INFO - 

2022-06-17 08:23:20,393 - s-bert-test - INFO - SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: AlbertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [5]:
import time

# perf_counter() is a monotonic clock intended for measuring intervals; time.time()
# is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()

# Build the evaluator from the loaded test samples. `name` becomes part of the
# results file name written into output_path.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    batch_size=train_batch_size,
    name='korstr_tune_test',
    show_progress_bar=True,
)

# Run the evaluation; the returned score is logged below.
# NOTE(review): presumably the evaluator's main similarity correlation — confirm
# against the installed sentence-transformers version.
result = test_evaluator(model, output_path=output_path)

# Capture the elapsed time right after the evaluation so logging is not included.
elapsed = time.perf_counter() - start

logger.info("\n")
logger.info(f"model path: {smodel_path}")
logger.info(f"test_file path: {test_file}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {elapsed:.3f} 초 ===')
logger.info("=====================================================")
logger.info("\n")


Batches:   0%|          | 0/44 [00:00<?, ?it/s]

Batches:   0%|          | 0/44 [00:00<?, ?it/s]

2022-06-17 08:23:23,872 - s-bert-test - INFO - 

2022-06-17 08:23:23,873 - s-bert-test - INFO - model path: ../../data11/model/albert/albert-ts-2022-06-16-1
2022-06-17 08:23:23,873 - s-bert-test - INFO - test_file path: ../../data11/korpora/korsts/tune_test.tsv
2022-06-17 08:23:23,874 - s-bert-test - INFO - === result: 0.0792475205323965 ===
2022-06-17 08:23:23,874 - s-bert-test - INFO - === 처리시간: 3.474 초 ===
2022-06-17 08:23:23,875 - s-bert-test - INFO - 

