In [1]:
#======================================================================================================
# sentence-bert nli로 훈련된 모델을 다시 sts(simentic textual similiarity) 파일로 훈련시킴
# => sentence-transformers 패키지를 이용하여 구현 함.(*pip install -U sentence-transformers 설치 필요)
#
# 도큐먼트 : https://www.sbert.net/index.html
# 소스참고 : https://github.com/BM-K/KoSentenceBERT-ETRI
#  => KoSentenceBERT-ETRI-master\KoSentenceBERT-ETRI-master\con_training_sts.py

# pip install -U sentence-transformers
#======================================================================================================

from torch.utils.data import DataLoader
import math
from sentence_transformers import models, losses
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datetime import datetime
import sys
import os
import gzip
import csv
sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging

logger = mlogging(loggername="s-bert", logfilename="s-bert")
device = GPU_info()
seed_everything(111)



logfilepath:s-bert_2022-09-29.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
import os

# 기존 sbert 모델 경로
smodel_path = "../../data11/model/sbert/sbert-mdistilbertV2.1-distil-All-NLI"

# sts 학습후 학습된 sbert 모델 저장할 경로
# => **해당 경로\eval 폴더에 similarity_evaluation_sts-dev_result.csv 파일로 각 epoch 마다 평가된 결과가 기록된다.
smodel_save_path = '../../data11/model/sbert/sbert-mdistilbertV2.1-distil-All-NLI-STS-1'

use_korsts = 1     # 한국어 korsts 파일 (tsv 5,749개)
use_kluests = 1    # 한국어 kluests_v1.1 파일 (json 11,668개)
use_sts17 = 1      # 한국어 sts17-crosslingual-sts (jsonl 2,846개)
use_glue_sts = 1   # 영어 glue_sts (load_dataset 5,749개)
use_en_sts = 1     # 영어 stsb_multi_mt(load_dataset 15,676개) = stsb_multi_mt(5,749개) + mteb/sickr-sts(9,927개)

# KorSTS 학습, 평가 파일들
train_korsts_file = '../../data11/korpora/korsts/tune_train.tsv'
eval_korsts_file = '../../data11/korpora/korsts/tune_dev.tsv'

# KlueSTS 학습, 평가 파일들
train_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_train.json'
eval_kluests_file = '../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json'

# sts17-crosslingual-sts  학습 파일(*평가파일 없음)
train_sts17_file = '../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl'

train_batch_size = 64
num_epochs = 128   # 128 정도 해도 최상의 모델을 찾을수 있음 (*sbert는 eval이 최상인 모델이 out모델로 저장됨)
lr = 3e-5          # default=2e-5

In [3]:
# sentence 모델을 불러옴
model = SentenceTransformer(smodel_path)
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [4]:
import json
from datasets import load_dataset

train_samples = []

####################################################################################################
# KorSTS 훈련 데이터 셋 설정(.tsv 파일)
####################################################################################################
if use_korsts == True:
    count = 0
    logger.info(f"Read STS train dataset=>{train_korsts_file}")
    with open(train_korsts_file, "rt", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            text_a, text_b, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
            
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")
                
            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{train_korsts_file} count: {count}')
####################################################################################################

####################################################################################################
# klue 훈련 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="train")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
###################################################################################################           
if use_kluests == True:  
    count = 0
    logger.info(f"Read STS train dataset=>{train_kluests_file}")
    with open(train_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)
        for data in datas:
            text_a = data["sentence1"]
            text_b = data["sentence2"]
            score = data["labels"]["label"]
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

            if count < 3:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################

####################################################################################################
# 한국어 sts17-crosslingual-sts 훈련 데이터셋 설정
# => jsonl : 여러개의 json 형식 파일이 각 줄마다 기록되어 있는 형태 파일
# => 패키지 설치 : !pip install jsonlines
####################################################################################################
if use_sts17 == True:
    import jsonlines
    count = 0
    logger.info(f"Read STS train dataset=>{train_sts17_file}")
    with jsonlines.open(train_sts17_file, "r") as f:
        for line in f:
            text_a = line["sentence1"]
            text_a = line["sentence2"]
            score = line["score"]
            score = float(score) / 5.0
            
            if count < 3:
                print(f"{text_a}, {text_b}, {score}")
                
            train_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
            
    logger.info(f'*{train_kluests_file} len: {count}')
####################################################################################################

#############################################################################################
# 영문 sts 데이터셋 설정 (load_dataset)
# => stsb_multi_mt , mteb/sickr-sts 영문 sts 훈련 데이터 셋 불러오기
#############################################################################################
if use_en_sts == True:
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="train")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["similarity_score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt_en len: {count}')
    
    # mteb/sickr-sts 훈련데이터 불러옴
    count = 0    
    en_sts_dataset = load_dataset("mteb/sickr-sts", split="test")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*mteb/sickr-sts len: {count}')
#############################################################################################           
 
#############################################################################################
# GLUE STS 훈련 데이터셋 설정 (load_dataset)
#############################################################################################
if use_glue_sts == True:
    # glue stsb 훈련데이터 불러옴(5,749개)
    count = 0    
    en_sts_dataset = load_dataset("glue","stsb", split="train")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["label"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        train_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*glue_stsb len: {count}')
#############################################################################################

logger.info(f'------------------------------------------------------------------------')        
logger.info(f'*train_samples_len:{len(train_samples)}')
print(train_samples[0:3])

2022-09-29 10:47:03,844 - s-bert - INFO - Read STS train dataset=>../../data11/korpora/korsts/tune_train.tsv
2022-09-29 10:47:03,858 - s-bert - INFO - *../../data11/korpora/korsts/tune_train.tsv count: 5749
2022-09-29 10:47:03,859 - s-bert - INFO - Read STS train dataset=>../../data11/korpora/klue-sts/klue-sts-v1.1_train.json


비행기가 이륙하고 있다., 비행기가 이륙하고 있다., 1.0
한 남자가 큰 플루트를 연주하고 있다., 남자가 플루트를 연주하고 있다., 0.76
한 남자가 피자에 치즈를 뿌려놓고 있다., 한 남자가 구운 피자에 치즈 조각을 뿌려놓고 있다., 0.76


2022-09-29 10:47:04,136 - s-bert - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_train.json len: 11668
2022-09-29 10:47:04,141 - s-bert - INFO - Read STS train dataset=>../../data11/korpora/sts17-crosslingual-sts/ko-ko.jsonl
2022-09-29 10:47:04,157 - s-bert - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_train.json len: 2846


숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다., 숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다., 0.74
위반행위 조사 등을 거부·방해·기피한 자는 500만원 이하 과태료 부과 대상이다., 시민들 스스로 자발적인 예방 노력을 한 것은 아산 뿐만이 아니었다., 0.0
회사가 보낸 메일은 이 지메일이 아니라 다른 지메일 계정으로 전달해줘., 사람들이 주로 네이버 메일을 쓰는 이유를 알려줘, 0.06
안전모를 쓴 한 남자가 춤을 추고 있다., 학회 홍보 메일은 회신 하지마, 1.0
아이가 말을 타고 있다., 학회 홍보 메일은 회신 하지마, 0.95
남자가 뱀에게 쥐를 먹이고 있다., 학회 홍보 메일은 회신 하지마, 1.0


Reusing dataset stsb_multi_mt (/MOCOMSYS/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
2022-09-29 10:47:05,415 - s-bert - INFO - *stsb_multi_mt_en len: 5749


A plane is taking off., An air plane is taking off., 1.0
A man is playing a large flute., A man is playing a flute., 0.7599999904632568
A man is spreading shreded cheese on a pizza., A man is spreading shredded cheese on an uncooked pizza., 0.7599999904632568


Using custom data configuration mteb--sickr-sts-1e81327897d49df9
Reusing dataset json (/MOCOMSYS/.cache/huggingface/datasets/json/mteb--sickr-sts-1e81327897d49df9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


A group of kids is playing in a yard and an old man is standing in the background, A group of boys in a yard is playing and a man is standing in the background, 0.9
A group of children is playing in the house and there is no man standing in the background, A group of kids is playing in a yard and an old man is standing in the background, 0.64
The young boys are playing outdoors and the man is smiling nearby, The kids are playing outdoors near a man with a smile, 0.9400000000000001


2022-09-29 10:47:09,585 - s-bert - INFO - *mteb/sickr-sts len: 9927
Reusing dataset glue (/MOCOMSYS/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


A plane is taking off., An air plane is taking off., 1.0
A man is playing a large flute., A man is playing a flute., 0.7599999904632568
A man is spreading shreded cheese on a pizza., A man is spreading shredded cheese on an uncooked pizza., 0.7599999904632568


2022-09-29 10:47:11,068 - s-bert - INFO - *glue_stsb len: 5749
2022-09-29 10:47:11,070 - s-bert - INFO - ------------------------------------------------------------------------
2022-09-29 10:47:11,071 - s-bert - INFO - *train_samples_len:41688


[<sentence_transformers.readers.InputExample.InputExample object at 0x7fa8fbbac4c0>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fa8f7af4f40>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fa8f7af4730>]


In [5]:
# 데이터 셋, 데이터 로더, 손실함수 정의
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [6]:
#Read STSbenchmark dataset and use it as development set
# 평가데이터 불러오기
#korsts 파일로 두 문장간 유사도를 수치로(5.0이 만점=매우 유사) 측정함.
dev_samples = []

####################################################################################################
# KorSTS 평가 데이터 셋 설정(.tsv 파일)
####################################################################################################
if use_korsts == True:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_korsts_file}")
    with open(eval_korsts_file, 'rt', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            text_a, text_b, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
            
            if count < 5:
                print(f"{text_a}, {text_b}, {score}")
            
            dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{eval_korsts_file} len: {count}')
####################################################################################################  

####################################################################################################
# KlueSTS 평가 데이터 셋 설정(.json 파일)
# => 아래처럼 load_dataset으로 불러와서 사용할수도 있음.
# datas = load_dataset("klue", "sts", split="test")
# for data in datas:
#        text_a = data["sentence1"]
#        text_b = data["sentence2"]
#        score = data["labels"]["label"]
#        score = float(score) / 5.0  
####################################################################################################           
if use_kluests == True:
    count = 0
    logger.info(f"Read STS dev dataset=>{eval_kluests_file}")
    with open(eval_kluests_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)
        for data in datas:
            text_a = data["sentence1"]
            text_b = data["sentence2"]
            score = data["labels"]["label"]
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

            if count < 5:
                print(f"{text_a}, {text_b}, {score}")

            dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{eval_kluests_file} len: {count}')
####################################################################################################  

####################################################################################################
# 영문 stsb_multi_mt 데이터 셋 설정(load_dataset)
####################################################################################################                
# stsb_multi_mt 영문 sts dev 데이터 셋 불러오기
if use_en_sts == True:
    count = 0
    en_sts_dataset = load_dataset("stsb_multi_mt", name="en", split="dev")
    for data in en_sts_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["similarity_score"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함

        if count < 3:
            print(f"{text_a}, {text_b}, {score}")

        dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*stsb_multi_mt len: {count}')
####################################################################################################  

####################################################################################################
# 영문 GLUE 데이터 셋 설정(load_dataset)
####################################################################################################  
if use_glue_sts == True:
    count = 0
    glue_stsb_dataset = load_dataset("glue","stsb", split="validation")
    for data in glue_stsb_dataset:
        text_a = data["sentence1"]
        text_b = data["sentence2"]
        score = data["label"]
        score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
        
        if count < 3:
            print(f"{text_a}, {text_b}, {score}")
            
        dev_samples.append(InputExample(texts= [text_a,text_b], label=score))
        count += 1
    logger.info(f'*glue-stsb len: {count}')
####################################################################################################  

logger.info(f'------------------------------------------------------------------------')        
logger.info(f'*dev_samples_len:{len(dev_samples)}')
print(dev_samples[0:3])

# 2개의 bert 모델에서 구한 2개의 embedding 값들의 cosine 유사도를 구해서, 이를 실제 score와 비교해서 유사도 측정함
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, 
                                                                 batch_size=train_batch_size, 
                                                                 name='sts-dev')

2022-09-29 10:47:11,091 - s-bert - INFO - Read STS dev dataset=>../../data11/korpora/korsts/tune_dev.tsv
2022-09-29 10:47:11,097 - s-bert - INFO - *../../data11/korpora/korsts/tune_dev.tsv len: 1500
2022-09-29 10:47:11,098 - s-bert - INFO - Read STS dev dataset=>../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json
2022-09-29 10:47:11,117 - s-bert - INFO - *../../data11/korpora/klue-sts/klue-sts-v1.1_dev.json len: 519


안전모를 가진 한 남자가 춤을 추고 있다., 안전모를 쓴 한 남자가 춤을 추고 있다., 1.0
어린아이가 말을 타고 있다., 아이가 말을 타고 있다., 0.95
한 남자가 뱀에게 쥐를 먹이고 있다., 남자가 뱀에게 쥐를 먹이고 있다., 1.0
한 여성이 기타를 연주하고 있다., 한 남자가 기타를 치고 있다., 0.48
한 여성이 플루트를 연주하고 있다., 남자가 플루트를 연주하고 있다., 0.55
무엇보다도 호스트분들이 너무 친절하셨습니다., 무엇보다도, 호스트들은 매우 친절했습니다., 0.9800000000000001
주요 관광지 모두 걸어서 이동가능합니다., 위치는 피렌체 중심가까지 걸어서 이동 가능합니다., 0.27999999999999997
학생들의 균형 있는 영어능력을 향상시킬 수 있는 학교 수업을 유도하기 위해 2018학년도 수능부터 도입된 영어 영역 절대평가는 올해도 유지한다., 영어 영역의 경우 학생들이 한글 해석본을 암기하는 문제를 해소하기 위해 2016학년도부터 적용했던 EBS 연계 방식을 올해도 유지한다., 0.26
다만, 도로와 인접해서 거리의 소음이 들려요., 하지만, 길과 가깝기 때문에 거리의 소음을 들을 수 있습니다., 0.74
형이 다시 캐나다 들어가야 하니 가족모임 일정은 바꾸지 마세요., 가족 모임 일정은 바꾸지 말도록 하십시오., 0.5


Reusing dataset stsb_multi_mt (/MOCOMSYS/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
2022-09-29 10:47:12,203 - s-bert - INFO - *stsb_multi_mt len: 1500


A man with a hard hat is dancing., A man wearing a hard hat is dancing., 1.0
A young child is riding a horse., A child is riding a horse., 0.95
A man is feeding a mouse to a snake., The man is feeding a mouse to the snake., 1.0


Reusing dataset glue (/MOCOMSYS/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
2022-09-29 10:47:13,892 - s-bert - INFO - *glue-stsb len: 1500
2022-09-29 10:47:13,893 - s-bert - INFO - ------------------------------------------------------------------------
2022-09-29 10:47:13,894 - s-bert - INFO - *dev_samples_len:5019


A man with a hard hat is dancing., A man wearing a hard hat is dancing., 1.0
A young child is riding a horse., A child is riding a horse., 0.95
A man is feeding a mouse to a snake., The man is feeding a mouse to the snake., 1.0
[<sentence_transformers.readers.InputExample.InputExample object at 0x7fa9019441f0>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fa901955f40>, <sentence_transformers.readers.InputExample.InputExample object at 0x7fa901955cd0>]


In [7]:
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))

# evaluation_steps은 20%로 설정
evaluation_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.2)

logger.info(f"IN-model:{smodel_path}, OUT-model:{smodel_save_path}")
logger.info("*batch_size: {}, epoch:{}, train_dataset:{}, Warmup-steps: {}, evaluation_step: {}".format(train_batch_size, num_epochs, len(train_dataset), warmup_steps, evaluation_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup_steps,
          optimizer_params= {'lr': lr, 'eps': 1e-6, 'correct_bias': False},
          output_path=smodel_save_path)

2022-09-29 10:47:13,906 - s-bert - INFO - Warmup-steps: 8338
2022-09-29 10:47:13,909 - s-bert - INFO - IN-model:../../data11/model/sbert/sbert-mdistilbertV2.1-distil-All-NLI, OUT-model:../../data11/model/sbert/sbert-mdistilbertV2.1-distil-All-NLI-STS-1
2022-09-29 10:47:13,910 - s-bert - INFO - *batch_size: 64, epoch:128, train_dataset:41688, Warmup-steps: 8338, evaluation_step: 16675


Epoch:   0%|          | 0/128 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

Iteration:   0%|          | 0/652 [00:00<?, ?it/s]

In [8]:
##############################################################################
# 테스트 : 훈련된 모델 실제 test sts 말뭉치로 테스트 
##############################################################################
test_samples = []

# 테스트 파일=KorSTS 테스트파일 경로 지정
test_file = '../../data11/korpora/korsts/tune_test.tsv'

# 평가시 cosine 유사도등 측정 결과값 파일 (similarity_evaluation_xxxx.xls) 저장될 경로
output_path = './sts_test1'
os.makedirs(output_path, exist_ok=True)

####################################################################################################
# KorSTS 평가 데이터 셋 설정(.tsv 파일)
####################################################################################################
if use_korsts == True:
    count = 0
    logger.info(f"Read STS test dataset=>{test_file}")
    with open(test_file, 'rt', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            text_a, text_b, score = line.split('\t')
            score = score.strip()
            score = float(score) / 5.0  #5로 나눠서 0~1 사이가 되도록 함
            
            if count < 5:
                print(f"{text_a}, {text_b}, {score}")
            
            test_samples.append(InputExample(texts= [text_a,text_b], label=score))
            count += 1
    logger.info(f'*{test_file} len: {count}')
####################################################################################################  

model = SentenceTransformer(smodel_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
test_evaluator(model, output_path=output_path)

2022-09-29 13:14:07,300 - s-bert - INFO - Read STS test dataset=>../../data11/korpora/korsts/tune_test.tsv
2022-09-29 13:14:07,308 - s-bert - INFO - *../../data11/korpora/korsts/tune_test.tsv len: 1379


한 소녀가 머리를 스타일링하고 있다., 한 소녀가 머리를 빗고 있다., 0.5
한 무리의 남자들이 해변에서 축구를 한다., 한 무리의 소년들이 해변에서 축구를 하고 있다., 0.72
한 여성이 다른 여성의 발목을 재고 있다., 한 여자는 다른 여자의 발목을 측정한다., 1.0
한 남자가 오이를 자르고 있다., 한 남자가 오이를 자르고 있다., 0.8400000000000001
한 남자가 하프를 연주하고 있다., 한 남자가 키보드를 연주하고 있다., 0.3


0.867558845497177