In [1]:
#========================================================================================================================
# Example of STS (semantic textual similarity) training with the sentence-bert (sbert) CrossEncoder
# => a cross-encoder takes two sentences (sentence 1, sentence 2) as input and outputs their similarity (a value from 0 to 1)
#
# => References : https://www.sbert.net/examples/training/cross-encoder/README.html
#         https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/cross-encoder/training_stsbenchmark.py  
#========================================================================================================================
import torch 
import os
import time
import numpy as np
from os import sys
from datetime import datetime
sys.path.append('../../')
from myutils import seed_everything, GPU_info, mlogging
from torch.utils.data import DataLoader
import math

from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sentence_transformers import InputExample

# Fixed seed handed to the project helper so repeated runs are comparable
# (presumably it seeds the Python / NumPy / torch RNGs -- confirm in myutils).
seed_everything(111)

# Device selected by the project helper; per the recorded cell output it also
# prints the device name and the GPU count.
device = GPU_info()

# Project logger; the recorded cell output shows a date suffix and ".log" being
# appended to the given file path.
logger = mlogging(
    loggername="sbertcross",
    logfilename="../../../log/sbert-crossencocer-train-sts",
)

logfilepath:../../log/bwdataset_2022-04-22.log
logfilepath:../../log/qnadataset_2022-04-22.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
logfilepath:../../../log/sbert-crossencocer-train-sts_2022-04-22.log


In [None]:

# Checkpoint the cross-encoder is initialised from.
model_path = "../../../model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-ft-nli-0328/bertmodel"

# Each run writes to its own timestamped output directory.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = f"output/crossencoder-sts-train-{run_stamp}"

# Load the CrossEncoder; num_labels=1 requests a single output score per sentence pair.
model = CrossEncoder(model_path, num_labels=1)
print(model)

In [None]:
# STS data source selection: 0 -> KorSTS .tsv files, 1 -> KLUE-STS .json files
train_file_type = 1

# (train file, dev file) for each supported data source
sts_file_pairs = {
    0: ('../../../korpora/korsts/tune_train.tsv',
        '../../../korpora/korsts/tune_dev.tsv'),
    1: ('../../../korpora/klue-sts/klue-sts-v1.1_train.json',
        '../../../korpora/klue-sts/klue-sts-v1.1_dev.json'),
}

# Any other train_file_type value leaves train_file / eval_file unassigned.
if train_file_type in sts_file_pairs:
    train_file, eval_file = sts_file_pairs[train_file_type]

# Training hyperparameters
train_batch_size, num_epochs = 32, 10

In [None]:
import json

# Load the STS training pairs.
# => every row becomes InputExample(texts=[sentence1, sentence2], label=score)
logger.info(f"Read STS train dataset=>{train_file}")

train_samples = []
count = 0  # samples loaded so far; also limits the preview print to the first 5 rows

# KorSTS .tsv file: each line is split on tabs into (sentence1, sentence2, score)
if train_file_type == 0:
    with open(train_file, "rt", encoding="utf-8") as f:
        for line in f:
            # Both sentences must be unpacked into distinct names. Unpacking into
            # text_a twice drops sentence1, leaves text_b undefined (NameError in the
            # preview print) and would pair every sentence with itself.
            text_a, text_b, score = line.split('\t')
            score = float(score.strip()) / 5.0  # gold score is on a 0-5 scale; rescale to 0-1

            if count < 5:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts=[text_a, text_b], label=score))
            count += 1

# KLUE-STS .json file: a list of records with "sentence1", "sentence2" and labels.label
elif train_file_type == 1:
    with open(train_file, "rt", encoding="utf-8") as f:
        datas = json.load(f)
        for data in datas:
            text_a = data["sentence1"]
            text_b = data["sentence2"]
            score = float(data["labels"]["label"]) / 5.0  # rescale 0-5 to 0-1

            if count < 5:
                print(f"{text_a}, {text_b}, {score}")

            train_samples.append(InputExample(texts=[text_a, text_b], label=score))
            count += 1

# Quick look at the first few constructed examples
print(train_samples[0:3])

In [None]:
# Build the development set that is evaluated during training.
# Gold similarity scores are on a 0-5 scale (5.0 = most similar) and are rescaled to 0-1.
logger.info(f"Read STS dev dataset=>{eval_file}")
dev_samples = []
count = 0

if train_file_type == 0:
    # KorSTS .tsv file: each line is split on tabs into (sentence1, sentence2, score)
    with open(eval_file, 'rt', encoding='utf-8') as f:
        for line in f.readlines():
            text_a, text_b, raw_score = line.split('\t')
            score = float(raw_score.strip()) / 5.0

            # echo only the first five pairs
            if len(dev_samples) < 5:
                print(f"{text_a}, {text_b}, {score}")

            dev_samples.append(InputExample(texts=[text_a, text_b], label=score))

elif train_file_type == 1:
    # KLUE-STS .json file: a list of records with "sentence1", "sentence2" and labels.label
    with open(eval_file, "rt", encoding="utf-8") as f:
        for record in json.load(f):
            text_a = record["sentence1"]
            text_b = record["sentence2"]
            score = float(record["labels"]["label"]) / 5.0

            # echo only the first five pairs
            if len(dev_samples) < 5:
                print(f"{text_a}, {text_b}, {score}")

            dev_samples.append(InputExample(texts=[text_a, text_b], label=score))

# Number of dev pairs that were loaded
count = len(dev_samples)

print(dev_samples[0:3])

In [None]:
# Training data loader and dev-set evaluator.
# train_samples is a list of InputExample; wrap it in a shuffling PyTorch DataLoader.
train_dataloader = DataLoader(
    dataset=train_samples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Evaluator that is passed to model.fit() to score the model on the dev pairs during training.
evaluator = CECorrelationEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

In [None]:
# Start training.
# => the model and the evaluator's CECorrelationEvaluator_sts-dev_results.csv are created under model_save_path

# Warm-up lasts for 10% of all training steps (batches per epoch * number of epochs).
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logger.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
fit_options = {
    "train_dataloader": train_dataloader,
    "evaluator": evaluator,
    "epochs": num_epochs,
    "warmup_steps": warmup_steps,
    "output_path": model_save_path,
}
model.fit(**fit_options)

In [4]:
##############################################################################
#
# Load the stored model and evaluate its performance on the STS test set
# => reloads the cross-encoder found at model_save_path and scores it
##############################################################################
import time

# To evaluate a different checkpoint, point model_save_path at it before running, e.g.:
#model_save_path = "../../../model/bert/crossencoder-sts-train-2022-04-22_13-20"
#model_save_path = "../../../model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-ft-nli-0328/bertmodel"

test_file = '../../../korpora/korsts/tune_test.tsv'

# Read the test pairs: each line is split on tabs into (sentence1, sentence2, score),
# and the score is divided by 5.0.
test_samples = []
with open(test_file, 'rt', encoding='utf-8') as fIn:
    for row in fIn:
        first, second, gold = row.split('\t')
        test_samples.append(
            InputExample(texts=[first, second], label=float(gold.strip()) / 5.0)
        )

# The timer starts before the model is loaded, so the reported time covers
# model loading as well as the evaluation itself.
start = time.time()
model = CrossEncoder(model_save_path)

evaluator = CECorrelationEvaluator.from_input_examples(test_samples, name='sts-test')
result = evaluator(model)

# Log the checkpoint path, the evaluator's score and the elapsed time.
logger.info("\n")
logger.info(f"model path: {model_save_path}")
logger.info(f'=== result: {result} ===')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info("==============================================")
logger.info("\n")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../../../model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-ft-nli-0328/bertmodel and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2022-04-22 13:35:23,421 - sbertcross - INFO - 

2022-04-22 13:35:23,422 - sbertcross - INFO - model path: ../../../model/classification/bmc-fpt-wiki_20190620_mecab_false_0311-nouns-0327-ft-nli-0328/bertmodel
2022-04-22 13:35:23,423 - sbertcross - INFO - === result: 0.3309727933677876 ===
2022-04-22 13:35:23,424 - sbertcross - INFO - === 처리시간: 3.439 초 ===
2022-04-22 13:35:23,425 - sbertcross - INFO - 

