In [1]:
import torch
import transformers
import pandas as pd
import pytorch_lightning as pl
import os
import yaml
from models.model import Model
from utils import utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Change this to the path of the result folder of the model you want to inspect.
result_path = '/opt/level1_semantictextsimilarity-nlp-06/results/2023-04-19-16:22:56_UJ'

In [5]:
# No changes needed here — just run this cell and move on.
def find_files(folder_path):
    """Locate a checkpoint file and a YAML file underneath ``folder_path``.

    The tree is traversed with ``os.walk``. A file whose name ends with
    ``"last.ckpt"`` is recorded as the checkpoint; otherwise a file whose name
    ends with ``".yaml"`` is recorded as the config. A later match replaces an
    earlier one of the same kind, and the search stops as soon as one of each
    has been recorded.

    Args:
        folder_path: Directory to search recursively.

    Returns:
        A ``(ckpt_path, yaml_path)`` tuple of absolute paths. An entry is the
        empty string when no matching file was found.
    """
    checkpoint, config = "", ""
    for directory, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            if filename.endswith("last.ckpt"):
                checkpoint = os.path.abspath(os.path.join(directory, filename))
            elif filename.endswith(".yaml"):
                config = os.path.abspath(os.path.join(directory, filename))
            # Stop at the first moment both kinds of file are known.
            if checkpoint and config:
                return checkpoint, config
    return checkpoint, config

def tokenizing(df):
    """Tokenize every row of ``df`` and return the resulting ``input_ids``.

    Relies on two module-level names: ``text_columns`` (the columns whose values
    are combined) and ``tokenizer`` (the callable that encodes the text).

    Args:
        df: DataFrame containing every column listed in ``text_columns``.

    Returns:
        A list with one ``input_ids`` entry per row, in row order.
    """
    def _encode(row):
        # Join the input sentences with a [SEP] token before encoding them as one text.
        joined = '[SEP]'.join(row[column] for column in text_columns)
        encoded = tokenizer(joined, add_special_tokens=True, padding='max_length', truncation=True)
        return encoded['input_ids']

    return [_encode(row) for _, row in df.iterrows()]

# Load the project's data splits and locate the checkpoint / config of the run under inspection.
train_df, val_df, predict_df = utils.get_data()
ckpt_path, yaml_path = find_files(result_path)

# Use the GPU when one is available, otherwise fall back to CPU so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = Model.load_from_checkpoint(ckpt_path)
model = model.to(device)
model.eval()

# NOTE(review): the tokenizer settings come from the baseline config, not from `yaml_path`
# (the YAML found inside the result folder), which is otherwise unused. If the inspected run
# was trained with a different model_name / max_len, the tokenizer will not match the
# checkpoint — confirm which config is intended.
# NOTE(review): yaml.safe_load is the safer choice if this config only holds plain data.
with open('baselines/baseline_config.yaml') as f:
    CFG = yaml.load(f, Loader=yaml.FullLoader)
tokenizer = transformers.AutoTokenizer.from_pretrained(CFG['train']['model_name'], model_max_length=CFG['train']['max_len'])
text_columns = ['sentence_1', 'sentence_2']


val_df = val_df[['sentence_1', 'sentence_2', 'label']]

# Rows per forward pass; the batches cover the whole validation set whatever its length.
BATCH_SIZE = 55

result_list = []
# Inference only: disable autograd so no graph is kept for each batch.
with torch.no_grad():
    for start in range(0, len(val_df), BATCH_SIZE):
        sample_df = val_df.iloc[start:start + BATCH_SIZE]
        input_ids = torch.tensor(tokenizing(sample_df)).to(device)
        result = model(input_ids)
        # reshape(-1) keeps a 1-D list even for a single-row batch, where squeeze() would
        # collapse to a scalar. Assumes the model emits one score per row — TODO confirm.
        result_list.extend(result.detach().cpu().reshape(-1).tolist())

# assign() attaches predictions positionally and raises on a length mismatch, instead of
# silently index-aligning (and introducing NaNs) as an axis=1 concat would.
predict_df = val_df.assign(predict=result_list)

# Column 0 holds the absolute error |label - predict|; rows are ordered worst-first.
sorted_df = predict_df.copy()
sorted_df[0] = (predict_df['label'] - predict_df['predict']).abs()
sorted_df = sorted_df.sort_values([0], ascending=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-SBERT-V40K-klueNLI-augSTS and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Show the 10 validation rows with the largest absolute prediction error.
sorted_df.head(10)

Unnamed: 0,sentence_1,sentence_2,label,predict,0
195,국가 부채를 줄입시다.,기초 단체의원을 줄입시다.,0.0,5.020357,5.020357
295,주택 과다보유 금지,주택 과잉 소유 금지,4.6,0.160749,4.439251
386,구글시트에 바로 적어주셔도 좋습니다!,"구독 시작하시면, 여기에만 올려주셔도 됩니다!!",0.0,4.267386,4.267386
75,위대하신 문대통령님!,위대한 문재인 대통령!,4.8,0.712141,4.087859
447,자브라 대만족이죠!,Jabra에 매우 만족합니다!,4.2,0.191302,4.008698
161,어머 너무 예쁘네요.,아 넘 이쁘다..ㅠㅠ,4.0,0.014797,3.985203
514,중소기업 지원 제도 폐지,중소기업 지원제도 폐지,5.0,1.042229,3.957771
515,덕분에 비즈팀과 SWE 팀에서 고민하고 있는 문제를 어느정도 맛볼 수 있었습니다.,"서로 입사하게된 계기에 대해 이야기하고, 궁금했던 추천팀 이야기도 들을 수 있었습니다.",0.2,4.092525,3.892525
274,진짜 이건 아닌거같음..,.? 이건 아니지..,4.0,0.26959,3.73041
507,눈물이 난다...T T,눈물난다...TT,4.8,1.105719,3.694281


In [10]:
# Write the error-sorted validation predictions to a CSV file (relative path, so it lands in the working directory).
sorted_df.to_csv('val_inference.csv', encoding='utf-8')