In [1]:
import torch
import transformers
import pandas as pd
import pytorch_lightning as pl
import os
import yaml
from models.model import Model
from utils import utils

In [2]:
# Change this to the path of the result folder of the model you want to inspect.
result_path = '/opt/ml/level1_semantictextsimilarity-nlp-06/results/2023-04-19-15:31:39_SE'

In [4]:
# Nothing to edit in this cell - just run it and move on.
def find_files(folder_path):
    """Walk ``folder_path`` and locate a checkpoint file and a YAML file.

    A file whose name ends with ``last.ckpt`` is recorded as the checkpoint;
    any other file whose name ends with ``.yaml`` is recorded as the YAML
    file. A later match overwrites an earlier one, and the walk stops as soon
    as both kinds have been seen.

    Args:
        folder_path: Directory to search recursively.

    Returns:
        A ``(ckpt_path, yaml_path)`` tuple of absolute paths. An entry is the
        empty string when no matching file was found.
    """
    checkpoint = ""
    config = ""
    for dirpath, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            full_path = os.path.abspath(os.path.join(dirpath, filename))
            if filename.endswith("last.ckpt"):
                checkpoint = full_path
            elif filename.endswith(".yaml"):
                config = full_path
            # Both found: no need to look at any further files or folders.
            if checkpoint and config:
                return checkpoint, config
    return checkpoint, config

def tokenizing(df):
    """Tokenize every row of ``df`` and collect the resulting ``input_ids``.

    For each row, the values of the columns named in the module-level
    ``text_columns`` are joined with the literal string ``'[SEP]'`` and passed
    to the module-level ``tokenizer`` with special tokens added, padding to
    ``max_length`` and truncation enabled.

    Args:
        df: DataFrame containing every column listed in ``text_columns``.

    Returns:
        A list with one ``outputs['input_ids']`` entry per row, in row order.
    """
    def encode(row):
        # Concatenate the input sentences with a [SEP] marker before encoding.
        joined = '[SEP]'.join([row[column] for column in text_columns])
        encoded = tokenizer(joined, add_special_tokens=True, padding='max_length', truncation=True)
        return encoded['input_ids']

    return [encode(row) for _, row in df.iterrows()]

# --- Load data and locate the checkpoint -----------------------------------
train_df, val_df, predict_df = utils.get_data()
ckpt_path, yaml_path = find_files(result_path)
if not ckpt_path:
    # Fail early with a clear message instead of an obscure loader error.
    raise FileNotFoundError(f"No '*last.ckpt' file found under {result_path!r}")

# --- Restore the model ------------------------------------------------------
# Use the GPU when present, otherwise fall back to CPU so the cell still runs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model.load_from_checkpoint(ckpt_path, map_location=device)
model = model.to(device)
model.eval()

# --- Build the tokenizer ----------------------------------------------------
# NOTE(review): `yaml_path` (the YAML found inside the result folder) is never
# used; the tokenizer is configured from the baseline config instead. If the
# checkpoint was trained with a different `model_name` / `max_len`, tokenizer
# and model will not match - confirm whether `yaml_path` should be opened here.
with open('baselines/baseline_config.yaml') as f:
    CFG = yaml.load(f, Loader=yaml.FullLoader)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    CFG['train']['model_name'], model_max_length=CFG['train']['max_len']
)
text_columns = ['sentence_1', 'sentence_2']

# --- Run inference on the validation set -----------------------------------
# Reset to a 0..n-1 index so the column-wise concat below lines predictions up
# with their rows regardless of the index `utils.get_data()` returned.
val_df = val_df[['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)

BATCH_SIZE = 55
result_list = []
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    # Step over the whole frame instead of assuming exactly 10 batches of 55.
    for start in range(0, len(val_df), BATCH_SIZE):
        sample_df = val_df.iloc[start:start + BATCH_SIZE]
        batch = torch.tensor(tokenizing(sample_df)).to(device)
        result = model(batch)
        # reshape(-1) always yields a list, even for a final batch of one row
        # (squeeze() would collapse that to a bare float and break extend()).
        # Assumes the model emits one score per input row - TODO confirm.
        result_list.extend(result.detach().cpu().reshape(-1).tolist())

result_df = pd.DataFrame({'predict': result_list})
predict_df = pd.concat([val_df, result_df], axis=1)

# --- Rank rows by absolute prediction error ---------------------------------
# The difference of two differently named Series is unnamed, so concat labels
# it as column `0`; that label is kept because it is what sort_values uses.
abs_error = (predict_df['label'] - predict_df['predict']).abs()
sorted_df = pd.concat([predict_df, abs_error], axis=1).sort_values([0], ascending=False)

Some weights of the model checkpoint at snunlp/KR-ELECTRA-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.dense

In [7]:
# Inspect the 20 validation pairs with the largest |label - predict|
# (sorted_df is ordered by that column, descending).
sorted_df.head(20)

Unnamed: 0,sentence_1,sentence_2,label,predict,0
447,자브라 대만족이죠!,Jabra에 매우 만족합니다!,4.2,1.225088,2.974912
171,(선언만 해두고 차후에 적정시점 실행),(나중에 적절한 시점에 선언하고 실행하면 됨),1.2,3.86106,2.66106
29,엥 제가 2등인가요? ㅋㅋ,엥 은메달인가요 제가? ㅋㅋ,2.6,0.5221,2.0779
288,과장과 공포에 굴복하여 도리어 국민생명을 위협하고 있는 비이성적인 '독일 따라하기'...,과장과 공포에 굴복하여 국민의 생명을 위협하는 불합리한 '독일을 모방'하는 비핵화 ...,2.4,4.24143,1.84143
390,편보다 편이 더 재밌는 영화... 편부터는 보지마,본편보다 영화가 더 재밌어... 본편은 보지마,1.6,3.392653,1.792653
65,ㅋㅋ 실습까지 ㅋㅋ,ㅎㅎ 연습까지,1.8,3.559879,1.759879
130,국회의원 월급 삭감,국회의원 급여 미지급,1.8,3.472321,1.672321
88,국가고시 가산점 폐지,518 공무원 가산제 폐지,2.4,0.736729,1.663271
420,아무래도 상관 없어요.,그것은 정말 중요하지 않습니다.,1.8,0.14365,1.65635
454,갈수록 뛰는 시간이 늘 것 같은 설렘!!,러닝타임이 길어질수록 더해지는 것 같은 설렘!!,2.2,3.831324,1.631324


In [10]:
# sorted_df.to_csv('val_inference.csv', encoding='utf-8')