In [2]:
import re
import emoji
from soynlp.normalizer import repeat_normalize
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch.nn as nn
import pandas as pd

In [30]:
def infer(x, path) :
    model = ElectraForSequenceClassification.from_pretrained(path)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())  
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    processed = pattern.sub(' ', x)
    processed = url_pattern.sub(' ', processed)
    processed = processed.strip()
    processed = repeat_normalize(processed, num_repeats=2)

    tokenized = tokenizer(processed, return_tensors='pt')

    output = model(tokenized.input_ids, tokenized.attention_mask)
    return nn.functional.softmax(output.logits, dim=-1)

In [3]:
text = '요딴 걸 영화라고 만들어놨네'
print(infer(text,'/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/'))

[tensor([[0.1159, 0.8841]], grad_fn=<SoftmaxBackward0>), tensor([[0.9903, 0.0097]], grad_fn=<SoftmaxBackward0>), tensor([[0.5789, 0.4211]], grad_fn=<SoftmaxBackward0>), tensor([[0.9829, 0.0171]], grad_fn=<SoftmaxBackward0>), tensor([[0.5789, 0.4211]], grad_fn=<SoftmaxBackward0>), tensor([[0.9718, 0.0282]], grad_fn=<SoftmaxBackward0>), tensor([[0.3979, 0.6021]], grad_fn=<SoftmaxBackward0>), tensor([[0.4374, 0.5626]], grad_fn=<SoftmaxBackward0>), tensor([[0.0695, 0.9305]], grad_fn=<SoftmaxBackward0>), tensor([[0.5789, 0.4211]], grad_fn=<SoftmaxBackward0>), tensor([[0.8015, 0.1985]], grad_fn=<SoftmaxBackward0>), tensor([[0.3722, 0.6278]], grad_fn=<SoftmaxBackward0>), tensor([[0.6336, 0.3664]], grad_fn=<SoftmaxBackward0>), tensor([[0.9964, 0.0036]], grad_fn=<SoftmaxBackward0>), tensor([[0.3432, 0.6568]], grad_fn=<SoftmaxBackward0>)]


In [27]:
train_df = pd.read_csv('/home/hjgp/research/NH_intern/reviews_namu.csv')

In [28]:
result = infer(train_df['content'][:10], '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/')

In [11]:
train_df['score'][:10]

0    3
1    1
2    1
3    2
4    5
5    5
6    1
7    5
8    1
9    1
Name: score, dtype: int64

In [32]:
train_df['content'][:10]

0    고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 ...
1    카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ...
2    PC용까지 쓰려면 믿고 걸러야 합니다. 600만원짜리 PC, 3개의 HTS 사용자지...
3    미장 주식 업데이트 할때마다 회사 주주정보에 총주식수와 유동주식수 나오던게 업데이트...
4              해외주식도 차트에 매매내역을 남길 수 있도록 해주세요! 꼭 부탁드려요!
5                                                  쉽다.
6                       느려 터져 답답해.정보를 다른 증권사 앱을 이용하다니.
7                                      너무너무너무너무너무 좋습니다
8                            이거 어플 왜이렇게 멈춤현상 렉걸림이ㅡ심한가요
9    해외주식거래 LOC매수를 엄청이용하고있는데요 요 며칠 계속 매수가 안되고있네요?? ...
Name: content, dtype: object

In [29]:
result

[0.15034963190555573,
 0.9920762777328491,
 0.9996480941772461,
 0.9986039996147156,
 0.01784360036253929,
 0.9995195865631104,
 0.9996054768562317,
 0.008082479238510132,
 0.9989297986030579,
 0.998897910118103]

In [33]:
for i in range(10):
    print(infer(train_df['content'][:10][i],'/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/'))

tensor([[0.1503, 0.8497]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9921, 0.0079]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9965e-01, 3.5184e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9986, 0.0014]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0178, 0.9822]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9952e-01, 4.8041e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9961e-01, 3.9457e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0081, 0.9919]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)
