In [1]:
import re
import emoji
from soynlp.normalizer import repeat_normalize
from transformers import ElectraForSequenceClassification, ElectraTokenizer, ElectraConfig
import torch.nn as nn
import pandas as pd

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from fastprogress.fastprogress import master_bar, progress_bar
from attrdict import AttrDict

from finetune.processor import seq_cls_tasks_num_labels as tasks_num_labels
from finetune.processor import seq_cls_processors as processors
from finetune.processor import seq_cls_output_modes as output_modes

from finetune.processor.seq_cls import seq_cls_convert_examples_to_features
from finetune.processor.seq_cls import InputExample


import json
import torch
from torch.utils.data import TensorDataset
import numpy as np

In [2]:
def compute_metrics(task_name, labels, preds):
    assert len(preds) == len(labels)
    if task_name == "kornli":
        return acc_score(labels, preds)
    elif task_name == "nsmc":
        return acc_score(labels, preds)
    elif task_name == "paws":
        return acc_score(labels, preds)
    elif task_name == "korsts":
        return pearson_and_spearman(labels, preds)
    elif task_name == "question-pair":
        return acc_score(labels, preds)
    elif task_name == "naver-ner":
        return f1_pre_rec(labels, preds, is_ner=True)
    elif task_name == "hate-speech":
        return f1_pre_rec(labels, preds, is_ner=False)
    else:
        raise KeyError(task_name)
        
def acc_score(labels, preds):
    return {
        "acc": simple_accuracy(labels, preds),
    }
def simple_accuracy(labels, preds):
    return (labels == preds).mean()

In [3]:
class NsmcProcessor(object):
    """Processor for the NSMC data set """

    def __init__(self, args, df):
        self.args = args

    def get_labels(self):
        return ["0", "1"]

    @classmethod
    def _read_file(cls, df):
        """Reads a tab separated value file."""
        lines = []
        
        for i in range(len(train_df)):
            line = train_df.iloc[i]
            lines.append(line[['reviewId', 'content', 'score']].to_string(header=False,index=False).strip())
        return lines
    
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines[1:]):
            line = line.split("\n")
            guid = "%s-%s" % (set_type, i)
            text_a = line[1].strip()
            label = line[2].strip()
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def get_examples(self, mode, df):
        """
        Args:
            mode: train, dev, test
        """
        file_to_read = None
        if mode == "train":
            file_to_read = self.args.train_file
        elif mode == "dev":
            file_to_read = self.args.dev_file
        elif mode == "test":
            file_to_read = self.args.test_file

        return self._create_examples(
            self._read_file(df), mode
        )

In [126]:
def infer(x, path) :
    model = ElectraForSequenceClassification.from_pretrained(path)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())  
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    processed = pattern.sub(' ', x)
    processed = url_pattern.sub(' ', processed)
    processed = processed.strip()
    processed = repeat_normalize(processed, num_repeats=2)

    tokenized = tokenizer(processed, return_tensors='pt')

    output = model(tokenized.input_ids, tokenized.attention_mask)
    return nn.functional.softmax(output.logits, dim=-1)

In [170]:
def seq_cls_load_and_cache_examples(args, tokenizer, mode, df):
    processor = NsmcProcessor(args, df)
    # Load data features from cache or dataset file

    examples = processor.get_examples("test", df)
    features = seq_cls_convert_examples_to_features(args, examples, tokenizer, max_length=args.max_seq_len, task=args.task)
    
    #torch_tensor_output = torch.tensor(df['output'].values)
    
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

In [171]:
train_df = pd.read_csv('/home/hjgp/research/NH_intern/reviews_namu.csv')

In [172]:
train_df['score'] = 1

In [173]:
with open('/home/hjgp/research/KoELECTRA/finetune/config/nsmc/koelectra-small-v3.json') as f:
    args = AttrDict(json.load(f))

processor = processors['nsmc'](args)

labels = processor.get_labels()
config = ElectraConfig.from_pretrained(
    '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/',
    num_labels=2,
    id2label={str(i): label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)},
)


tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v3-discriminator",
)
model = ElectraForSequenceClassification.from_pretrained(
    '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/',
    config=config
)

test_dataset = seq_cls_load_and_cache_examples(args, tokenizer, mode="test", df=train_df) ##train_df : 테스트하고싶은 df

In [174]:
test_dataset[-2]

(tensor([    2, 22864,  6764,  ...,     0,     0,     0]),
 tensor([1, 1, 1,  ..., 0, 0, 0]),
 tensor([0, 0, 0,  ..., 0, 0, 0]),
 tensor(1))

In [175]:
def test(args, model, test_dataset):
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    preds = None
    zz = 0
    for batch in progress_bar(test_dataloader):
        zz += 1
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }

            outputs = model(**inputs)
            logits = outputs[0]

        if preds is None:
            preds = nn.functional.softmax(logits, dim=-1).detach().cpu().numpy()
        else:
            preds = np.append(preds, nn.functional.softmax(logits, dim=-1).detach().cpu().numpy(), axis=0)
        

#     result = compute_metrics(args.task, out_label_ids, preds)
#     results.update(result)

    return preds, zz

In [176]:
args.device = "cuda"
result, zz = test(args, model.cuda(), test_dataset)

RuntimeError: The expanded size of the tensor (2048) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [100, 2048].  Tensor sizes: [1, 512]

In [None]:
zz

In [None]:
result_list = []

for i in result:
    result_list.append(i[1])

In [None]:
len(result_list)

In [161]:
result_list.append(1)

In [162]:
train_df['nsmc'] = result_list

In [163]:
train_df['content'][1]

'카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ID해지하고 재등록하라구해서 해지하고나서 재등록하려는데 계좌번호입력창에서 숫자가 11자리밖에 입력이 안되서 재등록이 안됩니다.. 그래서 접속이 불가능한데 어떻게 접속해야하나요?'

In [164]:
train_df.to_csv('tmp.csv', index=False, encoding='utf-8-sig')

In [165]:
tmp = infer(train_df['content'][1], '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/')

In [166]:
tmp[0]

tensor([0.9921, 0.0079], grad_fn=<SelectBackward0>)

In [167]:
result

array([[9.9700373e-01, 2.9962957e-03],
       [9.9898189e-01, 1.0181338e-03],
       [9.5161390e-01, 4.8386134e-02],
       ...,
       [9.9904662e-01, 9.5336186e-04],
       [8.5410187e-03, 9.9145901e-01],
       [9.9907494e-01, 9.2508737e-04]], dtype=float32)

In [168]:
train_df['score'][:9]

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
Name: score, dtype: int64

In [36]:
 1

'이거 어플 왜이렇게 멈춤현상 렉걸림이ㅡ심한가요'

In [22]:
train_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,date
0,a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6,김선구,https://play-lh.googleusercontent.com/a/AItbvm...,"고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 ...",3,0,8.90,2022-08-08 07:21:10,,,202208
1,0df5ae4e-ff08-434a-b7a2-62da3181b586,정수정,https://play-lh.googleusercontent.com/a/AItbvm...,카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ...,1,0,8.88,2022-08-07 23:38:18,,,202208
2,c3169719-6a19-4995-92f2-2eca597ef652,zenter,https://play-lh.googleusercontent.com/a-/AFdZu...,"PC용까지 쓰려면 믿고 걸러야 합니다. 600만원짜리 PC, 3개의 HTS 사용자지...",1,0,8.90,2022-08-05 23:52:17,,,202208
3,cda3efef-4429-4007-b1cb-14bd6fd42b4b,홍길동,https://play-lh.googleusercontent.com/a/AItbvm...,미장 주식 업데이트 할때마다 회사 주주정보에 총주식수와 유동주식수 나오던게 업데이트...,2,0,8.90,2022-08-05 23:22:37,,,202208
4,996572c2-f4cb-4efe-a400-40b0f0123413,이용현,https://play-lh.googleusercontent.com/a/AItbvm...,해외주식도 차트에 매매내역을 남길 수 있도록 해주세요! 꼭 부탁드려요!,5,0,8.90,2022-08-05 15:22:57,,,202208
...,...,...,...,...,...,...,...,...,...,...,...
5009,4c5a5486-9f87-4571-a97e-ba5353cddccc,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,초기 편집이랑 설정잘해놓으면 hts못지않네요 아쉬운건 수수료가 비싸다는 점..,5,0,1.15,2012-03-27 02:23:04,,,201203
5010,9100e8a6-7d71-4ea7-a972-145d5c1d1821,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,피씨버전을하는듯하네요 상당한 발전이군요,5,1,,2012-03-16 18:39:06,,,201203
5011,e620e368-649a-4d0d-a772-c4cab76470e1,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,실행 아주 빠릅니다. 계정이 없어도 시세조회 가능합니다. 한번 로그인하면 별도의 로...,5,6,1.06,2011-11-10 20:40:07,,,201111
5012,471a8ecd-a646-4435-8246-3c924edd647b,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,로그인없이 시세 조회도 가능하고 일단 속도가 다른 어플에 비해서 빠르네요^^,5,2,1.05,2011-11-01 15:28:32,,,201111


In [46]:
ll = train_df.iloc[0][['reviewId', 'content', 'score']].to_string(header=False,index=False).strip()
ll

'a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6\n고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 수...\n                                                 3'

In [42]:
ll.split("\t")

['reviewId                 a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6\ncontent     고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 ...\nscore                                                       3\nName: 0, dtype: object']

In [32]:
train_df.iloc[1]

reviewId                             0df5ae4e-ff08-434a-b7a2-62da3181b586
userName                                                              정수정
userImage               https://play-lh.googleusercontent.com/a/AItbvm...
content                 카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ...
score                                                                   1
thumbsUpCount                                                           0
reviewCreatedVersion                                                 8.88
at                                                    2022-08-07 23:38:18
replyContent                                                          NaN
repliedAt                                                             NaN
date                                                               202208
Name: 1, dtype: object

In [33]:
for i in range(10):
    print(infer(train_df['content'][:10][i],'/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/'))

tensor([[0.1503, 0.8497]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9921, 0.0079]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9965e-01, 3.5184e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9986, 0.0014]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0178, 0.9822]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9952e-01, 4.8041e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9961e-01, 3.9457e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0081, 0.9919]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)
