In [19]:
import re
import emoji
from soynlp.normalizer import repeat_normalize
from transformers import ElectraForSequenceClassification, ElectraTokenizer, ElectraConfig
import torch.nn as nn
import pandas as pd

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from fastprogress.fastprogress import master_bar, progress_bar
from attrdict import AttrDict

from finetune.processor import seq_cls_tasks_num_labels as tasks_num_labels
from finetune.processor import seq_cls_processors as processors
from finetune.processor import seq_cls_output_modes as output_modes

from finetune.processor.seq_cls import seq_cls_convert_examples_to_features
from finetune.processor.seq_cls import InputExample


import json
import torch
from torch.utils.data import TensorDataset
import numpy as np

In [20]:
def compute_metrics(task_name, labels, preds):
    assert len(preds) == len(labels)
    if task_name == "kornli":
        return acc_score(labels, preds)
    elif task_name == "nsmc":
        return acc_score(labels, preds)
    elif task_name == "paws":
        return acc_score(labels, preds)
    elif task_name == "korsts":
        return pearson_and_spearman(labels, preds)
    elif task_name == "question-pair":
        return acc_score(labels, preds)
    elif task_name == "naver-ner":
        return f1_pre_rec(labels, preds, is_ner=True)
    elif task_name == "hate-speech":
        return f1_pre_rec(labels, preds, is_ner=False)
    else:
        raise KeyError(task_name)
        
def acc_score(labels, preds):
    return {
        "acc": simple_accuracy(labels, preds),
    }
def simple_accuracy(labels, preds):
    return (labels == preds).mean()

In [21]:
# lines = []

# for i in range(len(train_df)):
#     line = train_df.iloc[i]
#     line_review1 = line[['reviewId']].to_string(header=False,index=False).strip()
#     line_review2 = line[['content']].to_string(header=False,index=False).strip()
#     line_review3 = line[['score']].to_string(header=False,index=False).strip()
    
#     lines.append(line_review1 + '\t' + line_review2 + '\t' + line_review3)

In [4]:
# line = train_df.iloc[1]
# line[['reviewId']].to_string(header=False,index=False).strip() + '11'

In [5]:
# line[['reviewId', 'content', 'score']].to_string(header=False,index=False).strip()

In [6]:
# lines.replace('\n', '\t')

In [22]:
with open('/home/hjgp/research/KoELECTRA/finetune/data/nsmc/namu_test.txt', "r", encoding="utf-8") as f:
    lines = []
    for line in f:
        lines.append(line.strip())


In [4]:
lines[-10:]

['98a949ac-bd90-476d-82c3-60a951d1cab0\t타사는 스마트거래시 수수료싸든데 여기도치별점\t1',
 'cf10f289-0a43-436a-af4d-e3323d1459cd\tㅋㅋ\t1',
 '36c2d9c6-b896-48f0-8de8-e73b4065651c\t부탁드려요!\t1',
 '279e70fb-4aff-429a-9b58-e59dd9927224\t주식초보자인데 직관적 UI가 편해요\t1',
 'ac780c2d-351a-42fb-be64-ef298cab823d\t처음엔 좋았는데 업글되더니 형편없음.\t0',
 '4c5a5486-9f87-4571-a97e-ba5353cddccc\t아쉬운건 수수료가 비싸다는 점..\t1',
 '9100e8a6-7d71-4ea7-a972-145d5c1d1821\t상당한 발전이군요\t1',
 'e620e368-649a-4d0d-a772-c4cab76470e1\t시세알림 기능도 유용합니다.\t1',
 '471a8ecd-a646-4435-8246-3c924edd647b\t로그인없이 시세 조회도 가능하고  일단 속도가 다른 어플에 비해서 빠르네요^^\t1',
 'edd3bbc0-03e2-4a8a-aa32-7e8a7481d7e0\t부탁드립니다.\t1']

In [265]:
class NsmcProcessor(object):
    """Processor for the NSMC data set """

    def __init__(self, args, df):
        self.args = args

    def get_labels(self):
        return ["0", "1"]

    @classmethod
    def _read_file(cls, df):
        """Reads a tab separated value file."""
        #lines = df
        lines = []

        for i in range(len(df)):
            line = df.iloc[i]
            line_review1 = line[['reviewId']].to_string(header=False,index=False).strip()
            line_review2 = line[['content']].to_string(header=False,index=False).strip()
            line_review3 = line[['score']].to_string(header=False,index=False).strip()

            lines.append(line_review1 + '\t' + line_review2 + '\t' + line_review3)
        return lines
    
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines[0:]):
            line = line.split("\t")
            guid = "%s-%s" % (set_type, i)
            text_a = line[1].strip()
            label = line[2].strip()
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def get_examples(self, mode, df):
        """
        
        Args:
            mode: train, dev, test
        """
        file_to_read = None
        if mode == "train":
            file_to_read = self.args.train_file
        elif mode == "dev":
            file_to_read = self.args.dev_file
        elif mode == "test":
            file_to_read = self.args.test_file

        return self._create_examples(
            self._read_file(df), mode
        )

In [266]:
def infer(x, path) :
    model = ElectraForSequenceClassification.from_pretrained(path)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())  
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    processed = pattern.sub(' ', x)
    processed = url_pattern.sub(' ', processed)
    processed = processed.strip()
    processed = repeat_normalize(processed, num_repeats=2)

    tokenized = tokenizer(processed, return_tensors='pt')

    output = model(tokenized.input_ids, tokenized.attention_mask)
    return nn.functional.softmax(output.logits, dim=-1)

In [267]:
def seq_cls_load_and_cache_examples(args, tokenizer, mode, df):
    processor = NsmcProcessor(args, df)
    # Load data features from cache or dataset file

    examples = processor.get_examples("test", df)
    features = seq_cls_convert_examples_to_features(args, examples, tokenizer, max_length=args.max_seq_len, task=args.task)
    
    #torch_tensor_output = torch.tensor(df['output'].values)
    
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
    return dataset

In [9]:
train_df = pd.read_csv('/home/hjgp/research/NH_intern/filtered_for_nsmc.csv')

In [10]:
train_df['score'] = train_df['score'].replace(1, 0)
train_df['score'] = train_df['score'].replace(2, 0)
train_df['score'] = train_df['score'].replace(4, 1)
train_df['score'] = train_df['score'].replace(5, 1)

In [11]:
train_df['content'] = train_df['content'].str.replace('\n', '')

In [12]:
# train_df.to_csv('/home/hjgp/research/KoELECTRA/finetune/data/nsmc/reviews_finetuning.txt', sep = '\t', index=False)

In [8]:
lines

['reviewId\tcontent\tscore',
 '4123d655-f7b7-4328-abd2-7d7dcfb1c21f\t직관성 떨어지고 오류나고 답답\t1',
 '39afd67f-746b-462b-a838-f8dd5efca23e\t이 부분 개선되면 참 좋겠습니다\t1',
 '01016f38-580c-4c64-a954-5abf1f796094\t불법이 아니라면 개인투자자의 투자자율성 편의를 강화하는게 앱의 활용성을 높이고 경쟁력을 높이는데 크게 기여할거 같습니다~~\t1',
 '2630586a-03c8-41bb-ac4e-04a961b0e01b\t칸이모자르면 차라리 등락금액을 빼고 거래량을 넣어주세요\t0',
 'c124f0ff-c50a-47dc-a971-d5cf2cb09696\t좋아요..👌.\t1',
 'fe21b9a7-1b83-4cdb-92fe-7dd9408bb1a2\t메모기능 추가하면 좋을것같아요\t1',
 '368d8582-f98d-47b0-870a-63ff475dc534\t이부분에 대해 고민해보시면 좋을거 같습니다\t1',
 'a70c790a-417d-4616-bf73-00f16ca97e8a\t글자가 안보임\t0',
 '05fe293e-9f2c-45b6-87ec-692def1196c8\t단지 모르는번호로 주식단타열락문자옴\t1',
 '0fe4c2b1-d5ce-4aab-adf7-b678564d5aec\t개심해요 주식현재가 차트 보다가 위에 검색창에서 검색할라하면은 갑자기 미리보기 안뜨고 뒤로 나가기 누르니 화면이 회색깔로 변하고 뭐하는 짓임?\t0',
 '834397cf-c407-47cf-8ec6-28e29f7c5fed\tQv계좌연동은 안됩니까\t1',
 'ed4ecb2f-1915-49f4-8942-ba68f6d92833\t특별한 업데이트도 없는거 같고..\t1',
 'f6985ffb-e5fd-4518-9b5d-2eaa88bae5a2\t다른 증권사앱들은 종목 뉴스탭에 제대로 필터링돼서 뉴스들을 보여주는데 나무는 매번 이상한 광고 찌라시 뉴스만 잔뜩있어서 

In [26]:
with open('/home/hjgp/research/KoELECTRA/finetune/config/nsmc/review_test.json') as f:
    args = AttrDict(json.load(f))

processor = processors['nsmc'](args)

labels = processor.get_labels()
config = ElectraConfig.from_pretrained(
    '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/',
    num_labels=2,
    id2label={str(i): label for i, label in enumerate(labels)},
    label2id={label: i for i, label in enumerate(labels)},
)


tokenizer = ElectraTokenizer.from_pretrained(
    "monologg/koelectra-small-v3-discriminator",
)
model = ElectraForSequenceClassification.from_pretrained(
    '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/',
    config=config
)

test_dataset = seq_cls_load_and_cache_examples(args, tokenizer, mode="test", df=lines) ##train_df : 테스트하고싶은 df

In [27]:
def test(args, model, test_dataset):
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=100)

    preds = None

    for batch in progress_bar(test_dataloader):

        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            if args.model_type not in ["distilkobert", "xlm-roberta"]:
                inputs["token_type_ids"] = batch[2]  # Distilkobert, XLM-Roberta don't use segment_ids
            outputs = model(**inputs)
            logits = outputs[0]

        if preds is None:
            preds = nn.functional.softmax(logits, dim=-1).detach().cpu().numpy()
        else:
            preds = np.append(preds, nn.functional.softmax(logits, dim=-1).detach().cpu().numpy(), axis=0)
        

#     result = compute_metrics(args.task, out_label_ids, preds)
#     results.update(result)

    return preds

In [28]:
args.device = "cuda"
result = test(args, model.cuda(), test_dataset)

In [132]:
result

array([[0.99695826, 0.00304178],
       [0.2612903 , 0.7387097 ],
       [0.01088577, 0.9891143 ],
       ...,
       [0.00917432, 0.9908257 ],
       [0.00897739, 0.9910226 ],
       [0.50744224, 0.49255773]], dtype=float32)

In [373]:
#############################################################################
import re

import emoji
from soynlp.normalizer import repeat_normalize

from transformers import ElectraForSequenceClassification, ElectraTokenizer, ElectraConfig


def infer(x, path) :
    model = ElectraForSequenceClassification.from_pretrained(path)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v3-discriminator")
    
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())  
    pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )
    processed = pattern.sub(' ', x)
    processed = url_pattern.sub(' ', processed)
    processed = processed.strip()
    processed = repeat_normalize(processed, num_repeats=2)

    tokenized = tokenizer(processed, return_tensors='pt')

    output = model(tokenized.input_ids, tokenized.attention_mask)
    return nn.functional.softmax(output.logits, dim=-1)

In [12]:
text = '이 부분 개선되면 참 좋겠습니다'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))

text = '이 부분 개선되면 참 좋겠습니다'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))

text = '이 부분 개선되면 참 좋겠습니다'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))

tensor([[0.2632, 0.7368]], grad_fn=<SoftmaxBackward0>)
tensor([[0.8849, 0.1151]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0091, 0.9909]], grad_fn=<SoftmaxBackward0>)


In [13]:
text = '살짝 아쉽지만 전체적으로 만족합니다.'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))

text = '살짝 아쉽지만 전체적으로 만족합니다.'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))

text = '살짝 아쉽지만 전체적으로 만족합니다.'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))

tensor([[0.0111, 0.9889]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0076, 0.9924]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0079, 0.9921]], grad_fn=<SoftmaxBackward0>)


In [14]:
text = 'UI 빨리 고쳐주세요!!!'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))

text = 'UI 빨리 고쳐주세요!!!'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))

text = 'UI 빨리 고쳐주세요!!!'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))

tensor([[0.9821, 0.0179]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9903, 0.0097]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9945, 0.0055]], grad_fn=<SoftmaxBackward0>)


In [20]:
text = '지인 때문에 울며 겨자먹기로 쓰는 앱'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))

print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))

print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))

tensor([[0.9970, 0.0030]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9975, 0.0025]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9980, 0.0020]], grad_fn=<SoftmaxBackward0>)


In [None]:
koelectra-small-v3-nsmc-ckpt_review_350000

In [147]:
result

array([[0.9964393 , 0.00356075],
       [0.88477474, 0.11522522],
       [0.99294066, 0.00705928],
       ...,
       [0.00866902, 0.991331  ],
       [0.01060901, 0.989391  ],
       [0.5378835 , 0.46211648]], dtype=float32)

In [148]:
result_list = []

for i in result:
    result_list.append(i[1])

In [149]:
result_list

[0.0035607507,
 0.11522522,
 0.007059285,
 0.2084209,
 0.988644,
 0.9868896,
 0.014393635,
 0.018283244,
 0.99439806,
 0.0077934754,
 0.94902396,
 0.007357829,
 0.0036422506,
 0.9949911,
 0.0061874897,
 0.99412423,
 0.008189505,
 0.009485318,
 0.0039096726,
 0.0793678,
 0.0024872702,
 0.0051014493,
 0.9617021,
 0.9853956,
 0.005441231,
 0.98629016,
 0.009001749,
 0.011457747,
 0.18617308,
 0.06456646,
 0.0032706407,
 0.057587452,
 0.005806593,
 0.0027137552,
 0.002476704,
 0.007884473,
 0.0723302,
 0.9694723,
 0.021415455,
 0.4180745,
 0.01394982,
 0.0027284436,
 0.06967806,
 0.028785693,
 0.0028783728,
 0.923505,
 0.0026276933,
 0.008349974,
 0.007911999,
 0.0036213826,
 0.011386932,
 0.061798505,
 0.011378487,
 0.98380727,
 0.12881912,
 0.68289566,
 0.7866055,
 0.002561727,
 0.003646884,
 0.0031897272,
 0.0030997721,
 0.91756415,
 0.99541795,
 0.98572034,
 0.9946607,
 0.9898009,
 0.9936852,
 0.89602965,
 0.99517894,
 0.003226791,
 0.032212634,
 0.00514377,
 0.9906108,
 0.99513054,
 0

In [150]:
tmp_df = pd.DataFrame(lines)

In [151]:
tmp_df = pd.DataFrame(list(tmp_df.applymap(lambda a: a.split('\t'))[0]))

In [152]:
tmp_df = tmp_df.rename(columns=tmp_df.iloc[0]).iloc[1:].reset_index(drop=True)

In [153]:
tmp_df['NSMC'] = result_list

In [154]:
tmp_df['score'] = tmp_df['score'].astype(int)

In [155]:
tmp_df['MAE'] = tmp_df['score'] - tmp_df['NSMC']

In [142]:
tmp_df

Unnamed: 0,reviewId,content,score,NSMC,MAE
0,4123d655-f7b7-4328-abd2-7d7dcfb1c21f,직관성 떨어지고 오류나고 답답,1,0.003042,0.996958
1,39afd67f-746b-462b-a838-f8dd5efca23e,이 부분 개선되면 참 좋겠습니다,1,0.738710,0.261290
2,01016f38-580c-4c64-a954-5abf1f796094,불법이 아니라면 개인투자자의 투자자율성 편의를 강화하는게 앱의 활용성을 높이고 경쟁...,1,0.989114,0.010886
3,2630586a-03c8-41bb-ac4e-04a961b0e01b,칸이모자르면 차라리 등락금액을 빼고 거래량을 넣어주세요,0,0.952143,-0.952143
4,c124f0ff-c50a-47dc-a971-d5cf2cb09696,좋아요..👌.,1,0.965908,0.034092
...,...,...,...,...,...
4427,4c5a5486-9f87-4571-a97e-ba5353cddccc,아쉬운건 수수료가 비싸다는 점..,1,0.979160,0.020840
4428,9100e8a6-7d71-4ea7-a972-145d5c1d1821,상당한 발전이군요,1,0.989635,0.010365
4429,e620e368-649a-4d0d-a772-c4cab76470e1,시세알림 기능도 유용합니다.,1,0.990826,0.009174
4430,471a8ecd-a646-4435-8246-3c924edd647b,로그인없이 시세 조회도 가능하고 일단 속도가 다른 어플에 비해서 빠르네요^^,1,0.991023,0.008977


In [156]:
tmp_df

Unnamed: 0,reviewId,content,score,NSMC,MAE
0,4123d655-f7b7-4328-abd2-7d7dcfb1c21f,직관성 떨어지고 오류나고 답답,1,0.003561,0.996439
1,39afd67f-746b-462b-a838-f8dd5efca23e,이 부분 개선되면 참 좋겠습니다,1,0.115225,0.884775
2,01016f38-580c-4c64-a954-5abf1f796094,불법이 아니라면 개인투자자의 투자자율성 편의를 강화하는게 앱의 활용성을 높이고 경쟁...,1,0.007059,0.992941
3,2630586a-03c8-41bb-ac4e-04a961b0e01b,칸이모자르면 차라리 등락금액을 빼고 거래량을 넣어주세요,0,0.208421,-0.208421
4,c124f0ff-c50a-47dc-a971-d5cf2cb09696,좋아요..👌.,1,0.988644,0.011356
...,...,...,...,...,...
4427,4c5a5486-9f87-4571-a97e-ba5353cddccc,아쉬운건 수수료가 비싸다는 점..,1,0.947566,0.052434
4428,9100e8a6-7d71-4ea7-a972-145d5c1d1821,상당한 발전이군요,1,0.990528,0.009472
4429,e620e368-649a-4d0d-a772-c4cab76470e1,시세알림 기능도 유용합니다.,1,0.991331,0.008669
4430,471a8ecd-a646-4435-8246-3c924edd647b,로그인없이 시세 조회도 가능하고 일단 속도가 다른 어플에 비해서 빠르네요^^,1,0.989391,0.010609


In [164]:
text = 'UI는 편리합니다. 하지만 색깔이 바뀐게 아쉽네요.. 이 부분 개선되면 참 좋겠습니다'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))

tensor([[0.1849, 0.8151]], grad_fn=<SoftmaxBackward0>)


In [165]:
text = 'UI는 편리합니다. 하지만 색깔이 바뀐게 아쉽네요.. 이 부분 개선되면 참 좋겠습니다'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))

tensor([[0.0492, 0.9508]], grad_fn=<SoftmaxBackward0>)


In [163]:
tmp_df.to_csv('/home/hjgp/research/NH_intern/result/finetuned.csv', index=False, encoding='utf-8-sig')

In [143]:
#리뷰 finetune
np.mean(abs(tmp_df['MAE']))

0.16991970869147266

In [116]:
#1 / 5
np.mean(abs(tmp_df['MAE']))

0.15800968866517082

In [100]:
#12 45 finetune
np.mean(abs(tmp_df['MAE']))

0.17955200966715096

In [86]:
train_df.to_csv('finetuning_fault.csv', index=False, encoding='utf-8-sig')

In [None]:
tmp[0]

In [215]:
result

array([[9.9700373e-01, 2.9962957e-03],
       [9.9898189e-01, 1.0181338e-03],
       [9.5161390e-01, 4.8386134e-02],
       ...,
       [9.9904662e-01, 9.5336186e-04],
       [8.5410187e-03, 9.9145901e-01],
       [9.9907494e-01, 9.2508737e-04]], dtype=float32)

In [168]:
train_df['score'][:9]

0    1
1    1
2    1
3    1
4    1
5    1
6    1
7    1
8    1
Name: score, dtype: int64

In [36]:
 1

'이거 어플 왜이렇게 멈춤현상 렉걸림이ㅡ심한가요'

In [22]:
train_df

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,date
0,a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6,김선구,https://play-lh.googleusercontent.com/a/AItbvm...,"고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 ...",3,0,8.90,2022-08-08 07:21:10,,,202208
1,0df5ae4e-ff08-434a-b7a2-62da3181b586,정수정,https://play-lh.googleusercontent.com/a/AItbvm...,카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ...,1,0,8.88,2022-08-07 23:38:18,,,202208
2,c3169719-6a19-4995-92f2-2eca597ef652,zenter,https://play-lh.googleusercontent.com/a-/AFdZu...,"PC용까지 쓰려면 믿고 걸러야 합니다. 600만원짜리 PC, 3개의 HTS 사용자지...",1,0,8.90,2022-08-05 23:52:17,,,202208
3,cda3efef-4429-4007-b1cb-14bd6fd42b4b,홍길동,https://play-lh.googleusercontent.com/a/AItbvm...,미장 주식 업데이트 할때마다 회사 주주정보에 총주식수와 유동주식수 나오던게 업데이트...,2,0,8.90,2022-08-05 23:22:37,,,202208
4,996572c2-f4cb-4efe-a400-40b0f0123413,이용현,https://play-lh.googleusercontent.com/a/AItbvm...,해외주식도 차트에 매매내역을 남길 수 있도록 해주세요! 꼭 부탁드려요!,5,0,8.90,2022-08-05 15:22:57,,,202208
...,...,...,...,...,...,...,...,...,...,...,...
5009,4c5a5486-9f87-4571-a97e-ba5353cddccc,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,초기 편집이랑 설정잘해놓으면 hts못지않네요 아쉬운건 수수료가 비싸다는 점..,5,0,1.15,2012-03-27 02:23:04,,,201203
5010,9100e8a6-7d71-4ea7-a972-145d5c1d1821,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,피씨버전을하는듯하네요 상당한 발전이군요,5,1,,2012-03-16 18:39:06,,,201203
5011,e620e368-649a-4d0d-a772-c4cab76470e1,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,실행 아주 빠릅니다. 계정이 없어도 시세조회 가능합니다. 한번 로그인하면 별도의 로...,5,6,1.06,2011-11-10 20:40:07,,,201111
5012,471a8ecd-a646-4435-8246-3c924edd647b,Google 사용자,https://play-lh.googleusercontent.com/EGemoI2N...,로그인없이 시세 조회도 가능하고 일단 속도가 다른 어플에 비해서 빠르네요^^,5,2,1.05,2011-11-01 15:28:32,,,201111


In [46]:
ll = train_df.iloc[0][['reviewId', 'content', 'score']].to_string(header=False,index=False).strip()
ll

'a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6\n고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 수...\n                                                 3'

In [42]:
ll.split("\t")

['reviewId                 a74ab9e7-5b6b-4665-b8e5-4c0a82459cf6\ncontent     고객정보확인인가 그거 신분증 실패했다고 하거나, 본인확인증 찍고 다음화면이 잘려서 ...\nscore                                                       3\nName: 0, dtype: object']

In [32]:
train_df.iloc[1]

reviewId                             0df5ae4e-ff08-434a-b7a2-62da3181b586
userName                                                              정수정
userImage               https://play-lh.googleusercontent.com/a/AItbvm...
content                 카카오페이 인증으로 접속이 안되서 ID로 접속하려구 했는데 등록된 단말이 아니라고 ...
score                                                                   1
thumbsUpCount                                                           0
reviewCreatedVersion                                                 8.88
at                                                    2022-08-07 23:38:18
replyContent                                                          NaN
repliedAt                                                             NaN
date                                                               202208
Name: 1, dtype: object

In [33]:
for i in range(10):
    print(infer(train_df['content'][:10][i],'/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-46000/'))

tensor([[0.1503, 0.8497]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9921, 0.0079]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9965e-01, 3.5184e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9986, 0.0014]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0178, 0.9822]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9952e-01, 4.8041e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[9.9961e-01, 3.9457e-04]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0081, 0.9919]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9989, 0.0011]], grad_fn=<SoftmaxBackward0>)


In [252]:
text = '구버전처럼 사고팔기 편하고 한눈에쏙들어오고 빠르고 단순한 인터페이스를원하는겁니다.'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_350000/checkpoint-55000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_200000/checkpoint-64000/'))

tensor([[0.0039, 0.9961]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9846, 0.0154]], grad_fn=<SoftmaxBackward0>)
tensor([[0.6203, 0.3797]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0102, 0.9898]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9791, 0.0209]], grad_fn=<SoftmaxBackward0>)


In [251]:
text = '기능도 많아지고 깔끔한데 위젯 디자인도 좀더 깔끔하게 바꿔주면 좋겠네요.'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_350000/checkpoint-55000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_200000/checkpoint-64000/'))

tensor([[0.2265, 0.7735]], grad_fn=<SoftmaxBackward0>)
tensor([[0.3742, 0.6258]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0090, 0.9910]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0157, 0.9843]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0117, 0.9883]], grad_fn=<SoftmaxBackward0>)


In [249]:
text =  '예전보다 훨 좋네요 좀 아쉬운건 사용시 조명을 안꺼지게 하면 좋겠고, 계좌비밀번호는 항상 저장되게 변했음 좋겠어요'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_350000/checkpoint-55000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_200000/checkpoint-64000/'))

tensor([[0.4946, 0.5054]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9800, 0.0200]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0054, 0.9946]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0267, 0.9733]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0057, 0.9943]], grad_fn=<SoftmaxBackward0>)


In [250]:
text =  '업데이트 되기 전에는 정말 쓰기도 힘들고 어렵고 별로였는데, 업데이트 되고나니 좋네요'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_350000/checkpoint-55000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_200000/checkpoint-64000/'))

tensor([[0.0301, 0.9699]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9906, 0.0094]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9932, 0.0068]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9470, 0.0530]], grad_fn=<SoftmaxBackward0>)
tensor([[0.9921, 0.0079]], grad_fn=<SoftmaxBackward0>)


In [372]:
text =  '업데이트 되기 전에는 별로였는데, 업데이트 되고나니 좋네요'
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt/checkpoint-26000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_new/checkpoint-39000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_full/checkpoint-70000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_350000/checkpoint-55000/'))
print(infer(text, '/home/hjgp/research/KoELECTRA/finetune/ckpt/koelectra-small-v3-nsmc-ckpt_review_200000/checkpoint-64000/'))

tensor([[0.0128, 0.9872]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0361, 0.9639]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0228, 0.9772]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0076, 0.9924]], grad_fn=<SoftmaxBackward0>)
tensor([[0.0284, 0.9716]], grad_fn=<SoftmaxBackward0>)


In [338]:
all_review = pd.read_csv('/home/hjgp/research/NH_intern/all_review_data.csv')

In [339]:
all_review = all_review[all_review['app'] != 'plus']

In [340]:
from kiwipiepy import Kiwi
kiwi = Kiwi()

In [341]:
all_review[['content', 'at']]

Unnamed: 0,content,at
27971,이런게 증권사라고... 탈퇴할거임,2022-08-24 14:49:25
27972,오늘따라 접속이 자꾸 안돼는데 이런일이 자주 있는데 개선이 안돼나요? 응답시간 초과...,2022-08-24 10:44:15
27973,폴드3사용자 인데 메인화면비율이 안맞아 화면이 잘려요 ㅠ,2022-08-24 00:10:06
27974,아침에 매일 스크립트에러뜨면서 튕겨요 진짜 서버 왜이래요....,2022-08-19 09:09:12
27975,2년째 사용하고 있는데 인간적으로 너무 느리지 않나요? 느려서 다른증권사로 갈아타고...,2022-08-18 22:57:41
...,...,...
114906,추천합니다~!,2016-11-01 08:49:02
114907,아직 좀 더써봐야겠지만 디스플레이가 보기쉽게 되어있어 잘쓸것같아요~,2016-11-01 08:47:03
114908,새로운 어플이 투자에 많은 도움이 될것 같습니다~ 좋네요~,2016-11-01 08:45:36
114909,증권 어플이 새롭게 느껴지네요^^,2016-11-01 08:42:56


In [342]:
def extract_sentence(text):
    output_list = []
    tmp = kiwi.split_into_sents(text)
    
    for i in tmp:
        output_list.append(i.text)
    return output_list

full_reviews = list(all_review['content'])
date_reviews = list(all_review['at'])
app_reviews = list(all_review['app'])

split_reviews = []
split_date = []
split_app = []

for i, j, h in zip(full_reviews, date_reviews, app_reviews):
    tmp = extract_sentence(i)
    if type(i) == 'str':
        split_reviews.append(tmp)
        split_date.append(j)
        split_app.append(h)
    else:
        split_reviews.extend(tmp)
        for i in range(len(tmp)):
            split_date.append(j)
            split_app.append(h)
        
dddf = pd.DataFrame()
dddf['content'] = split_reviews
dddf['reviewId'] = split_date
dddf['score'] = 0

dddf.reset_index(drop=True, inplace = True)

In [343]:
dddf

Unnamed: 0,content,reviewId,score
0,이런게 증권사라고...,2022-08-24 14:49:25,0
1,탈퇴할거임,2022-08-24 14:49:25,0
2,오늘따라 접속이 자꾸 안돼는데 이런일이 자주 있는데 개선이 안돼나요?,2022-08-24 10:44:15,0
3,응답시간 초과는 이제 지겹네요,2022-08-24 10:44:15,0
4,오전 다보네고 뭐하자는 겁니까?,2022-08-24 10:44:15,0
...,...,...,...
172696,새로운 어플이 투자에 많은 도움이 될것 같습니다~,2016-11-01 08:45:36,0
172697,좋네요~,2016-11-01 08:45:36,0
172698,증권 어플이 새롭게 느껴지네요^^,2016-11-01 08:42:56,0
172699,새로운...,2016-11-01 08:22:26,0


In [365]:
화면=['화면', '디자인', '직관', '인터페이스', 'UI', 'UX', 'ui', 'ux', 'Ui', '한눈', '한 눈']

In [366]:
def or_expression(input_list):
    base = r'(?:{}'
    output_list = base.format('|'.join(w for w in input_list)) + ')'
    return output_list

In [367]:
or_expression(화면)

'(?:화면|디자인|직관|인터페이스|UI|UX|ui|ux|Ui|한눈|한 눈)'

In [345]:
test_dataset = seq_cls_load_and_cache_examples(args, tokenizer, mode="test", df=dddf) ##train_df : 테스트하고싶은 df

In [346]:
args.device = "cuda"
result = test(args, model.cuda(), test_dataset)

In [347]:
result_list = []

for i in result:
    result_list.append(i[1])

In [348]:
len(result_list)

172701

In [349]:
dddf['score'] = result_list
dddf['app'] = split_app

In [350]:
dddf['reviewId'] = pd.to_datetime(dddf['reviewId'], errors='coerce')
dddf['date'] = dddf['reviewId'].dt.strftime('%Y')

In [351]:
result = dddf[dddf['date'] >= '2021'].reset_index(drop=True)

In [352]:
result

Unnamed: 0,content,reviewId,score,app,date
0,이런게 증권사라고...,2022-08-24 14:49:25,0.005113,hantu,2022
1,탈퇴할거임,2022-08-24 14:49:25,0.004402,hantu,2022
2,오늘따라 접속이 자꾸 안돼는데 이런일이 자주 있는데 개선이 안돼나요?,2022-08-24 10:44:15,0.290482,hantu,2022
3,응답시간 초과는 이제 지겹네요,2022-08-24 10:44:15,0.005938,hantu,2022
4,오전 다보네고 뭐하자는 겁니까?,2022-08-24 10:44:15,0.003082,hantu,2022
...,...,...,...,...,...
102434,속도가 진짜 넘 느려요.,2021-01-04 10:07:57,0.010400,sinhan,2021
102435,엄청버벅거림 반응도 엄청느림,2021-01-04 09:24:10,0.005693,sinhan,2021
102436,눈알이 넘아파요 죄다너무흰색...,2021-01-04 00:23:32,0.008766,sinhan,2021
102437,색선택을할수있게 해주시던지 다른 눈편한색으로좀해주세요,2021-01-04 00:23:32,0.006628,sinhan,2021


In [353]:
set(result['app'])

{'daesin', 'hantu', 'kb', 'kium', 'mirae', 'namu', 'qv', 'samsung', 'sinhan'}

In [363]:
tmp_result = result.copy()

In [368]:
result = result[result['content'].str.contains(or_expression(화면))]

In [376]:
result.to_csv("./result_ui.csv", index=False, encoding='utf-8-sig')

In [375]:
!pwd

/home/hjgp/research/KoELECTRA


In [377]:
result = result[result['app'] != 'sinhan']

In [378]:
daesin = result[result['app'] == 'daesin']
hantu = result[result['app'] == 'hantu']
kb = result[result['app'] == 'kb']
kium = result[result['app'] == 'kium']
mirae = result[result['app'] == 'mirae']
namu = result[result['app'] == 'namu']
qv = result[result['app'] == 'qv']
samsung = result[result['app'] == 'samsung']

In [379]:
print(np.mean(daesin['score']) * 5)
print(np.mean(hantu['score']) * 5)
print(np.mean(kb['score']) * 5)
print(np.mean(kium['score']) * 5)
print(np.mean(mirae['score']) * 5)
print(np.mean(namu['score']) * 5)
print(np.mean(qv['score']) * 5)
print(np.mean(samsung['score']) * 5)

1.1449007881085642
0.8558094063203239
1.4449883243458133
0.9563240473737624
1.945563418269981
2.0705485455286894
2.319099881957906
2.3550537679797254


In [380]:
np.mean(result['score'] * 5)

1.8173133000112562

In [161]:
from google_play_scraper import Sort, reviews_all, reviews
def scraper_all(url, score):
    result = reviews_all(
        url,
        sleep_milliseconds=5, # defaults to 0
        lang='ko', # defaults to 'en'
        country='kr', # defaults to 'us'
        sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
        filter_score_with=score # defaults to None(means all score)
    )
    result_df = pd.DataFrame(result)
    return result_df   

In [233]:
url1 = 'com.dunamu.stockplus' ##증권플러스
url2 = 'com.truefriend.neosmarta' ##한투
url3 = 'com.wooriwm.txsmart' ##나무
url4 = 'com.wooriwm.mugsmart' ##qv
url5 = 'com.kbsec.mts.iplustarngm2' ##kb
url6 = 'com.miraeasset.trade' ##미래
url7 = 'com.samsungpop.android.mpop' ##삼성
url8 = 'com.daishin' #대신
url9 = 'com.linkzen.app' #키움

In [234]:
review_df = scraper_all(url9, None).dropna(subset=['content'])

In [235]:
review_df['date'] = review_df['at'].dt.strftime('%Y')
review_df = review_df[review_df['date'].astype(int) >= 2021][['reviewId', 'content', 'score']]

In [236]:
from tqdm import tqdm
def extract_sentence(text):
    output_list = []
    tmp = kiwi.split_into_sents(text)
    
    for i in tmp:
        output_list.append(i.text)
    return output_list

full_reviews = list(review_df['content'])

split_reviews = []
for i in tqdm(full_reviews):
    if type(i) == 'str':
        split_reviews.append(extract_sentence(i))
    else:
        split_reviews.extend(extract_sentence(i))
        
dddf = pd.DataFrame(split_reviews)

df = pd.DataFrame()
df['reviewId'] = df.index
df['content'] = dddf[dddf[0].str.contains(or_expression(화면))][0]
df['score'] = 0
df.reset_index(drop=True, inplace=True)

100%|█████████████████████████████████████| 6426/6426 [00:04<00:00, 1417.72it/s]


In [237]:
test_dataset = seq_cls_load_and_cache_examples(args, tokenizer, mode="test", df=df) ##train_df : 테스트하고싶은 df

In [238]:
args.device = "cuda"
result = test(args, model.cuda(), test_dataset)

In [239]:
result_list = []

for i in result:
    result_list.append(i[1])

In [240]:
np.mean(result_list)

0.19007131

In [253]:
df['score'] = result_list

ValueError: Length of values (673) does not match length of index (674)

In [260]:
df

Unnamed: 0,reviewId,content,score
0,,"날짜별 체결내역, 매수단가,매도간가, 수수료 한눈에 알아볼 수 있으면 좋겠어요...",0
1,,UI가 너무 구리네요...,0
2,,갤럭시탭a8로 로그인하고 비밀번호 입력할때마다 화면이 강제로 회전됩니다.,0
3,,배당금 관련해서 투자 중인데 배당 관련 UI 개선 및 확장 부탁드립니다.,0
4,,화면회전하면 주문시 차트와 주문창 이분할로 되는데 언제부터인가 차트가 아예 안보일정...,0
...,...,...,...
669,,디자인만 보면 거의 90년대인 것 같습니다.,0
670,,"바라는 점이 있다면, 핸드폰을 보통 세로로 쓰다보니 제 종목들을 볼 때(계좌-잔고)...",0
671,,기본적으로 종목명ㅡ매입가ㅡ평가손익ㅡ수익률ㅡ가능수량ㅡ보유수량ㅡ현재가ㅡ매입금액ㅡ평가금액...,0
672,,설치하고 회원가입하고 계좌인증 다하고 나갔다 난주 다시 들어가려는데 초기화면에서 자...,0
