In [1]:
import sys
import logging
import os
import json
import pandas as pd

# strftime pattern for log timestamps: year-month-day hour:minute:second.
# The third field must be %d (day of month); %y is the two-digit year, which
# would print e.g. "2021-03-21" for any day in March 2021.
date_strftime_format = "%Y-%m-%d %H:%M:%S"
# Send INFO-and-above records to stdout so they appear in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format="%(asctime)s %(message)s", datefmt=date_strftime_format)

# LOAD DATA

In [2]:
# Input file locations, resolved relative to the current working directory.
# The trailing '' component keeps a path separator at the end of DATA_FOLDER.
DATA_FOLDER = os.path.join(os.getcwd(), 'review_scorer', 'data', '')
SENTI_PATH = os.path.join(DATA_FOLDER, 'SentiWord_info.json')
DATA_PATH = os.path.join(DATA_FOLDER, 'data_origin.csv')

# Fixed seed so the shuffle below produces the same row order on every run;
# rows are looked up by position further down in the notebook.
SHUFFLE_SEED = 42

# Sentiment dictionary: JSON file loaded into a DataFrame.
with open(SENTI_PATH, mode='rt', encoding='UTF8') as f:
    senti = pd.DataFrame.from_dict(json.load(f))

# Reviews: drop rows containing any missing value, shuffle all rows
# reproducibly, then renumber the index from 0.
data: pd.DataFrame = (
    pd.read_csv(DATA_PATH, encoding='UTF8')
    .dropna(axis=0)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

# TOKENIZER

In [3]:
from tqdm import tqdm
from twkorean import TwitterKoreanProcessor

# Create the Korean text processor and keep a short alias to its
# string-tokenizing method; the alias is reused when scoring single reviews.
processor = TwitterKoreanProcessor()
tokenize = processor.tokenize_to_strings

# Tokenize every review in order; tqdm wraps the column to report progress.
tokens = list(map(tokenize, tqdm(data.review)))

100%|████████████████████████████████████| 71904/71904 [08:24<00:00, 142.39it/s]


# SET REVIEW SCORER

In [4]:
from review_scorer import ReviewScorer

# The review scorer is trained at construction time, so the tokenized reviews
# must be passed in as an argument when the ReviewScorer instance is created,
# together with the path to the sentiment dictionary file.
rs = ReviewScorer(sentences=tokens, senti_dict_path=SENTI_PATH)

2022-08-22 23:03:26 collecting all words and their counts
2022-08-22 23:03:26 PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-08-22 23:03:35 PROGRESS: at sentence #10000, processed 1095162 words, keeping 28199 word types
2022-08-22 23:03:44 PROGRESS: at sentence #20000, processed 2184895 words, keeping 37831 word types
2022-08-22 23:03:53 PROGRESS: at sentence #30000, processed 3281505 words, keeping 43753 word types
2022-08-22 23:04:02 PROGRESS: at sentence #40000, processed 4371883 words, keeping 47443 word types
2022-08-22 23:04:11 PROGRESS: at sentence #50000, processed 5455131 words, keeping 49746 word types
2022-08-22 23:04:21 PROGRESS: at sentence #60000, processed 6540357 words, keeping 51127 word types
2022-08-22 23:04:30 PROGRESS: at sentence #70000, processed 7619619 words, keeping 51656 word types
2022-08-22 23:04:31 collected 51667 word types from a corpus of 7833944 raw words and 71904 sentences
2022-08-22 23:04:31 Creating a fresh vocabulary
2022-0

## TAGGING REVIEW SCORER'S SENTIMENT DICTIONARY

In [6]:
# Seed words for each review aspect. The scorer uses them to tag its
# sentiment dictionary by category.
# NOTE(review): the recorded run of this cell stops with
# "TypeError: only list-like objects are allowed to be passed to isin()"
# right after logging "tagging taste". Presumably this is raised inside
# ReviewScorer.tag — confirm against the library before relying on the
# scoring cell that follows.
category_seed_words = {
    'taste': ['맛', '맛있다', '맛없다'],
    'price': ['가격', '싸다', '비싸다', '저렴'],
    'service': ['서비스', '친절', '싸가지'],
    'atmosphere': ['인테리어', '분위기'],
}
rs.tag(categories=category_seed_words, topn=500)

2022-08-22 23:07:23 tagging taste


TypeError: only list-like objects are allowed to be passed to isin(), you passed a [NoneType]

# SCORING WITH REVIEW SCORER

In [None]:
# Print the score and the raw text for a small run of consecutive reviews,
# selected by position in the shuffled frame.
start = 50000
sample_count = 10
for i in range(start, start + sample_count):
    print('index: ', i)
    review_text = data.review.iloc[i]
    # Tokenize with the same tokenizer used for the training tokens.
    print(rs.score_review(tokenize(review_text)))
    print(review_text)
    print()