In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm.notebook import tqdm
from ast import literal_eval
import os
import sys
import pickle
import json

tqdm.pandas()

# Load data

In [2]:
# Load saved dataframe
review_df = pd.read_csv("main_dictionary.csv")

# Load saved aspect_list, main_aspect_list, core_term_list
with open('aspect_list_MLM.json', 'r', encoding='UTF-8-sig') as f:
    aspect_list = json.load(f)
    
with open('main_aspect_list_MLM.json', 'r', encoding='UTF-8-sig') as f:
    main_aspect_list = json.load(f)

with open('core_term_list_MLM.json', 'r', encoding='UTF-8-sig') as f:
    core_term_list = json.load(f)

# Load Sentiment Lexicon
with open('score_dict_nor.pkl', 'rb') as f:
    score_dict = pickle.load(f)

# Get row with aspects

In [3]:
remove_index = review_df[(review_df["candiate_ids"] == "error") | ( review_df["candiate_ids"] == "{}")].index
review_df = review_df.drop(remove_index, axis=0).reset_index(drop=True)
review_df["candiate_ids"] = review_df["candiate_ids"].progress_apply(lambda x : literal_eval(x))

HBox(children=(FloatProgress(value=0.0, max=268922.0), HTML(value='')))




In [4]:
def contain_aspect(candidate_dict):
    word_list = list(candidate_dict.keys())
    for word in word_list:
        if word in aspect_list:
            return True
    return False

In [5]:
# Leave data that have aspect in their sentence
review_df["has_aspect"] = review_df["candiate_ids"].progress_apply(lambda x : contain_aspect(x))
review_with_aspect = review_df[review_df["has_aspect"]].copy().reset_index(drop=True).drop(["has_aspect"], axis=1)
# Add columns of aspect list
review_with_aspect[aspect_list] = 0

HBox(children=(FloatProgress(value=0.0, max=268922.0), HTML(value='')))




In [6]:
for i in range(1,6):
    print("proportion of rating {} : {:.2f}%".format(i, (review_with_aspect["rating"]==i).sum()/len(review_with_aspect)*100))

proportion of rating 1 : 0.43%
proportion of rating 2 : 0.57%
proportion of rating 3 : 3.31%
proportion of rating 4 : 11.19%
proportion of rating 5 : 84.50%


# Calculate Sent Score

In [7]:
from koalanlp.Util import initialize, finalize

# 꼬꼬마와 ETRI 분석기의 2.0.4 버전을 참조합니다.
initialize(java_options="-Xmx4g", EUNJEON="latest", KMR="latest", KKMA="latest", HNN="latest")

[koalanlp.jip] [INFO] Latest version of kr.bydelta:koalanlp-eunjeon (2.1.6) will be used.
[koalanlp.jip] [INFO] Latest version of kr.bydelta:koalanlp-kmr (2.1.4) will be used.
[koalanlp.jip] [INFO] Latest version of kr.bydelta:koalanlp-kkma (2.1.4) will be used.
[koalanlp.jip] [INFO] Latest version of kr.bydelta:koalanlp-hnn (2.1.4) will be used.
[root] Java gateway started with port number 42415
[root] Callback server will use port number 25334
[koalanlp.jip] JVM initialization procedure is completed.


In [8]:
from koalanlp import API
from koalanlp.proc import Parser
from koalanlp.proc import Tagger
from koalanlp.types import PhraseTag

In [9]:
mecab_tagger = Tagger(API.EUNJEON)
kkma_parser = Parser(API.KKMA)

In [10]:
def cal_aspect_score(df):
    sentence = df["review_text"]
    cands = df["candiate_ids"]
    tagged_review = mecab_tagger(sentence)
    parsed_review = kkma_parser(tagged_review)
    for word in cands.keys():
        word_score = 0
        if word in aspect_list:
            id_list = cands[word]
            for id in id_list:
                cur_aspect = parsed_review[0][id]
                dependents = cur_aspect.getDependentEdges()
                governor = cur_aspect.getGovernorEdge()
                if dependents != []:
                    for dependent in dependents:
                        sent_words = dependent.dependent.morphemes
                        for sent_word in sent_words:
                            cur_word = sent_word.surface + "/" + sent_word.tag
                            if cur_word in score_dict.keys():
                                word_score += score_dict[cur_word]
                if governor.governor != None:
                    sent_words = governor.governor.morphemes
                    for sent_word in sent_words:
                        cur_word = sent_word.surface + "/" + sent_word.tag
                        if cur_word in score_dict.keys():
                            word_score += score_dict[cur_word]
            review_with_aspect.loc[df.name, word] = word_score

In [11]:
review_with_aspect.progress_apply(lambda x: cal_aspect_score(x), axis=1)

HBox(children=(FloatProgress(value=0.0, max=177063.0), HTML(value='')))




0         None
1         None
2         None
3         None
4         None
          ... 
177058    None
177059    None
177060    None
177061    None
177062    None
Length: 177063, dtype: object

In [None]:
finalize()

In [17]:
review_with_aspect.to_csv("review_with_aspect_nor.csv", index=False)

In [18]:
review_with_aspect

Unnamed: 0,name,rating,review_text,candiate_ids,산미,음식,스파클,술맛,사과,알코올,...,금액,감칠맛,알스,탄산수,값어치,쓴맛,크리스마스,크리미,고급술,향도
0,복순도가,5.0,한 달 사용기 부모님과 한잔하면서 좋은 시간 보냈어요 부모님 댁으로 바로 배송했더니...,"{'사용': [2], '기': [3], '부모': [4, 9], '시간': [7],...",0.000000,1.865832,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,복순도가,5.0,막걸리 좋아하시는 어른께 명절 선물로 보내드렸습니다 명절에다 모여서 드시라고 3병 ...,"{'막걸리': [0], '어른': [3], '명절': [4, 8], '선물': [5...",0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,복순도가,5.0,할머니 댁에 하나 집에 하나 시켰어요 덕분에 이쁜 손녀 이쁜 딸 됐네요 ㅎㅎ 개인적...,"{'할머니': [0], '댁': [1], '집': [3], '덕분': [6], '손...",0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,복순도가,5.0,설날과 집안 행사를 핑계로 구매했습니다 병 딸 때 조심해야 하는 건 원래 알고 있어...,"{'설': [0], '설날': [0], '집': [1], '집안': [1], '행사...",0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,복순도가,5.0,제가 술은 따로 즐겨 먹진 않는데 이상하게 막걸리는 한두 잔 정도 마십니다 ㅎㅎ론 ...,"{'술': [1], '막걸리': [7, 17, 28, 38, 83, 95, 98, ...",0.848502,0.000000,1.730931,0.0,0.0,1.686501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177058,맑은내일,5.0,이것도 맛이 조군요,{'맛': [1]},0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177059,맑은내일,4.0,배송 빠르고 좋아요,{'배송': [0]},0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177060,맑은내일,5.0,배송도 빨랐고 맛도 있어요,"{'배송': [0], '맛': [2]}",0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177061,맑은내일,5.0,선물용으로 샀는데 만족합니다,{'선물': [0]},0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
