In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from modules import *

In [2]:
data = pd.read_csv("배상면주가.csv", encoding='utf-8-sig')
data.drop_duplicates(['text'],ignore_index=True, inplace=True) # 중복 제거
data.columns = ['date', 'star','doc']

In [3]:
data.doc = data.doc.str.replace("[^가-힣 ]"," ") # 한글만 남기고 특수문자, 숫자, 영어 삭제
data.doc = data.doc.str.replace("\s+", " ") # white space 삭제

data.doc

0                                  맛있어요 전에도 사서 마셨는데 맛있습니다
1                           항상 복분자주는 여기서 사먹네요 안 독하고 깔끔합니다
2       저번에 대형마트에서 발견해서 먹어봤다 맛있어서 온라인 구매했어요 주변에서 많이 팔면...
3       지인한테 선물받아 먹어보니 너무 맛있어요먹어본 복분자술 중에 단연 최고에요너무 과하...
4                                               잘받았어요잘쓸게유
                              ...                        
7748                                              만족스럽습니다
7749         빠른배송 맘에 들고요 가족들과 즐기기 좋습니다 시중에서 구하기 힘들었는데 좋네요
7750                                                맛있네요 
7751                                             기대이상이에요 
7752                 맛있는 술엄마 사드렸는데 맛있다하셨어요술 잘 못드시는분들 좋을듯요
Name: doc, Length: 7753, dtype: object

In [4]:
# 검토 후 추가 삭제 문구 적용
data.dropna(inplace=True)

In [5]:
data[data.doc.isnull()]

Unnamed: 0,date,star,doc


In [6]:
# data = data[data.doc.str.contains('맛|향|달|넘김|부드|단|도수|냄세')]
# data.reset_index(drop=True, inplace=True)

In [7]:
len(data)

7753

In [8]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from nltk import word_tokenize
from konlpy.tag import Okt

In [9]:
#원하는 태그 추출 & 불용어 제거
okt = Okt()

#불용어
stopwords = pd.read_csv('ko-stopwords.csv') #한국어불용어사전 불러오기
stopwords=list(stopwords['stopwords']) 
stopwords.extend(['선물','주문','친구','사서','전','빙','재다','자다','대형','마트','저번','빠르다','좋다','안전하다','자다 오다',
                  '만족하다','좋아하다','ㅎ','빨대','처음구매','배송','꼼꼼', '받다','넘다','마시기','아주','의사','자주','용',
                  '재다','깨지다 오다', '맛 요', '생각 맛', '인터넷 구매','맛 맛', '두번째 구입','두번째 구매','선물용','빙 탄복',
                  '마시다','가져가다','깨지다', '오다', '맛 맛','용 구매', '에서','고','이다','는','한','씨', "것","거","게","데",
                  "이다","건","고","되다","되어다","걸","기",'구매','없이','굿','술', '부담', '너무', '꼼꼼하다','말다','포장','늘','말다','탄복'
                  "시","네","듯","랍니","중이","얘","스","도도", "나","수","개","내","기","제","저","인","있다","이렇다",
                  "그렇다","번","위","팅","분","인","링","란","포","두", "진짜", "하다" ,"이다" ,"가다", "이제" ,"들다",
                 '에서','고','이다','쓰리다 도어즈','일요일','휴무','수표','층','틀다','차차','드므','맛집','술집','길',
                 '층','개다 성','사진','파다','동영상','아스 론','자다','는','희다 스토리','한','씨', "것","거","게","데",
                 "이다","건","고","되다","되어다","걸","기", "시","네","듯","랍니","중이","얘","스","도도", "나","수","개",
                 "내","기","제","저","인","있다","이렇다", "그렇다","번","위","팅","분","인","링","란","포","두", "진짜",
                 "하다" ,"이다" ,"가다", "이제" ,"들다", "먹다", "보다",'다','알','요','보고','탄복','역시','박스','그냥','크다','좌',
                 '맘애듭','잇다','반정','오늘','손색','없다','개인','능']) #불용어 추가

stopwords=set(stopwords) # 중복제거    

# 형태소 분석 함수 만들기
def okt_pos_tagging(string):
    pos_words = okt.pos(string, stem=True, norm=True) # 형태소 분석. 단어는 사전형으로 바꿔주기
    words = [word for word, tag in pos_words if tag 
             in ['Noun', 'Adjective', 'Verb','Adverb'] if word not in stopwords ]
    return words


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(tokenizer = lambda x: okt_pos_tagging(x))
bow_vect = vect.fit_transform(data['doc'].tolist())
word_list = vect.get_feature_names()
count_list = bow_vect.toarray().sum(axis=0)

In [11]:
bow_vect.shape

(7753, 3773)

In [12]:
word_count_dict = dict(zip(word_list, count_list))
word_count_dict


{'가가': 3,
 '가게': 12,
 '가격': 134,
 '가교': 1,
 '가기': 2,
 '가까이': 1,
 '가깝다': 4,
 '가끔': 40,
 '가나': 1,
 '가능하다': 11,
 '가득': 3,
 '가득하다': 4,
 '가라': 1,
 '가량': 1,
 '가루': 1,
 '가르다': 1,
 '가리다': 1,
 '가면': 3,
 '가물': 1,
 '가미': 10,
 '가바': 1,
 '가방': 2,
 '가버리다': 1,
 '가법': 1,
 '가변': 1,
 '가볍다': 478,
 '가보다': 4,
 '가성': 29,
 '가수': 2,
 '가시': 1,
 '가안': 1,
 '가약': 1,
 '가왜': 1,
 '가요': 7,
 '가운데': 1,
 '가을': 1,
 '가이트': 1,
 '가인': 1,
 '가장': 6,
 '가져다주다': 1,
 '가져오다': 1,
 '가족': 111,
 '가주': 1,
 '가즈': 1,
 '가지': 7,
 '가지다': 2,
 '가하다': 1,
 '각설이': 1,
 '간다': 1,
 '간단': 1,
 '간단하다': 60,
 '간만': 3,
 '간식': 1,
 '간편': 1,
 '간편하다': 9,
 '간혹': 1,
 '갇': 1,
 '갈다': 37,
 '갈리다': 3,
 '갈비': 1,
 '갈수': 2,
 '갈수록': 2,
 '갈아': 2,
 '갈아타다': 3,
 '갈증': 5,
 '감': 27,
 '감기': 1,
 '감다': 25,
 '감동': 8,
 '감목': 1,
 '감미롭다': 1,
 '감미료': 1,
 '감복': 1,
 '감사': 16,
 '감사하다': 193,
 '감사히': 2,
 '감솨': 1,
 '감싸다': 1,
 '감안': 1,
 '감탄': 1,
 '갑': 2,
 '갑자기': 1,
 '갑작': 1,
 '값해': 1,
 '갓': 4,
 '강': 26,
 '강도': 1,
 '강력': 2,
 '강력하다': 1,
 '강릉': 1,
 '강원도': 1,
 '강추': 31,
 '강하': 5,
 '강하다': 90,
 '

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)

In [14]:
print(tf_idf_vect.shape)


(7753, 3773)


In [15]:
print(tf_idf_vect[0])

  (0, 1124)	1.0


In [16]:
invert_index_vectorizer = {v: k for k, v in vect.vocabulary_.items()}
print(str(invert_index_vectorizer)[:100]+'...')

{1124: '맛있다', 3610: '항상', 1478: '복분자주', 1633: '사먹다', 2080: '안', 827: '독하다', 395: '깔끔하다', 1355: '발견',...


In [17]:
def rating_to_label(rating):
    if rating > 3:
        return 1
    else:
        return 0

data['y'] = data['star'].apply(lambda x : rating_to_label(x))
len(data[data['y']==0])

389

In [18]:
from sklearn.model_selection import train_test_split

x = tf_idf_vect
y = data['y']

# positive_random_idx = data[data['y']==1].sample(48, random_state=12).index.tolist()
# negative_random_idx = data[data['y']==0].sample(48, random_state=12).index.tolist()

# random_idx = positive_random_idx + negative_random_idx
# x = tf_idf_vect[random_idx]
# y = data['y'][random_idx]


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=1)


In [19]:
x.shape,y.shape

((7753, 3773), (7753,))

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# fit in training set
lr = LogisticRegression()
lr.fit(x_train, y_train)

# predict in test set
y_pred = lr.predict(x_test)

In [21]:
print('accuracy: %.4f' % accuracy_score(y_test, y_pred))
print('precision: %.4f' % precision_score(y_test, y_pred))
print('recall: %.4f' % recall_score(y_test, y_pred))
print('F1: %.4f' % f1_score(y_test, y_pred))


accuracy: 0.9510
precision: 0.9510
recall: 1.0000
F1: 0.9749


In [22]:
coef_pos_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse = True)
coef_neg_index = sorted(((value, index) for index, value in enumerate(lr.coef_[0])), reverse = False)
coef_pos_index



[(3.455618903956166, 1124),
 (1.485270622554382, 2822),
 (1.446908910098594, 1505),
 (1.125524989443121, 1925),
 (1.0044795916585298, 3209),
 (0.9075569646073368, 858),
 (0.7589151251894684, 1623),
 (0.7473406112963731, 2813),
 (0.7176702268441215, 2961),
 (0.7080026023567043, 3650),
 (0.6903591578451895, 3015),
 (0.664657718906575, 302),
 (0.6641202095828178, 920),
 (0.6525305524169959, 1627),
 (0.6488668456179623, 685),
 (0.6467758051868222, 3162),
 (0.6159659931768635, 3148),
 (0.6135398144573656, 2015),
 (0.6116910393155103, 1863),
 (0.6082570217388549, 135),
 (0.6023461424964773, 1595),
 (0.6017440552133119, 2385),
 (0.575043913741478, 2897),
 (0.5749750928076499, 3572),
 (0.5741055370154121, 2615),
 (0.5730098263618684, 1691),
 (0.5657911165392823, 3067),
 (0.5628208076961713, 1088),
 (0.5569647812505346, 395),
 (0.5556118735952207, 2210),
 (0.5437846554985025, 1118),
 (0.5402920700440034, 1091),
 (0.5213229590146676, 2340),
 (0.5071888867385177, 516),
 (0.5047088606204798, 2335)

In [23]:
for coef in coef_pos_index[:20]:
    print(invert_index_vectorizer[coef[1]], coef[0])


맛있다 3.455618903956166
적당하다 1.485270622554382
부드럽다 1.446908910098594
시원하다 1.125524989443121
최고 1.0044795916585298
두번째 0.9075569646073368
사다 0.7589151251894684
저렴하다 0.7473406112963731
주다 0.7176702268441215
향 0.7080026023567043
즐기다 0.6903591578451895
귀엽다 0.664657718906575
딱이다 0.6641202095828178
사람 0.6525305524169959
달달 0.6488668456179623
처음 0.6467758051868222
찾다 0.6159659931768635
쏘다 0.6135398144573656
술술 0.6116910393155103
걱정 0.6082570217388549


In [24]:
for coef in coef_neg_index[:20]:
    print(invert_index_vectorizer[coef[1]], coef[0])


별로 -2.2832943580167417
아니다 -2.079536568324191
깨다 -1.9807488679594574
맛없다 -1.8559751983158526
보통 -1.7952512887408703
느리다 -1.7867495404731069
나쁘다 -1.733794051124938
달다 -1.6903454816153995
입맛 -1.637404331688609
깊다 -1.6074632794589914
상태 -1.5810962399395483
생각 -1.5348256963684674
기대 -1.4981347230997453
살짝 -1.4426759956315818
기대하다 -1.4212566168094676
세다 -1.3748540009981243
아쉽다 -1.3240712925605183
그닥 -1.2718507151470624
가끔 -1.269894093496505
실망 -1.266350462502343


In [25]:
for coef in coef_neg_index[:]:
    print(invert_index_vectorizer[coef[1]], coef[0])

별로 -2.2832943580167417
아니다 -2.079536568324191
깨다 -1.9807488679594574
맛없다 -1.8559751983158526
보통 -1.7952512887408703
느리다 -1.7867495404731069
나쁘다 -1.733794051124938
달다 -1.6903454816153995
입맛 -1.637404331688609
깊다 -1.6074632794589914
상태 -1.5810962399395483
생각 -1.5348256963684674
기대 -1.4981347230997453
살짝 -1.4426759956315818
기대하다 -1.4212566168094676
세다 -1.3748540009981243
아쉽다 -1.3240712925605183
그닥 -1.2718507151470624
가끔 -1.269894093496505
실망 -1.266350462502343
실망하다 -1.1834434641454783
불다 -1.1717860829555733
평 -1.1684985372478363
뭐 -1.1430313766417226
취향 -1.0595494872107576
마심 -1.047202414258043
편하다 -1.0404646514509595
교환 -1.0315026675699743
복분자주 -1.0155570941821648
오래 -1.0102953247952624
음 -0.9950975847724467
조심하다 -0.9521864973933555
지난번 -0.9513572072074412
걸리다 -0.9455440283802655
테 -0.9447569251193464
요약 -0.9311003401162966
솔직하다 -0.9276471093122007
평보 -0.9208641153887623
느끼다 -0.9138054168681669
덜하다 -0.9029046252810008
쥬스 -0.9026944717011106
안나 -0.8963284664139438
프로 -0.8885604272551608
약