# sentiment dictionary

In [29]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
def tf_extractor(corpus):  
    # returns a frequency-based DTM
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1,1)) 
    features = vectorizer.fit_transform(corpus) # transform texts to a frequency matrix
    return vectorizer, features  

In [3]:
def tfidf_extractor(corpus):
    # returns a tf-idf based DTM
    vectorizer = TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

## <br>
## naver_clova

In [112]:
path = r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data'
data = pd.read_csv(path + r'\playstore_review_naver_clova.csv', encoding='UTF-8')
comments = list(data['comment'])
ratings = list(data['ratings'])
data.shape

(2556, 5)

In [113]:
filtered_texts = []
filtered_labels = []

for text, score in zip(comments, ratings):
    if score == 3:
        continue
        
    # 평점 기준으로 문서에 label을 부여
    # 1 ~ 2 -> 부정, -1
    # 4 ~ 5 -> 긍정, 1
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 3 else -1)

In [114]:
# You can also use the following method
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)

In [115]:
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
# input의 형태 = list of docs
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]
# tf_vectorizer.vocabulary_.items() returns a list of (word, frequency)
# We sort words based on their frequencies and save the words

In [119]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=0.1, penalty='l2', solver='sag') # Lasso regression
# C = Inverse of regularization strength, 즉 C 값이 작을수록 penalty를 많이 준다는 것입니다.
# penalty를 많이 준다는 뜻은 L1 같은 경우는 feature의 수를 그만큼 많이 줄인다는 뜻이고
# L2인 경우는 weight 값을 더 0에 가깝게 한다는 뜻입니다.
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 178 out of 641
Accuracy: 0.72


In [38]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=0.1, penalty='l1', solver='saga') # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 304 out of 641
Accuracy: 0.53


In [41]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)
# 학습에 사용된 각 단어마다의 coefficient (즉 weight) 값이 존재
# coefficient값이 큰 순으로 정렬 'reverse=True'

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(7574, 1.039721417408199), (7600, 0.7724308659319985), (7561, 0.5665838283947136), (1049, 0.47337489103995595), (7547, 0.4468903456873619)]
좋아요 (1.040)
좋은데 (0.772)
좋네요 (0.567)
근데 (0.473)
좋겠어요 (0.447)
좋겠네요 (0.432)
좋겠습니다 (0.384)
같아요 (0.351)
있게 (0.320)
좋습니다 (0.318)
부르면 (0.315)
감사합니다 (0.311)
ㅎㅎ (0.302)
빅스비 (0.284)
있었으면 (0.278)
추가해주세요 (0.273)
하지만 (0.267)
합니다 (0.260)
해주세요 (0.252)
되면 (0.249)
있으면 (0.238)
good (0.231)
정말 (0.230)
빅스비보다 (0.219)
다좋은데 (0.218)
너무좋아요 (0.210)
클로바를 (0.209)
저는 (0.208)
시리처럼 (0.205)
좋고 (0.204)
좋은 (0.203)
재생이 (0.203)
더욱 (0.198)
좋음 (0.196)
주세요 (0.185)
좋을것 (0.184)
한가지 (0.183)
최고 (0.178)
있는데요 (0.173)
굿굿 (0.167)
연동 (0.159)
같습니다 (0.157)
하네요 (0.156)
아주 (0.155)
에서 (0.152)
쓰고 (0.151)
음성호출이 (0.148)
인공지능 (0.147)
이름이 (0.146)
좋겠다 (0.145)
샀는데 (-0.190)
도대체 (-0.194)
앱을 (-0.197)
음성 (-0.197)
같은데 (-0.198)
어플도 (-0.199)
스피커도 (-0.205)
혼자 (-0.207)
갑자기 (-0.208)
나오네요 (-0.209)
점점 (-0.209)
그냥 (-0.210)
없음 (-0.219)
해도 (-0.221)
안되는게 (-0.221)
어떻게 (-0.225)
너무 (-0.229)
프렌즈 (-0.229)
클로바 (-0.233)
않는 (-0.2

In [64]:
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\naver_clova.txt', 'w', encoding='utf-8') as f:
    for word, coef in sorted_coefficients:
        f.write(vocablist[word]+'\t'+str(coef)+'\n')
f.close()

## <br>
## kakao_mini

In [65]:
path = r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data'
data = pd.read_csv(path + r'\playstore_review_kakao_mini.csv', encoding='UTF-8')
comments = list(data['comment'])
ratings = list(data['ratings'])
data.shape

(578, 5)

In [66]:
filtered_texts = []
filtered_labels = []

for text, score in zip(comments, ratings):
    if score == 3:
        continue
        
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 3 else -1)

In [67]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]

In [71]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=0.1, penalty='l2', solver='sag') # Lasso regression
# C = Inverse of regularization strength, 즉 C 값이 작을수록 penalty를 많이 준다는 것입니다.
# penalty를 많이 준다는 뜻은 L1 같은 경우는 feature의 수를 그만큼 많이 줄인다는 뜻이고
# L2인 경우는 weight 값을 더 0에 가깝게 한다는 뜻입니다.
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 50 out of 149
Accuracy: 0.66


In [69]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=0.1, penalty='l1', solver='saga') # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 69 out of 149
Accuracy: 0.54


In [72]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(2523, 0.607919781186865), (2512, 0.3326869482487577), (2511, 0.25866577529010176), (2708, 0.24190978331699442), (2516, 0.18162850739750763)]
좋아요 (0.608)
좋겠어요 (0.333)
좋겠습니다 (0.259)
카카오미니 (0.242)
좋네요 (0.182)
기능이 (0.180)
좋고 (0.164)
음악 (0.159)
좋은데 (0.158)
있으면 (0.146)
업데이트 (0.145)
제가 (0.142)
ㅠㅠ (0.117)
기대합니다 (0.114)
근데 (0.112)
있습니다 (0.110)
노래도 (0.107)
잘쓰고 (0.106)
너무 (0.105)
유용하게 (0.105)
위젯이 (0.104)
부탁드려요 (0.104)
좋다 (0.102)
헤이카카오 (0.102)
지금은 (0.101)
해주세요 (0.098)
있어서 (0.097)
원하는 (0.096)
좋습니다 (0.095)
아쉽네요 (0.091)
제발 (0.090)
영어로 (0.089)
그게 (0.087)
베이스 (0.087)
음질도 (0.083)
신기하고 (0.080)
다른건 (0.080)
좋은데요 (0.080)
같아요 (0.079)
기능 (0.078)
앞으로 (0.078)
반응도 (0.076)
만족합니다 (0.075)
있었으면 (0.075)
하지 (0.074)
설정이 (0.074)
재생할때 (0.072)
빠른 (0.072)
다만 (0.072)
최고 (0.072)
미니 (-0.088)
사용 (-0.088)
같은 (-0.088)
와이파이도 (-0.089)
안잡히고 (-0.091)
이게 (-0.094)
잡음이 (-0.095)
불러도 (-0.095)
와이파이를 (-0.097)
없다고 (-0.099)
해결해주세요 (-0.102)
연결도 (-0.103)
공유기 (-0.104)
안되는데 (-0.104)
카카오 (-0.105)
있어요 (-0.105)
무슨 (-0.109)
불편해요 (-0.112)
바로 (-0.11

In [73]:
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\kakao_mini.txt', 'w', encoding='utf-8') as f:
    for word, coef in sorted_coefficients:
        f.write(vocablist[word]+'\t'+str(coef)+'\n')
f.close()

## <br>
## kt_gigagenie

In [74]:
path = r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data'
data = pd.read_csv(path + r'\playstore_review_kt_gigagenie.csv', encoding='UTF-8')
comments = list(data['comment'])
ratings = list(data['ratings'])
data.shape

(430, 5)

In [75]:
filtered_texts = []
filtered_labels = []

for text, score in zip(comments, ratings):
    if score == 3:
        continue
        
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 3 else -1)

In [76]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]

In [79]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=0.1, penalty='l2', solver='sag') # Lasso regression
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 32 out of 110
Accuracy: 0.71


In [78]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=0.1, penalty='l1', solver='saga') # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 55 out of 110
Accuracy: 0.50


In [80]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(1142, 0.6547985427460136), (1224, 0.28839288216950926), (1151, 0.21039604604230958), (1282, 0.13418928234346778), (460, 0.11387123016243433)]
좋아요 (0.655)
최고 (0.288)
좋음 (0.210)
편해요 (0.134)
매일 (0.114)
kt간편로그인에 (0.112)
좋다 (0.092)
좋네요 (0.091)
good (0.087)
좋은 (0.084)
듭니다 (0.083)
아쉽네요 (0.082)
간편하고 (0.079)
제가 (0.071)
아주 (0.064)
우리집 (0.063)
근데 (0.059)
등록된 (0.056)
아이디가 (0.056)
아이디를 (0.056)
좋습니다 (0.048)
취향 (0.048)
너무감사쥬쥬 (0.048)
말귀를잘알아들음 (0.048)
기가지니가티비아니면핸드폰에나오는데난기가지니를처음써본다 (0.048)
김정순 (0.048)
좋아좋아 (0.048)
넘좋아요ㅎㅎ (0.048)
교통상황빨리 (0.048)
편함 (0.048)
멋져뿌러 (0.048)
조찬복 (0.048)
재재가재재를먹어부렸어 (0.048)
좋아ㅡ (0.048)
네좋아요 (0.048)
다좋음 (0.048)
넘냐ㅡㅎ은것 (0.048)
좋을것갔다 (0.048)
좋아여 (0.048)
지니를궃이않불러도돼서편리해요 (0.048)
조아 (0.048)
헤헤 (0.048)
오케 (0.048)
감사 (0.048)
간단하네ㅎㅎ (0.048)
디ㅡㄱㅅㄱㅅ (0.048)
편리해요 (0.048)
ㅠㅠ5555 (0.048)
기가지니는참말을잘듣습니다 (0.048)
않되는데 (0.048)
쓰고 (-0.077)
굉장히 (-0.077)
지금 (-0.078)
담거나 (-0.079)
해주세요 (-0.079)
노래가 (-0.081)
정말 (-0.081)
권한 (-0.082)
장난까냐 (-0.082)
기능이 (-0.082)
늘려주세요 (-0.083)
ㅡㅡ (-0.083)
갑자기 (-0.083)
쓸데

In [81]:
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\kt_gigagenie.txt', 'w', encoding='utf-8') as f:
    for word, coef in sorted_coefficients:
        f.write(vocablist[word]+'\t'+str(coef)+'\n')
f.close()

## <br>
## skt_nugu

In [82]:
path = r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data'
data = pd.read_csv(path + r'\playstore_review_skt_nugu.csv', encoding='UTF-8')
comments = list(data['comment'])
ratings = list(data['ratings'])
data.shape

(1940, 5)

In [83]:
filtered_texts = []
filtered_labels = []

for text, score in zip(comments, ratings):
    if score == 3:
        continue
        
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 3 else -1)

In [84]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]

In [87]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=0.1, penalty='l2', solver='sag') # Lasso regression
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 134 out of 488
Accuracy: 0.73


In [86]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=0.1, penalty='l1', solver='saga') # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 175 out of 488
Accuracy: 0.64


In [88]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(5314, 1.1386396414224107), (5308, 0.4173584755355908), (4866, 0.33210715718373546), (5311, 0.32803802155508194), (3434, 0.31923048287435984)]
좋아요 (1.139)
좋네요 (0.417)
있습니다 (0.332)
좋습니다 (0.328)
아리아 (0.319)
좋겠습니다 (0.307)
good (0.286)
좋겠어요 (0.276)
최고 (0.268)
기능이 (0.260)
좋음 (0.241)
감사합니다 (0.233)
있으면 (0.220)
좋은 (0.202)
좋은데 (0.199)
편해요 (0.195)
많이 (0.193)
사용하고 (0.175)
기대됩니다 (0.162)
기능은 (0.161)
생각보다 (0.148)
nugu (0.144)
너무좋아요 (0.144)
정말 (0.142)
랜덤 (0.141)
아주 (0.135)
있어요 (0.135)
기능들이 (0.133)
혹시 (0.129)
추가해주세요 (0.128)
있어서 (0.125)
제가 (0.122)
편리해요 (0.120)
있도록 (0.119)
아주좋아요 (0.118)
좋아 (0.117)
다좋은데 (0.116)
있었으면 (0.115)
난리 (0.115)
잘쓰고 (0.114)
최고의 (0.113)
있는데 (0.112)
크게 (0.112)
조아요 (0.111)
편하고 (0.111)
좋겠네요 (0.111)
있는 (0.109)
좀더 (0.107)
보입니다 (0.107)
유튜브 (0.106)
연결을 (-0.166)
끊겨서 (-0.167)
처럼 (-0.168)
사용 (-0.171)
안되요 (-0.171)
제대로 (-0.171)
접속이 (-0.171)
앱은 (-0.174)
멜론을 (-0.177)
진짜 (-0.178)
무조건 (-0.185)
아무리 (-0.185)
연결할수 (-0.186)
인터넷 (-0.188)
멜론 (-0.188)
없네요 (-0.188)
업데이트 (-0.188)
누르면 (-0.190)
말을 (-0.193)
나

In [89]:
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\skt_nugu.txt', 'w', encoding='utf-8') as f:
    for word, coef in sorted_coefficients:
        f.write(vocablist[word]+'\t'+str(coef)+'\n')
f.close()

## <br>
## clova + nugu + gigagenie + mini

In [169]:
path = r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data'
data1 = pd.read_csv(path + r'\playstore_review_naver_clova.csv', encoding='UTF-8')
data2 = pd.read_csv(path + r'\playstore_review_skt_nugu.csv', encoding='UTF-8')
data3 = pd.read_csv(path + r'\playstore_review_kt_gigagenie.csv', encoding='UTF-8')
data4 = pd.read_csv(path + r'\playstore_review_kakao_mini.csv', encoding='UTF-8')
comments = list(data1['comment']) + list(data2['comment']) + list(data3['comment']) + list(data4['comment'])
ratings = list(data1['ratings']) + list(data2['ratings']) + list(data3['ratings']) + list(data4['ratings']) 

In [170]:
filtered_texts = []
filtered_labels = []

for text, score in zip(comments, ratings):
    if score == 3:
        continue
        
    filtered_texts.append(text)
    filtered_labels.append(1 if score > 3 else -1)

In [171]:
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(filtered_texts, filtered_labels, test_size=0.3, random_state=0)
tf_vectorizer, train_tf_features = tf_extractor(train_texts)
test_tf_features = tf_vectorizer.transform(test_texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]

### Test accuracy <br>
- naver_clova : 0.72 (641) <br>
- skt_nugu : 0.73 (488) <br>
- kt_gigagenie : 0.71 (110) <br>
- kakao_mini : 0.66 (149) <br>

In [123]:
# C : 0.1 -> 1 변경시 accuracy 증가

In [177]:
# tf matrix를 사용한 경우
lr = LogisticRegression(C=1, penalty='l2', solver='sag') # Lasso regression
lr.fit(train_tf_features, train_labels) # 학습
pred_labels = lr.predict(test_tf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 298 out of 1387
Accuracy: 0.79


In [173]:
# tfidf matrix를 사용한 경우
tfidf_vectorizer, train_tfidf_features = tfidf_extractor(train_texts)
test_tfidf_features = tfidf_vectorizer.transform(test_texts)
lr = LogisticRegression(C=1, penalty='l1', solver='saga') # Lasso regression
lr.fit(train_tfidf_features, train_labels) # 학습
pred_labels = lr.predict(test_tfidf_features)
print('Misclassified samples: {} out of {}'.format((pred_labels != test_labels).sum(),len(test_labels)))
print('Accuracy: %.2f' % accuracy_score(test_labels, pred_labels))

Misclassified samples: 348 out of 1387
Accuracy: 0.75


In [178]:
# Get coefficients of the model
coefficients = lr.coef_.tolist()

sorted_coefficients = sorted(enumerate(coefficients[0]), key=lambda x:x[1], reverse=True)

print(sorted_coefficients[:5])
# print top 50 positive words
for word, coef in sorted_coefficients[:50]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))
# print top 50 negative words
for word, coef in sorted_coefficients[-50:]:
    print('{0:} ({1:.3f})'.format(vocablist[word], coef))

[(13457, 2.9526365397046006), (13502, 2.2516683380580385), (372, 1.8489078844881726), (13439, 1.8166668418466116), (13525, 1.7243950817841833)]
좋아요 (2.953)
좋은데 (2.252)
good (1.849)
좋네요 (1.817)
좋음 (1.724)
좋습니다 (1.671)
최고 (1.620)
감사합니다 (1.574)
너무좋아요 (1.445)
근데 (1.326)
있습니다 (1.307)
아주좋아요 (1.234)
좋아 (1.183)
추가해주세요 (1.179)
다좋은데 (1.155)
편해요 (1.078)
좋고 (1.073)
감사 (1.064)
같아요 (1.045)
좋은 (1.038)
굿굿 (1.002)
제가 (0.989)
최고의 (0.987)
좋다 (0.979)
좋네 (0.956)
잘쓰고 (0.931)
최고에요 (0.913)
쵝오 (0.905)
좋겠네요 (0.905)
아리아 (0.901)
그런데 (0.891)
nugu (0.888)
조아요 (0.887)
빅스비보다 (0.882)
좋겠어요 (0.873)
있었으면 (0.865)
그래도 (0.845)
해제 (0.804)
부탁드려요 (0.803)
다만 (0.775)
없어도 (0.773)
되면 (0.764)
있는데요 (0.750)
유용하게 (0.749)
오류좀 (0.731)
신기하네요 (0.719)
좋겠습니다 (0.713)
가끔 (0.710)
기대합니다 (0.707)
잘되는데 (0.705)
말귀를 (-0.762)
답답하네요 (-0.764)
어플도 (-0.766)
티비 (-0.779)
모르겠어요 (-0.781)
구매했는데 (-0.791)
연동이 (-0.793)
절대 (-0.795)
한참 (-0.799)
네트워크가 (-0.809)
안되는데 (-0.813)
음성 (-0.820)
오류로 (-0.829)
말을 (-0.832)
쓰레기 (-0.844)
연결안됨 (-0.850)
스피커 (-0.859)
제발 (-0.867)
멈춤 

In [128]:
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\senti_dictionary.txt', 'w', encoding='utf-8') as f:
    for word, coef in sorted_coefficients:
        f.write(vocablist[word]+'\t'+str(coef)+'\n')
f.close()

In [13]:
import re 
sent_dict = {}
with open(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\sentiment_dictionary\senti_dictionary.txt', 'r', encoding='utf-8') as f:
    for i in f.readlines():
        word, score = i.split('\t')
        score = re.sub('\n', '', score)
        sent_dict[word] = float(score)
f.close()

In [33]:
sent_dict

{'좋아요': 2.9527606222300258,
 '좋은데': 2.251907199235719,
 'good': 1.8491984037755202,
 '좋네요': 1.8169588665472602,
 '좋음': 1.7246830948415321,
 '좋습니다': 1.6709440517110479,
 '최고': 1.6201384608282334,
 '감사합니다': 1.5747820031620254,
 '너무좋아요': 1.4445872355522489,
 '근데': 1.326636953086338,
 '있습니다': 1.3044598557124025,
 '아주좋아요': 1.2346810317447146,
 '좋아': 1.182774871944742,
 '추가해주세요': 1.179038258497266,
 '다좋은데': 1.1551426728890044,
 '편해요': 1.0784052977699103,
 '좋고': 1.07381743810381,
 '감사': 1.064778009755287,
 '같아요': 1.0454006563436928,
 '좋은': 1.0383077433608938,
 '굿굿': 1.002114292889251,
 '제가': 0.9887222692231149,
 '최고의': 0.9868726272030411,
 '좋다': 0.9793795124540977,
 '좋네': 0.9563002345487744,
 '잘쓰고': 0.9314680738978124,
 '최고에요': 0.913119329239129,
 '쵝오': 0.9059027051729514,
 '좋겠네요': 0.9053388771286865,
 '아리아': 0.9017962345978487,
 '그런데': 0.8899175643339942,
 'nugu': 0.8885840914486435,
 '조아요': 0.8870554536820461,
 '빅스비보다': 0.8825332568884904,
 '좋겠어요': 0.8727164780987425,
 '있었으면': 0.86555737862

In [179]:
sent_dict = {}
for word, coef in sorted_coefficients:
    sent_dict[vocablist[word]] = coef
sent_dict

{'좋아요': 2.9526365397046006,
 '좋은데': 2.2516683380580385,
 'good': 1.8489078844881726,
 '좋네요': 1.8166668418466116,
 '좋음': 1.7243950817841833,
 '좋습니다': 1.6710564911966266,
 '최고': 1.6198631536759365,
 '감사합니다': 1.5744289983787203,
 '너무좋아요': 1.4445207357646836,
 '근데': 1.3259092550497764,
 '있습니다': 1.3068808073905174,
 '아주좋아요': 1.2344609762629262,
 '좋아': 1.1826321553619878,
 '추가해주세요': 1.1788512496661605,
 '다좋은데': 1.1552033651041658,
 '편해요': 1.078104430783355,
 '좋고': 1.0731711592158804,
 '감사': 1.0642715017158606,
 '같아요': 1.045057518697363,
 '좋은': 1.0376158331857264,
 '굿굿': 1.0017470375943998,
 '제가': 0.9891542349881555,
 '최고의': 0.9869545265957445,
 '좋다': 0.9791651319408108,
 '좋네': 0.9564719608267904,
 '잘쓰고': 0.9311742073965216,
 '최고에요': 0.9128656680345015,
 '쵝오': 0.9054240777772419,
 '좋겠네요': 0.9050567615154744,
 '아리아': 0.9013321454268016,
 '그런데': 0.8910966324881633,
 'nugu': 0.8882264637177401,
 '조아요': 0.8870544501824127,
 '빅스비보다': 0.8821219668651791,
 '좋겠어요': 0.8725736657432833,
 '있었으면': 0.8653

# <br>
# wordcloud

### 감성사전으로 긍정, 부정, 중립 문헌 나누기

In [11]:
df = pd.read_csv(r'C:\Users\sormd\Documents\GitHub\ai_speaker_textmining\data\naver_blog\naverclova_speaker_blog.csv', encoding='UTF-16')
texts = list(df['text'].dropna())
texts = list(map(lambda x: x.replace('\n', ' ').replace('\r', ' '), texts))

In [12]:
tf_vectorizer, tf_features = tf_extractor(texts)
vocablist = [word for word, _ in sorted(tf_vectorizer.vocabulary_.items(), key=lambda x:x[1])]

In [14]:
sent_score = []
for i in vocablist:
    if i in sent_dict:
        sent_score.append(sent_dict[i])
    else:
        sent_score.append(0)

In [15]:
tf_score = []
for i in tf_features:
    tf_score.append(i.dot(sent_score)[0])
tf_score

[16.503492987043913,
 -5.997952963612364,
 0.5634197401186769,
 -4.0075183326530865,
 21.883267509417756,
 -0.7450948700129161,
 3.370555175226662,
 -29.432248760299643,
 5.106500916442445,
 12.085252988401644,
 -12.757437910475323,
 -0.6328930921061193,
 2.490329224779809,
 -0.9271787146144134,
 -15.104426972324632,
 7.373917864793675,
 -11.69013738178129,
 -5.330093455071581,
 4.996017020404211,
 -6.226069167343789,
 3.5322651388350064,
 7.138603539104491,
 2.8386760214406537,
 -5.928354130525635,
 -16.696745734028614,
 -4.17613264858293,
 -1.7600481357720967,
 8.358776156142511,
 -4.914627581857023,
 -2.7850863004763236,
 -32.363544268906445,
 3.0966924964507285,
 -11.534721979692113,
 22.72634797982358,
 -10.092595918498818,
 3.253946689379105,
 -3.460767764880818,
 -4.29908554726237,
 -5.848964831508074,
 3.3457090624423214,
 -3.943167797542783,
 -3.4646856458959547,
 -1.717739368404886,
 4.517915351777255,
 -0.5638340801130165,
 23.2635015022667,
 -2.06691098532715,
 12.678208926

In [16]:
pos = []
neg = []
neu = []
for num, score in enumerate(tf_score):
    if score > 0:
        pos.append(texts[num])
    elif score < 0:
        neg.append(texts[num])
    else:
        neu.append(texts[num])

### Dead kernel... 일단 저장..

In [17]:
with open('pos_texts.txt', 'w', encoding='utf-8') as p:
    for i in pos:
        p.write(i+'\n')
p.close()
len(pos)

1848

In [49]:
with open('neg_texts.txt', 'w', encoding='utf-8') as n:
    for i in neg:
        n.write(i+'\n')
n.close()
len(neg)

1555

In [50]:
with open('neu_texts.txt', 'w', encoding='utf-8') as ne:
    for i in neu:
        ne.write(i+'\n')
ne.close()
len(neu)

2

In [2]:
pos = []
with open('pos_texts.txt', 'r', encoding='utf-8') as p:
    for i in p.readlines():
        pos.append(i.split('\n')[0])
len(pos)

1848

In [3]:
neg = []
with open('neg_texts.txt', 'r', encoding='utf-8') as n:
    for i in n.readlines():
        neg.append(i.split('\n')[0])
len(neg)

1555

In [4]:
neu = []
with open('neu_texts.txt', 'r', encoding='utf-8') as ne:
    for i in ne.readlines():
        neu.append(i.split('\n')[0])
len(neu)

2

### positive document

In [44]:
from tqdm import tqdm_notebook

In [5]:
# kornoun extractor로 명사 추출 for 미등록 단어
from kornounextractor.noun_extractor import extract
extract_texts = list(map(extract, pos))
extract_texts = sum(extract_texts, [])

In [6]:
with open("./dic_pos.txt", 'w', encoding='UTF-8') as f:
    for i in extract_texts:
        f.write(i+'\tNNG\n')

In [None]:
# 미등록단어 지정하여 Komoran 형태소 분석 
import konlpy.tag
komoran = konlpy.tag.Komoran()

dicpath = './dic_pos.txt'
komoran = konlpy.tag.Komoran(userdic= dicpath)
komo_nouns = list(map(komoran.nouns, pos))

In [None]:
def remove_one_length(x):
    return len(x)>1

filtered_nouns = []
for noun in komo_nouns:
    filtered_noun = list(filter(remove_one_length, noun))
    filtered_nouns.append(filtered_noun)

In [None]:
# 불용어 제거
with open(r'C:\Users\sormd\Desktop\수업_1학기\기계학습\과제\stopwordsKor.txt', 'r', encoding='utf8') as f:
    stopwords = f.read()

def remove_stopwords(x):
    for word in x:
        if word in stopwords:
            x.remove(word)
    return(x)

cleaned_nouns = list(map(remove_stopwords, filtered_nouns))

In [54]:
import collections
sum_nouns = sum(cleaned_nouns,[])
count_nouns = collections.Counter(sum_nouns)

In [55]:
collections.OrderedDict(count_nouns.most_common())

OrderedDict([('클로바', 5347),
             ('스피커', 4878),
             ('네이버', 3780),
             ('사용', 3093),
             ('인공지능', 2760),
             ('가능', 2667),
             ('기능', 2386),
             ('서비스', 1870),
             ('AI', 1684),
             ('프렌즈', 1586),
             ('음악', 1567),
             ('제품', 1453),
             ('연결', 1445),
             ('보다', 1384),
             ('함께', 1367),
             ('IoT', 1178),
             ('부터', 1160),
             ('정보', 1115),
             ('설정', 1108),
             ('이용', 1089),
             ('확인', 1026),
             ('활용', 983),
             ('기술', 978),
             ('LG', 956),
             ('필요', 934),
             ('스마트', 930),
             ('편리', 925),
             ('음성', 916),
             ('오늘', 915),
             ('노래', 913),
             ('영어', 908),
             ('아이들', 848),
             ('브라운', 844),
             ('검색', 838),
             ('블루투스', 832),
             ('버튼', 830),
             ('콘텐츠', 823),
   

In [56]:
sorted(count_nouns.items(), key=lambda i: i[1])

[('충전용USB', 1),
 ('LED시계', 1),
 ('Radio', 1),
 ('Buzzer', 1),
 ('설정시간', 1),
 ('부저음', 1),
 ('시간차', 1),
 ('부저', 1),
 ('Sleep버튼', 1),
 ('영어방송', 1),
 ('충전상태', 1),
 ('연애통화', 1),
 ('꿀기능', 1),
 ('AAA배터리', 1),
 ('설정사항', 1),
 ('용도예요', 1),
 ('검색유입', 1),
 ('와글와글', 1),
 ('음악취향', 1),
 ('스푼', 1),
 ('spoon', 1),
 ('라디오판', 1),
 ('클립네이버', 1),
 ('음성판', 1),
 ('팟빵팟캐스트', 1),
 ('Box3', 1),
 ('Vox음악', 1),
 ('사운드클라우드', 1),
 ('Mixerbox', 1),
 ('evermusic', 1),
 ('pro클라우드', 1),
 ('mp3몽땅', 1),
 ('cnspower', 1),
 ('은행업무', 1),
 ('입출금', 1),
 ('외부약속', 1),
 ('출타', 1),
 ('텔레비시청', 1),
 ('온라인고스톱', 1),
 ('베타버젼', 1),
 ('인식자체', 1),
 ('구글음성', 1),
 ('배우이름', 1),
 ('검색하기위', 1),
 ('사드린건데', 1),
 ('거주환경', 1),
 ('백 투 더 퓨처', 1),
 ('로버트', 1),
 ('저메키스', 1),
 ('바이센테니얼맨', 1),
 ('콜럼버스', 1),
 ('면면', 1),
 ('컬처', 1),
 ('CUMI', 1),
 ('포켓린트', 1),
 ('국제가전박람회', 1),
 ('1924년', 1),
 ('산업전시', 1),
 ('가전전시회', 1),
 ('smartphone4europe', 1),
 ('scoop', 1),
 ('자연언어', 1),
 ('Thinkstock', 1),
 ('shutterstock', 1),
 ('음향학', 1),
 ('지멘스', 1),
 ('보쉬', 1),
 

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 그래프에 retina display 적용
#%config InlineBackend.figure_format = 'retina'

# 나눔고딕 설치
!apt -qq -y install fonts-nanum > /dev/null
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)

In [None]:
wordcloud = WordCloud(
    font_path = fontpath,
    width = 800,
    height = 600,
    #stopwords = lines,
    background_color="white"
)

wordcloud = wordcloud.generate_from_frequencies(word_freq)

plt.figure(figsize=(15, 11))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()