# 7. 감성분석
- ### 감성분석 : 텍스트에 나타난 주관적 요소를 분석하여 긍정,부정의 요소 및 그 정도를 판별하여 정량화하는 기법
- ### 긍정과 부정을 판별할 뿐 아니라 긍정,부정의 대상이 되는 단어 또는 개체 를 추출하고 감성을 표현하는 이의 의도나 입장을 분석하는 것도 포함하는 개념

1. 단어사전기반 분석 - 감성사전을 이용하여 각 단어의 감정 분류와 그 정도를 알 수 있어야 함

In [1]:
# 텍스트와 감성지수가 사전에 정의되어 있어야 함
import glob
from afinn import Afinn
# imdb 데이터셋 5만건의 학습용, 검증용 데이터셋 긍정, 부정 리뷰로 라벨링되어 있음.
# 긍정리뷰데이터 20번째 내용
# glob.glob 특정한 패턴의 파일만 선택하는 함수
pos_review = (glob.glob("../data/imdb/train/pos/*.txt"))[20]
f = open(pos_review, 'r')
lines1 = f.readlines()[0] # 파일을 읽음
f.close()


In [2]:
# 감성분석 객체
afinn = Afinn()

# 텍스트 전처리 후 감성점수 산출
afinn.score(lines1)

7.0

In [3]:
files=list(glob.glob('../data/imdb/train/pos/*.txt')[:10])
files

['../data/imdb/train/pos\\0_9.txt',
 '../data/imdb/train/pos\\10000_8.txt',
 '../data/imdb/train/pos\\10001_10.txt',
 '../data/imdb/train/pos\\10002_7.txt',
 '../data/imdb/train/pos\\10003_8.txt',
 '../data/imdb/train/pos\\10004_8.txt',
 '../data/imdb/train/pos\\10005_7.txt',
 '../data/imdb/train/pos\\10006_7.txt',
 '../data/imdb/train/pos\\10007_7.txt',
 '../data/imdb/train/pos\\10008_7.txt']

In [4]:
# 학습용 긍정리뷰 10개 파일만 테스트
afinn=Afinn() # 감성분석 함수
for i in files:
    f=open(i) # 파일 오픈
    lines1=f.readlines()[0] # 리스트의 첫번째 문자열
    print(afinn.score(lines1)) # 감성점수
    f.close()
# 부정리뷰데이터 20번째 내용
neg_review=(glob.glob("../data/imdb/train/neg/*.txt"))[20]

-1.0
2.0
19.0
3.0
14.0
8.0
22.0
28.0
13.0
5.0


In [5]:
f = open(neg_review, 'r')
lines2 = f.readlines()[0]
f.close()

In [6]:
afinn.score(lines2)

-4.0

In [7]:
files=list(glob.glob('../data/imdb/train/neg/*.txt')[:10])
files

['../data/imdb/train/neg\\0_3.txt',
 '../data/imdb/train/neg\\10000_4.txt',
 '../data/imdb/train/neg\\10001_4.txt',
 '../data/imdb/train/neg\\10002_1.txt',
 '../data/imdb/train/neg\\10003_1.txt',
 '../data/imdb/train/neg\\10004_3.txt',
 '../data/imdb/train/neg\\10005_3.txt',
 '../data/imdb/train/neg\\10006_4.txt',
 '../data/imdb/train/neg\\10007_1.txt',
 '../data/imdb/train/neg\\10008_2.txt']

In [8]:
# 학습용 부정리뷰 10개 파일만 테스트
afinn=Afinn() # 감성분석 함수
for i in files:
    f=open(i) # 파일 오픈
    lines1=f.readlines()[0] # 리스트의 첫번째 문자열
    print(afinn.score(lines1)) # 감성점수
    f.close()

6.0
-4.0
9.0
5.0
-7.0
1.0
13.0
4.0
7.0
6.0


2. 기계학습으로 감성분석(시간이 매우 오래 걸림)

In [9]:
import glob

# 긍정 텍스트 로딩
pos_review=(glob.glob("../data/imdb/train/pos/*.txt")[:100])
lines_pos=[]
for i in pos_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_pos.append(temp)
        f.close()
    except :
        continue
len(lines_pos)

100

In [10]:
# 부정 텍스트 로딩
neg_review=(glob.glob("../data/imdb/train/neg/*.txt")[:100])
lines_neg=[]
for i in neg_review:
    try:
        f = open(i, 'r')
        temp = f.readlines()[0]
        lines_neg.append(temp)
        f.close()
    except :
        continue
len(lines_neg)

100

In [11]:
# 긍정,부정 리뷰를 합침
total_text=lines_pos+lines_neg
len(total_text)

200

In [12]:
import numpy as np
from nltk.corpus import stopwords

# 긍정,부정 클래스 라벨링
x = np.array(["pos", "neg"])
class_Index=np.repeat(x, [len(lines_pos), len(lines_neg)], axis=0)
stop_words = stopwords.words('english')

In [13]:
# 단어들에 Tfidf 가중치를 부여한 후 문서-단어 매트릭스로 바꿈
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(stop_words=stop_words).fit(total_text)
X_train_vectorized = vect.transform(total_text)
X_train_vectorized.index = class_Index

In [14]:
import pandas as pd
# 데이터프레임으로 변환
df = pd.DataFrame(X_train_vectorized.toarray(), columns=vect.vocabulary_.keys())
df.head()

Unnamed: 0,bromwell,high,cartoon,comedy,ran,time,programs,school,life,teachers,35,years,teaching,profession,lead,believe,satire,much,closer,reality,scramble,survive,financially,insightful,students,see,right,pathetic,pomp,pettiness,whole,situation,remind,schools,knew,saw,episode,student,repeatedly,tried,...,spell,romania,voodooism,anticipation,clawing,covered,roadside,graves,prettier,glamor,generously,airbrushed,models,runway,borrows,lauded,countryman,injects,euro,techno,prehistoric,electronic,bumblebee,noise,ibiza,disco,shake,booty,function,er,zombified,auteur,ample,opportunities,golden,geist,uttered,downloading,midget,tricking
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033873,0.0,0.0,0.0,0.0,0.0,0.0,0.037815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
for idx,value in enumerate(X_train_vectorized[0].toarray()[0]):
    if value>0:
        print(idx, value)

64 0.10075971571307776
189 0.10075971571307776
212 0.09519036398460193
583 0.06607233108770873
770 0.4344371044102009
813 0.10075971571307776
889 0.09087043917100991
1034 0.07745152705306162
1062 0.09519036398460193
1101 0.07093762711956966
1916 0.09087043917100991
2029 0.07093762711956966
2121 0.06502562223418444
2167 0.10860927610255022
2202 0.10075971571307776
2719 0.34800983331794577
2895 0.10075971571307776
3000 0.10860927610255022
3005 0.10860927610255022
3239 0.07949124320565701
3310 0.09087043917100991
3377 0.04921361834843082
3397 0.06835253974870535
3553 0.04749164856850098
3818 0.040058435364654435
4021 0.03193031768060858
4168 0.09087043917100991
4231 0.10860927610255022
4277 0.10075971571307776
4347 0.10860927610255022
4469 0.10075971571307776
4476 0.10860927610255022
4606 0.10075971571307776
4652 0.09519036398460193
4664 0.10860927610255022
4745 0.09087043917100991
4760 0.10075971571307776
4860 0.0630880667300972
4944 0.10860927610255022
4980 0.10860927610255022
4989 0.06

- 로지스틱 회귀 모형

In [15]:
# 로지스틱 회귀 모형
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(random_state=10)
logit.fit(X_train_vectorized, class_Index)

LogisticRegression(random_state=10)

In [33]:
# 긍정 리뷰들을 하나씩 불러와서 실험
def pos_review(model):
    count_all = 0
    count = 0
    num = 100
    tests1 = []
    for idx in range(0, num):
        pos_review_test = (glob.glob("../data/imdb/test/pos/*.txt"))[idx]
        f = open(pos_review_test, 'r', encoding='utf-8')
        tests1.append(f.readlines())
        f.close()
    for test in tests1:
        pred = model.predict(vect.transform(test))
        result = pred[0]
        if result == "pos":
            count += 1
        count_all += 1
    rate = count * 100 / count_all
    print(f"긍정정확도:{rate:.1f}%")

In [34]:
# 부정 리뷰들을 하나씩 불러와서 실험
def neg_review(model):
    count_all = 0
    count = 0
    num = 100
    tests2 = []
    for idx in range(0, num):
        neg_review_test=(glob.glob("../data/imdb/test/neg/*.txt"))[idx]
        f = open(neg_review_test, 'r', encoding="utf-8")
        tests2.append(f.readlines())
        f.close()
    for test in tests2:
        preds = model.predict(vect.transform(test))
        result=preds[0]
        if result == "neg":
            count += 1
        count_all += 1
    rate= count * 100 / count_all
    print("부정정확도:{0:.1f}%".format(rate))
pos_review(logit)
neg_review(logit)

긍정정확도:66.0%
부정정확도:82.0%


- 의사결정나무 모형

In [35]:
# 의사결정나무 모형
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=10)
tree.fit(X_train_vectorized, class_Index)
pos_review(tree)
neg_review(tree)

긍정정확도:54.0%
부정정확도:54.0%


- K최근접이웃 모형

In [36]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_vectorized, class_Index)
pos_review(knn)
neg_review(knn)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


긍정정확도:34.0%
부정정확도:85.0%


- 랜덤포레스트

In [37]:
# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier
# 10개의 트리로 구성된 랜덤 포레스트
forest = RandomForestClassifier(n_estimators=10,
random_state=10)
forest.fit(X_train_vectorized, class_Index)
pos_review(forest)
neg_review(forest)

긍정정확도:46.0%
부정정확도:70.0%


- 인공신경망

In [38]:
# 인공신경망
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(random_state=10)
mlp.fit(X_train_vectorized, class_Index)
pos_review(mlp)
neg_review(mlp)

긍정정확도:63.0%
부정정확도:76.0%


- SVM 모형

In [39]:
# SVM 모형
from sklearn.svm import SVC
svm = SVC(random_state=10)
svm.fit(X_train_vectorized, class_Index)
pos_review(svm)
neg_review(svm)

긍정정확도:63.0%
부정정확도:86.0%


- 나이브배이즈 모형

In [40]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
train = [
        ('I love this sandwich.', 'pos'),
        ('This is an amazing place!', 'pos'),
        ('I feel very good about these beers.', 'pos'),
        ('This is my best work.', 'pos'),
        ('What an awesome view', 'pos'),
        ('I do not like this restaurant', 'neg'),
        ('I am tired of this stuff.', 'neg'),
        ("I can't deal with this", 'neg'),
        ('He is my sworn enemy!', 'neg'),
        ('My boss is horrible.', 'neg')
    ]
# 한글도 가능하다.
test = [
        ('The beer was good.', 'pos'),
        ('I do not enjoy my job', 'neg'),
        ('I am not feeling dandy today.', 'neg'),
        ('I feel amazing!', 'pos'),
        ('Gary is a friend of mine.', 'pos'),
        ("I can't believe I'm doing this.", 'neg')
    ]
cl = NaiveBayesClassifier(train)
print(cl.classify('Their burgers are amazing'))
print(cl.classify("I don't like their pizza."))

pos
neg


In [41]:
# 여러 문장을 종합하여 부정으로 분류
blob = TextBlob("The beer was amazing. But the hangover was horrible. My boss was not happy.", classifier=cl)
blob.classify()

'neg'

In [42]:
# 개별 문장으로 분류
for sentence in blob.sentences:
    print(sentence, '==>', sentence.classify())

# "pos", "neg", "neg"
for row in test:
    print(row[0], '==>', cl.classify(row[0]))

cl.accuracy(test)

The beer was amazing. ==> pos
But the hangover was horrible. ==> neg
My boss was not happy. ==> neg
The beer was good. ==> pos
I do not enjoy my job ==> neg
I am not feeling dandy today. ==> neg
I feel amazing! ==> pos
Gary is a friend of mine. ==> neg
I can't believe I'm doing this. ==> neg


0.8333333333333334

In [43]:
cl.show_informative_features(5)
# this가 포함된 경우 부정:긍정 = 2.3:1.0
# this가 포함되지 않은 경우 긍정:부정 = 1.8:1.0

Most Informative Features
          contains(this) = True              neg : pos    =      2.3 : 1.0
          contains(this) = False             pos : neg    =      1.8 : 1.0
          contains(This) = False             neg : pos    =      1.6 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0
