### 텍스트 정규화

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset='all', random_state=156)
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [None]:
# bunch - scikit-learn에서 사용되는 딕셔너리와 유사한 객체임, 뜻은 '다량의 것'. '무리'

type(fetch_20newsgroups())

sklearn.utils._bunch.Bunch

In [None]:
import pandas as pd

print('target 클래스의 값과 분포도\n', pd.Series(news_data.target).value_counts().sort_index())
print('target 클래스의 이름들 \n', news_data.target_names)

target 클래스의 값과 분포도
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [None]:
len(news_data.data)

18846

In [None]:
print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [None]:
# 기본 60% 40%로 나눠줌 (10%가 1883)
# 총 18846건임

from sklearn.datasets import fetch_20newsgroups

# subset='train'으로 학습용 데이터만 추출, remove=('headers, 'footers', 'quotes')로 내용만 추출
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)    # subset 기본이 train임

X_train = train_news.data
y_train = train_news.target

# subset='test'으로 테스트 데이터만 추출, remove=('headers, 'footers', 'quotes')로 내용만 추출
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)

X_test = test_news.data
y_test = test_news.target
print('학습 데이터 크기 {0}, 테스트 데이터 크기 {1}'.format(len(train_news.data), len(test_news.data)))

학습 데이터 크기 11314, 테스트 데이터 크기 7532


In [None]:
# !! 테스트 데이터에서 CountVectorizer를 적용할 때는 반드시 학습 데이터를 이용해 fit()이 수행된 CountVectorizer 객체를 이용해 테스트 데이터를 변환(transform)해야 함
# !!! 테스트 데이터의 피처 벡터화 시 fit_transform()을 사용하면 안 됨
# 둘 다 피처 개수가 안 맞는 문제가 발생함

from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 피처 벡터화 변환 수행.
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

# 학습 데이터로 fit()된 CountVectorizer를 이용해 테스트 데이터를 피터 벡처화 변환 수행.
X_test_cnt_vect = cnt_vect.transform(X_test)

print('학습 데이터 텍스트의 CountVectorizer Shape:', X_train_cnt_vect.shape)

학습 데이터 텍스트의 CountVectorizer Shape: (11314, 101631)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# LogisticRegression을 이용하여 학습/예측/평가 수행.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression의 예측 정확도는 0.617


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF_IDF 벡터화를 적용해 학습 데이터 세트와 테스트 데이터 세트 변환
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# LogisticRegression을 이용해 학습/예측/평가 수행.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Logistic Regression의 예측 정확도는 0.678


In [None]:
# stop words 필터링을 추가하고 ngrams을 기본 (1,1)에서 (1,2)로 변경해 피처 벡터화 적용.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train_tfidf_vect, y_train)

pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.690


In [None]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 폴드 세트로 설정.
params = { 'C':[0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistci Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 및 정확도 평가.
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Logistci Regression best C parameter: {'C': 10}
TF-IDF Vectorized Logistic Regression의 예측 정확도는 0.704


In [None]:
from sklearn.pipeline import Pipeline

# TfidfVectorizer 객체를 tfidf_vect로, LogisticRegression 객체를 lr_clf로 생성하는 Pipeline 생성
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)),
    ('lr_clf', LogisticRegression(solver='liblinear', C=10))
])

# 별도의 TfidfVectorizer 객체의 fit(), transform()과 LogisticRegression의 fit(), predict()가 필요 없음.
# pipeling의 fit()과 predict()만으로 한꺼번에 피처 벡터화와 ML 학습/예측이 가능.
pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
print('Pipeline을 통한 Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Pipeline을 통한 Logistic Regression의 예측 정확도는 0.704


In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

# Pipeline에 기술된 각각의 객체 변수에 언더바(_)2개를 연달아 붙여 GridSearchCV에 사용될
# 파라미터/하이퍼 파라미터 이름과 값을 설정. .
params = { 'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],
           'tfidf_vect__max_df': [100, 300, 700],
           'lr_clf__C': [1,5,10]
}

# GridSearchCV의 생성자에 Estimator가 아닌 Pipeline 객체 입력
grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3 , scoring='accuracy',verbose=1)
grid_cv_pipe.fit(X_train , y_train)
print(grid_cv_pipe.best_params_ , grid_cv_pipe.best_score_)

pred = grid_cv_pipe.predict(X_test)
print('Pipeline을 통한 Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


### 감성 분석

In [None]:
# quoting=3 : 모든 따옴표 표시 무시(pandas)
# header=0

import pandas as pd

review_df = pd.read_csv('../dataset/labeledTrainData.tsv', header=0, sep='\t', quoting=3)
# review_df = pd.read_csv('/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/labeledTrainData.tsv', header=0, sep='\t', quoting=3)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [None]:
print(review_df.shape)
print(review_df['review'][0])

(25000, 3)
"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it 

In [None]:
import re

# <br> html 태그는 replace 함수로 공백으로 변환
review_df['review'] = review_df['review'].str.replace('<br />', ' ')

# 파이썬의 정규 표현식 모듈인 re를 이용해 영어 문자열이 아닌 문자는 모두 공백으로 변환
review_df['review'] = review_df['review'].apply(lambda x : re.sub("[^a-zA-Z]", " ", x))

In [None]:
review_df['review'][0]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

In [None]:
from sklearn.model_selection import train_test_split

class_df = review_df['sentiment']
feature_df = review_df.drop(['id', 'sentiment'], axis=1, inplace=False)

X_train, X_test, y_train, y_test = train_test_split(feature_df, class_df, test_size=0.3, random_state=156)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# 스톱 워드는 English, filtering, ngram은 (1, 2)로 설정해 CountVectorization 수행.
# LogisticRegression의 C는 10으로 설정.
pipeline = Pipeline([
    ('cnt_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_cnt', LogisticRegression(solver='liblinear', C=10))
])

# Pipeline 객체를 이용해 fit(), predict()로 학습/예측 수행. predict_proba()는 roc_auc 때문에 수행.
# text 내용 자체가 들어가야 하기에 ['review'] -> 위에서는 .data로 내용만 X_train에 들어갔으니까
pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:,1]

print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs)))

예측 정확도는 0.8861, ROC-AUC는 0.9503


In [None]:
# 스톱 워드는 english, filtering, ngram은 (1, 2)로 설정해 TF-IDF 벡터화 수행.
# LogisticRegression의 C는 10으로 설정.

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_cnt', LogisticRegression(solver='liblinear', C=10))
])

pipeline.fit(X_train['review'], y_train)
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:,1]

print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test, pred), roc_auc_score(y_test, pred_probs)))

### 비지도학습 기반 감성 분석 소개

- NLP는 일반적인 용어이지만 여기서는 자연어 처리를 지원하는 여러 라이브러리를 묶어서 지칭
- NLTK:
    - NLTK는 파이썬의 가장 오래되고 대표적인 자연어 처리 라이브러리
    - 광범위한 기능을 제공하며, 특히 교육적인 목적으로 많이 사용
    - 토큰화, 어간 추출, 품사 태깅, 구문 분석 등 다양한 기능을 지원
    - NLTK는 또한 많은 언어 자료와 말뭉치를 포함

- WordNet은 어휘의 의미 사이의 관계를 기반으로 구조화된 영어 어휘 데이터베이스
- SentiWordNet은 WordNet의 확장판
    - 감성 분석을 위해 각각의 WordNet synset에 긍정적, 부정적, 중립적 점수를 할당
    - 이러한 점수는 해당 synset이나 연관 단어가 얼마나 긍정적, 부정적, 또는 중립적인지를 나타낸다.
    - 문장이나 문서의 전체적인 감성을 평가할 수 있고, 특정 단어나 표현이 문맥에 따라 어떤 감성적 효과를 가져오는지 이해

In [13]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
from nltk.corpus import wordnet as wn    # 몸체, 집합체, 말뭉치

term = 'present'

# 'present' 라는 단어로 wordnet의 synsets 생성.
synsets = wn.synsets(term)
print('synsets() 반환 type :', type(synsets))
print('synsets() 반환 값 개수:', len(synsets))
print('synsets() 반환 값:', synsets)

synsets() 반환 type : <class 'list'>
synsets() 반환 값 개수: 18
synsets() 반환 값: [Synset('present.n.01'), Synset('present.n.02'), Synset('present.n.03'), Synset('show.v.01'), Synset('present.v.02'), Synset('stage.v.01'), Synset('present.v.04'), Synset('present.v.05'), Synset('award.v.01'), Synset('give.v.08'), Synset('deliver.v.01'), Synset('introduce.v.01'), Synset('portray.v.04'), Synset('confront.v.03'), Synset('present.v.12'), Synset('salute.v.06'), Synset('present.a.01'), Synset('present.a.02')]


In [None]:
for synset in synsets:
    print('#### Synset name : ', synset.name(), '####')
    print('POD :', synset.lexname())
    print('Definition:', synset.definition())
    print('Lemmas:', synset.lemma_names())

#### Synset name :  present.n.01 ####
POD : noun.time
Definition: the period of time that is happening now; any continuous stretch of time including the moment of speech
Lemmas: ['present', 'nowadays']
#### Synset name :  present.n.02 ####
POD : noun.possession
Definition: something presented as a gift
Lemmas: ['present']
#### Synset name :  present.n.03 ####
POD : noun.communication
Definition: a verb tense that expresses actions or states at the time of speaking
Lemmas: ['present', 'present_tense']
#### Synset name :  show.v.01 ####
POD : verb.perception
Definition: give an exhibition of to an interested audience
Lemmas: ['show', 'demo', 'exhibit', 'present', 'demonstrate']
#### Synset name :  present.v.02 ####
POD : verb.communication
Definition: bring forward and present to the mind
Lemmas: ['present', 'represent', 'lay_out']
#### Synset name :  stage.v.01 ####
POD : verb.creation
Definition: perform (a play), especially on a stage
Lemmas: ['stage', 'present', 'represent']
#### Syn

In [None]:
# name() - WordNet에서 이름을 반환하는 메서드
# path_similarity() - 어휘 간의 유사도를 나타내주는 메서드

# synset 객체를 단어별로 생성합니다.
tree = wn.synset('tree.n.01')
lion = wn.synset('lion.n.01')
tiger = wn.synset('tiger.n.02')
cat = wn.synset('cat.n.01')
dog = wn.synset('dog.n.01')

entities = [tree , lion , tiger , cat , dog]
similarities = []
entity_names = [ entity.name().split('.')[0] for entity in entities]    #이름인 tree.n.01를 .으로 나눠서 tree만 선택

# 단어별 synset 들을 iteration 하면서 다른 단어들의 synset과 유사도를 측정합니다.
for entity in entities:
    similarity = [ round(entity.path_similarity(compared_entity), 2)  for compared_entity in entities ]
    similarities.append(similarity)

# 개별 단어별 synset과 다른 단어의 synset과의 유사도를 DataFrame형태로 저장합니다.
similarity_df = pd.DataFrame(similarities , columns=entity_names,index=entity_names)
similarity_df

Unnamed: 0,tree,lion,tiger,cat,dog
tree,1.0,0.07,0.07,0.08,0.12
lion,0.07,1.0,0.33,0.25,0.17
tiger,0.07,0.33,1.0,0.25,0.17
cat,0.08,0.25,0.25,1.0,0.2
dog,0.12,0.17,0.17,0.2,1.0


In [None]:
# senti_synsets - 단순한 하나의 단어가 아니라 그 단어가 가지는 문맥, 시멘틱 정보를 제공

import nltk
from nltk.corpus import sentiwordnet as swn

senti_synsets = list(swn.senti_synsets('slow'))
print('senti_synsets() 반환 type :', type(senti_synsets))
print('senti_synsets() 반환 값 갯수:', len(senti_synsets))
print('senti_synsets() 반환 값 :', senti_synsets)

senti_synsets() 반환 type : <class 'list'>
senti_synsets() 반환 값 갯수: 11
senti_synsets() 반환 값 : [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('dense.s.04'), SentiSynset('slow.a.04'), SentiSynset('boring.s.01'), SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')]


In [None]:
# 반복 가능한 객체에서 지정된 조건을 충족하는 요소들로 이루어진 새로운 필터링된 객체를 생성하는데 사용
swn.senti_synsets('slow')

<filter at 0x7f3dffab0790>

In [None]:
import nltk
from nltk.corpus import sentiwordnet as swn

father = swn.senti_synset('father.n.01')
print('father 긍정감성 지수: ', father.pos_score())
print('father 부정감성 지수: ', father.neg_score())
print('father 객관성 지수: ', father.obj_score())
print('\n')
fabulous = swn.senti_synset('fabulous.a.01')
print('fabulous 긍정감성 지수: ',fabulous .pos_score())
print('fabulous 부정감성 지수: ',fabulous .neg_score())

father 긍정감성 지수:  0.0
father 부정감성 지수:  0.0
father 객관성 지수:  1.0


fabulous 긍정감성 지수:  0.875
fabulous 부정감성 지수:  0.125


In [None]:
slow = swn.senti_synset('slow.a.01')
print('father 긍정감성 지수: ', slow.pos_score())
print('father 부정감성 지수: ', slow.neg_score())
print('father 객관성 지수: ', slow.obj_score())

father 긍정감성 지수:  0.0
father 부정감성 지수:  0.0
father 객관성 지수:  1.0


In [None]:
# Penn Treebank POS 태그는 45개의 서로 다른 태그 세트
# NLTK의 PennTreebank 품사 태그를 WordNet 품사 태그로 변환하는 커스텀 함수

from nltk.corpus import wordnet as wn

# 간단한 NTLK PennTreebank Tag를 기반으로 WordNet 기반의 품사 Tag로 변환
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB

In [None]:
from nltk.stem import WordNetLemmatizer    # 원형 단어 찾기
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag    # 문장, 단어로 토큰화하기

def swn_polarity(text):
    # 감성 지수 초기화
    sentiment = 0.0
    tokens_count = 0

    lemmatizer = WordNetLemmatizer()
    raw_sentences = sent_tokenize(text)
    # 분해된 문장별로 단어 토큰 -> 품사 태깅 후에 SentiSynset 생성 -> 감성 지수 확인
    for raw_sentence in raw_sentences:
        # NTLK 기반의 품사 태깅 문장 추출
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))
        for word, tag in tagged_sentence:

            # WordNet 기반 품사 태깅과 어근 추출
            wn_tag = penn_to_wn(tag)
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            # 어근을 추출한 단어와 WordNet 기반 품사 태깅을 입력해 Synset 객체를 생성.
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # sentiwordnet의 감성 단어 분석으로 감성 synset 추출
            # 모든 단어에 대해 긍정 감성 지수는 +로 부정 감성 지수는 -로 합산해 감성 지수 계산.
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += (swn_synset.pos_score() - swn_synset.neg_score())
            tokens_count += 1

    if not tokens_count:
        return 0

    # 총 score가 0 이상일 경우 긍정(Positive) 1, 그렇지 않을 경우 부정(Negative) 0 반환
    if sentiment >= 0 :
        return 1

    return 0

In [None]:
review_df['preds'] = review_df['review'].apply(lambda x : swn_polarity(x))
y_target = review_df['sentiment'].values
preds = review_df['preds'].values

In [None]:
', 'DT'), ('number', 'NN'), ('of', 'IN'), ('films', 'NNS'), ('you', 'PRP'), ('can', 'MD'), ('find', 'VB'), ('Nicholas', 'JJ'), ('Cage', 'NNP'), ('as', 'IN'), ('a', 'DT'), ('strong', 'JJ'), ('silent', 'NN'), ('hero', 'NN'), ('Dennis', 'NNP'), ('Hopper', 'NNP'), ('as', 'IN'), ('a', 'DT'), ('homicidal', 'JJ'), ('maniac', 'NN'), ('Lara', 'NNP'), ('Flynn', 'NNP'), ('Boyle', 'NNP'), ('as', 'IN'), ('a', 'DT'), ('vamp', 'NN'), ('tramp', 'NN'), ('and', 'CC'), ('the', 'DT'), ('late', 'JJ'), ('lamented', 'VBD'), ('J', 'NNP'), ('T', 'NNP'), ('Walsh', 'NNP'), ('as', 'IN'), ('the', 'DT'), ('heavy', 'JJ'), ('These', 'DT'), ('are', 'VBP'), ('the', 'DT'), ('types', 'NNS'), ('of', 'IN'), ('roles', 'NNS'), ('these', 'DT'), ('four', 'CD'), ('can', 'MD'), ('play', 'VB'), ('in', 'IN'), ('their', 'PRP$'), ('sleep', 'NN'), ('and', 'CC'), ('they', 'PRP'), ('have', 'VBP'), ('done', 'VBN'), ('so', 'RB'), ('often', 'RB'), ('enough', 'JJ'), ('that', 'IN'), ('to', 'TO'), ('see', 'VB'), ('them', 'PRP'), ('playing', 'VBG'), ('them', 'PRP'), ('again', 'RB'), ('borders', 'NNS'), ('on', 'IN'), ('cliche', 'NN'), ('What', 'WP'), ('a', 'DT'), ('relief', 'NN'), ('therefore', 'NN'), ('that', 'IN'), ('John', 'NNP'), ('Dahl', 'NNP'), ('a', 'DT'), ('master', 'NN'), ('at', 'IN'), ('getting', 'VBG'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('mood', 'NN'), ('out', 'IN'), ('of', 'IN'), ('a', 'DT'), ('little', 'JJ'), ('action', 'NN'), ('directed', 'VBD'), ('this', 'DT'), ('nuanced', 'JJ'), ('noirish', 'JJ'), ('thriller', 'NN'), ('Hopper', 'NNP'), ('manages', 'VBZ'), ('to', 'TO'), ('keep', 'VB'), ('from', 'IN'), ('going', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('top', 'JJ'), ('Cage', 'NN'), ('shows', 'VBZ'), ('a', 'DT'), ('little', 'RB'), ('more', 'RBR'), ('depth', 'JJ'), ('than', 'IN'), ('his', 'PRP$'), ('usually', 'RB'), ('superficial', 'JJ'), ('action', 'NN'), ('heroes', 'NNS'), ('Boyle', 'NNP'), ('is', 'VBZ'), ('by', 'IN'), ('turns', 'NNS'), ('sultry', 'JJ'), ('innocent', 'JJ'), ('and', 'CC'), ('scheming', 'VBG'), ('and', 'CC'), ('one', 'CD'), ('gets', 'VBZ'), ('a', 'DT'), ('sense', 'NN'), ('of', 'IN'), ('the', 'DT'), ('hard', 'JJ'), ('iron', 'NN'), ('of', 'IN'), ('the', 'DT'), ('soul', 'NN'), ('that', 'WDT'), ('is', 'VBZ'), ('central', 'JJ'), ('to', 'TO'), ('his', 'PRP$'), ('character', 'NN'), ('Wayne', 'NNP'), ('Dahl', 'NNP'), ('s', 'JJ'), ('direction', 'NN'), ('gives', 'VBZ'), ('a', 'DT'), ('sense', 'NN'), ('of', 'IN'), ('the', 'DT'), ('emptiness', 'NN'), ('of', 'IN'), ('the', 'DT'), ('Big', 'NNP'), ('Sky', 'NNP'), ('country', 'NN'), ('where', 'WRB'), ('the', 'DT'), ('story', 'NN'), ('takes', 'VBZ'), ('place', 'NN'), ('while', 'IN'), ('also', 'RB'), ('being', 'VBG'), ('intimate', 'JJ'), ('enough', 'RB'), ('to', 'TO'), ('show', 'VB'), ('how', 'WRB'), ('a', 'DT'), ('wrinkled', 'JJ'), ('brow', 'NN'), ('can', 'MD'), ('indicate', 'VB'), ('a', 'DT'), ('radical', 'JJ'), ('change', 'NN'), ('of', 'IN'), ('plot', 'NN'), ('in', 'IN'), ('store', 'NN'), ('The', 'DT'), ('plot', 'NN'), ('twists', 'NNS'), ('are', 'VBP'), ('top', 'JJ'), ('notch', 'NN'), ('and', 'CC'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('other', 'JJ'), ('great', 'JJ'), ('twists', 'NNS'), ('in', 'IN'), ('this', 'DT'), ('movie', 'NN'), ('is', 'VBZ'), ('that', 'IN'), ('some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('supporting', 'VBG'), ('characters', 'NNS'), ('actually', 'RB'), ('act', 'VBP'), ('as', 'IN'), ('if', 'IN'), ('they', 'PRP'), ('have', 'VBP'), ('brains', 'NNS'), ('It', 'PRP'), ('isn', 'VBZ'), ('t', 'RB'), ('often', 'RB'), ('that', 'IN'), ('minor', 'JJ'), ('characters', 'NNS'), ('like', 'IN'), ('deputy', 'NN'), ('sheriffs', 'NNS'), ('have', 'VBP'), ('more', 'JJR'), ('brains', 'NNS'), ('than', 'IN'), ('their', 'PRP$'), ('headlining', 'JJ'), ('superiors', 'NNS'), ('But', 'CC'), ('with', 'IN'), ('a', 'DT'), ('director', 'NN'), ('as', 'RB'), ('smart', 'JJ'), ('as', 'IN'), ('Dahl', 'NNP'), ('you', 'PRP'), ('shouldn', 'VBP'), ('t', 'RB'), ('be', 'VB'), ('surprised', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('intelligence', 'NN'), ('of', 'IN'), ('anything', 'NN'), ('connected', 'VBN'), ('with', 'IN'), ('this', 'DT'), ('film', 'NN'), ('An', 'DT'), ('excellent', 'JJ'), ('movie', 'NN')]
[('The', 'DT'), ('One', 'CD'), ('and', 'CC'), ('only', 'RB'), ('was', 'VBD'), ('a', 'DT'), ('great', 'JJ'), ('film', 'NN'), ('I', 'PRP'), ('had', 'VBD'), ('just', 'RB'), ('finished', 'VBN'), ('viewing', 'VBG'), ('it', 'PRP'), ('on', 'IN'), ('EncoreW', 'NNP'), ('on', 'IN'), ('DirecTV', 'NNP'), ('I', 'PRP'), ('am', 'VBP'), ('an', 'DT'), ('independent', 'JJ'), ('professional', 'JJ'), ('wrestler', 'NN'), ('and', 'CC'), ('I', 'PRP'), ('thought', 'VBD'), ('this', 'DT'), ('was', 'VBD'), ('a', 'DT'), ('good', 'JJ'), ('portray', 'NN'), ('of', 'IN'), ('what', 'WP'), ('life', 'NN'), ('is', 'VBZ'), ('like', 'IN'), ('as', 'IN'), ('a', 'DT'), ('professional', 'JJ'), ('wrestler', 'NN'), ('Now', 'RB'), ('this', 'DT'), ('film', 'NN'), ('was', 'VBD'), ('made', 'VBN'), ('years', 'NNS'), ('before', 'IN'), ('I', 'PRP'), ('was', 'VBD'), ('born', 'VBN'), ('but', 'CC'), ('I', 'PRP'), ('don', 'VBP'), ('t', 'JJ'), ('think', 'VBP'), ('the', 'DT'), ('rigors', 'NNS'), ('of', 'IN'), ('professional', 'JJ'), ('wrestling', 'NN'), ('traveling', 'NN'), ('has', 'VBZ'), ('changed', 'VBN'), ('all', 'DT'), ('that', 'RB'), ('much', 'JJ'), ('Sad', 'NNP'), ('funny', 'NN'), ('and', 'CC'), ('all', 'DT'), ('around', 'RB'), ('GREAT', 'NNP')]
[('The', 'DT'), ('movie', 'NN'), ('is', 'VBZ'), ('okay', 'JJ'), ('it', 'PRP'), ('has', 'VBZ'), ('it', 'PRP'), ('s', 'JJ'), ('moments', 'VBZ'), ('the', 'DT'), ('music', 'NN'), ('scenes', 'NNS'), ('are', 'VBP'), ('the', 'DT'), ('best', 'JJS'), ('of', 'IN'), ('all', 'PDT'), ('The', 'DT'), ('soundtrack', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('true', 'JJ'), ('classic', 'JJ'), ('It', 'PRP'), ('s', 'VBZ'), ('a', 'DT'), ('perfect', 'JJ'), ('album', 'NN'), ('it', 'PRP'), ('starts', 'VBZ'), ('out', 'RP'), ('with', 'IN'), ('Let', 'NNP'), ('s', 'PRP'), ('Go', 'NNP'), ('Crazy', 'NNP'), ('appropriate', 'NN'), ('for', 'IN'), ('the', 'DT'), ('beginning', 'NN'), ('as', 'IN'), ('it', 'PRP'), ('s', 'VBZ'), ('a', 'DT'), ('great', 'JJ'), ('party', 'NN'), ('song', 'NN'), ('and', 'CC'), ('very', 'RB'), ('up', 'RB'), ('tempo', 'JJ'), ('Take', 'VB'), ('Me', 'NN'), ('With', 'IN'), ('U', 'NNP'), ('a', 'DT'), ('fun', 'NN'), ('pop', 'NN'), ('song', 'NN'), ('The', 'DT'), ('Beautiful', 'NNP'), ('Ones', 'NNP'), ('a', 'DT'), ('cheerful', 'JJ'), ('ballad', 'NN'), ('probably', 'RB'), ('the', 'DT'), ('closest', 'JJS'), ('thing', 'NN'), ('to', 'TO'), ('R', 'NNP'), ('B', 'NNP'), ('on', 'IN'), ('this', 'DT'), ('whole', 'JJ'), ('album', 'NN'), ('Computer', 'NNP'), ('Blue', 'NNP'), ('a', 'DT'), ('somewhat', 'RB'), ('angry', 'JJ'), ('anthem', 'NN'), ('towards', 'NNS'), ('Appolonia', 'NNP'), ('Darling', 'NNP'), ('Nikki', 'NNP'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('funniest', 'JJS'), ('songs', 'NNS'), ('ever', 'RB'), ('it', 'PRP'), ('very', 'RB'), ('vaguely', 'RB'), ('makes', 'VBZ'), ('fun', 'NN'), ('of', 'IN'), ('Appolonia', 'NNP'), ('When', 'WRB'), ('Doves', 'VBZ'), ('Cry', 'NNP'), ('the', 'DT'), ('climax', 'NN'), ('to', 'TO'), ('this', 'DT'), ('masterpiece', 'NN'), ('I', 'PRP'), ('Would', 'MD'), ('Die', 'VB'), ('U', 'NNP'), ('Baby', 'NNP'), ('I', 'PRP'), ('m', 'VBP'), ('A', 'DT'), ('Star', 'NNP'), ('and', 'CC'), ('of', 'IN'), ('course', 'NN'), ('Purple', 'NNP'), ('Rain', 'NNP'), ('a', 'DT'), ('true', 'JJ'), ('classic', 'NN'), ('a', 'DT'), ('very', 'RB'), ('appropriate', 'JJ'), ('ending', 'VBG'), ('for', 'IN'), ('this', 'DT'), ('classic', 'JJ'), ('album', 'VBZ'), ('The', 'DT'), ('movie', 'NN'), ('and', 'CC'), ('the', 'DT'), ('album', 'NN'), ('are', 'VBP'), ('both', 'DT'), ('very', 'RB'), ('good', 'JJ'), ('I', 'PRP'), ('highly', 'RB'), ('recommend', 'VBP'), ('them', 'PRP')]
[('It', 'PRP'), ('must', 'MD'), ('have', 'VB'), ('been', 'VBN'), ('several', 'JJ'), ('years', 'NNS'), ('after', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('released', 'VBN'), ('so', 'RB'), ('don', 'JJ'), ('t', 'NN'), ('know', 'VBP'), ('why', 'WRB'), ('it', 'PRP'), ('was', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('movies', 'NNS'), ('But', 'CC'), ('as', 'IN'), ('a', 'DT'), ('kid', 'NN'), ('I', 'PRP'), ('enjoyed', 'VBD'), ('it', 'PRP'), ('I', 'PRP'), ('just', 'RB'), ('found', 'VBD'), ('a', 'DT'), ('VHS', 'NNP'), ('tape', 'NN'), ('of', 'IN'), ('Superman', 'NNP'), ('and', 'CC'), ('the', 'DT'), ('Mole', 'NNP'), ('Men', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('flea', 'NN'), ('market', 'NN'), ('and', 'CC'), ('decided', 'VBD'), ('to', 'TO'), ('watch', 'VB'), ('it', 'PRP'), ('again', 'RB'), ('it', 'PRP'), ('s', 'VBD'), ('been', 'VBN'), ('a', 'DT'), ('lot', 'NN'), ('of', 'IN'), ('years', 'NNS'), ('I', 'PRP'), ('wasn', 'VBP'), ('t', 'JJ'), ('expecting', 'VBG'), ('much', 'RB'), ('now', 'RB'), ('knowing', 'VBG'), ('how', 'WRB'), ('the', 'DT'), ('B', 'NNP'), ('movies', 'NNS'), ('were', 'VBD'), ('made', 'VBN'), ('at', 'IN'), ('that', 'DT'), ('time', 'NN'), ('But', 'CC'), ('I', 'PRP'), ('was', 'VBD'), ('pleasantly', 'RB'), ('surprised', 'VBN'), ('to', 'TO'), ('find', 'VB'), ('the', 'DT'), ('movie', 'NN'), ('very', 'RB'), ('watchable', 'JJ'), ('and', 'CC'), ('the', 'DT'), ('acting', 'NN'), ('by', 'IN'), ('all', 'DT'), ('outstanding', 'JJ'), ('Usual', 'NNP'), ('acting', 'NN'), ('in', 'IN'), ('these', 'DT'), ('type', 'NN'), ('movies', 'NNS'), ('leaves', 'VBZ'), ('a', 'DT'), ('lot', 'NN'), ('to', 'TO'), ('be', 'VB'), ('desired', 'VBN'), ('Surprisingly', 'RB'), ('the', 'DT'), ('writing', 'NN'), ('wasn', 'NN'), ('t', 'NN'), ('bad', 'JJ'), ('either', 'CC'), ('Forget', 'VB'), ('the', 'DT'), ('fact', 'NN'), ('that', 'IN'), ('Superman', 'NNP'), ('went', 'VBD'), ('from', 'IN'), ('sequence', 'NN'), ('to', 'TO'), ('sequence', 'NN'), ('and', 'CC'), ('could', 'MD'), ('have', 'VB'), ('kicked', 'VBN'), ('all', 'DT'), ('their', 'PRP$'), ('butts', 'NN'), ('in', 'IN'), ('the', 'DT'), ('beginning', 'NN'), ('because', 'IN'), ('then', 'RB'), ('the', 'DT'), ('story', 'NN'), ('would', 'MD'), ('have', 'VB'), ('ended', 'VBN'), ('right', 'RB'), ('OK', 'IN'), ('the', 'DT'), ('mole', 'JJ'), ('men', 'NNS'), ('costumes', 'NNS'), ('were', 'VBD'), ('hokey', 'JJ'), ('and', 'CC'), ('not', 'RB'), ('very', 'RB'), ('scary', 'JJ'), ('they', 'PRP'), ('didn', 'VBP'), ('t', 'JJ'), ('even', 'RB'), ('scare', 'VB'), ('me', 'PRP'), ('as', 'IN'), ('a', 'DT'), ('kid', 'NN'), ('However', 'RB'), ('making', 'VBG'), ('allowances', 'NNS'), ('for', 'IN'), ('the', 'DT'), ('probable', 'JJ'), ('low', 'JJ'), ('budget', 'NN'), ('for', 'IN'), ('background', 'NN'), ('and', 'CC'), ('costumes', 'NNS'), ('it', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('job', 'NN'), ('well', 'RB'), ('done', 'VBN'), ('by', 'IN'), ('all', 'DT'), ('I', 'PRP'), ('recognized', 'VBD'), ('the', 'DT'), ('sheriff', 'NN'), ('right', 'RB'), ('away', 'RB'), ('as', 'IN'), ('The', 'DT'), ('Old', 'NNP'), ('Ranger', 'NNP'), ('from', 'IN'), ('Death', 'NNP'), ('Valley', 'NNP'), ('Days', 'NNP'), ('and', 'CC'), ('plenty', 'NN'), ('of', 'IN'), ('supporting', 'VBG'), ('roles', 'NNS'), ('in', 'IN'), ('TV', 'NN'), ('westerns', 'NNS'), ('J', 'NNP'), ('Farrell', 'NNP'), ('MacDonald', 'NNP'), ('played', 'VBD'), ('old', 'JJ'), ('Pop', 'NNP'), ('and', 'CC'), ('was', 'VBD'), ('always', 'RB'), ('a', 'DT'), ('great', 'JJ'), ('supporting', 'VBG'), ('actor', 'NN'), ('in', 'IN'), ('more', 'JJR'), ('movies', 'NNS'), ('than', 'IN'), ('I', 'PRP'), ('can', 'MD'), ('count', 'VB'), ('Walter', 'NNP'), ('Reed', 'NNP'), ('and', 'CC'), ('Jeff', 'NNP'), ('Corey', 'NNP'), ('were', 'VBD'), ('familiar', 'JJ'), ('faces', 'VBZ'), ('as', 'RB'), ('well', 'RB'), ('from', 'IN'), ('other', 'JJ'), ('movies', 'NNS'), ('Did', 'NNP'), ('you', 'PRP'), ('recognize', 'VB'), ('the', 'DT'), ('old', 'JJ'), ('doctor', 'NN'), ('as', 'IN'), ('the', 'DT'), ('captain', 'NN'), ('of', 'IN'), ('the', 'DT'), ('ship', 'NN'), ('that', 'WDT'), ('went', 'VBD'), ('to', 'TO'), ('get', 'VB'), ('King', 'NNP'), ('Kong', 'NNP'), ('Did', 'NNP'), ('you', 'PRP'), ('recognize', 'VB'), ('the', 'DT'), ('little', 'JJ'), ('girl', 'NN'), ('rolling', 'VBG'), ('the', 'DT'), ('ball', 'NN'), ('to', 'TO'), ('the', 'DT'), ('mole', 'JJ'), ('men', 'NNS'), ('as', 'IN'), ('Lisbeth', 'NNP'), ('Searcy', 'NNP'), ('in', 'IN'), ('Old', 'NNP'), ('Yeller', 'NNP'), ('Some', 'DT'), ('of', 'IN'), ('the', 'DT'), ('mole', 'JJ'), ('men', 'NNS'), ('were', 'VBD'), ('famous', 'JJ'), ('too', 'RB'), ('Jerry', 'NNP'), ('Maren', 'NNP'), ('has', 'VBZ'), ('played', 'VBN'), ('Mayor', 'NNP'), ('McCheese', 'NNP'), ('for', 'IN'), ('McDonalds', 'NNP'), ('Little', 'NNP'), ('Oscar', 'NNP'), ('Mayer', 'NNP'), ('was', 'VBD'), ('the', 'DT'), ('Munchkin', 'NNP'), ('that', 'WDT'), ('handed', 'VBD'), ('Dorothy', 'NNP'), ('the', 'DT'), ('lollipop', 'NN'), ('was', 'VBD'), ('on', 'IN'), ('a', 'DT'), ('Seifeld', 'NNP'), ('episode', 'NN'), ('and', 'CC'), ('a', 'DT'), ('wealth', 'NN'), ('of', 'IN'), ('other', 'JJ'), ('work', 'NN'), ('Billy', 'NNP'), ('Curtis', 'NNP'), ('played', 'VBD'), ('an', 'DT'), ('unforgettable', 'JJ'), ('part', 'NN'), ('with', 'IN'), ('Clint', 'NNP'), ('Eastwood', 'NNP'), ('in', 'IN'), ('High', 'NNP'), ('Plains', 'NNP'), ('Drifter', 'NNP'), ('was', 'VBD'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('friends', 'NNS'), ('met', 'VBN'), ('by', 'IN'), ('the', 'DT'), ('star', 'NN'), ('in', 'IN'), ('Incredible', 'JJ'), ('Shrinking', 'VBG'), ('Man', 'NN'), ('he', 'PRP'), ('had', 'VBD'), ('a', 'DT'), ('part', 'NN'), ('in', 'IN'), ('a', 'DT'), ('movie', 'NN'), ('I', 'PRP'), ('just', 'RB'), ('luckily', 'RB'), ('grabbed', 'VBN'), ('at', 'IN'), ('a', 'DT'), ('flea', 'NN'), ('market', 'NN'), ('titled', 'VBD'), ('My', 'NNP'), ('Gal', 'NNP'), ('Sal', 'NNP'), ('with', 'IN'), ('Rita', 'NNP'), ('Hayworth', 'NNP'), ('Wizard', 'NNP'), ('of', 'IN'), ('Oz', 'NNP'), ('and', 'CC'), ('plenty', 'NN'), ('of', 'IN'), ('other', 'JJ'), ('parts', 'NNS'), ('great', 'JJ'), ('actor', 'NN'), ('John', 'NNP'), ('Brambury', 'NNP'), ('was', 'VBD'), ('also', 'RB'), ('a', 'DT'), ('Munchkin', 'NNP'), ('Phillis', 'NNP'), ('Coates', 'NNP'), ('who', 'WP'), ('played', 'VBD'), ('Lois', 'NNP'), ('Lane', 'NNP'), ('in', 'IN'), ('this', 'DT'), ('movie', 'NN'), ('was', 'VBD'), ('without', 'IN'), ('question', 'NN'), ('wonderful', 'NN'), ('in', 'IN'), ('the', 'DT'), ('part', 'NN'), ('and', 'CC'), ('George', 'NNP'), ('Reeves', 'NNP'), ('as', 'IN'), ('Superman', 'NNP'), ('Clark', 'NNP'), ('Kent', 'NNP'), ('WAS', 'NNP'), ('Superman', 'NNP'), ('He', 'PRP'), ('did', 'VBD'), ('a', 'DT'), ('great', 'JJ'), ('job', 'NN'), ('of', 'IN'), ('playing', 'VBG'), ('the', 'DT'), ('strong', 'JJ'), ('man', 'NN'), ('Bottom', 'NNP'), ('line', 'NN'), ('to', 'TO'), ('all', 'DT'), ('I', 'PRP'), ('ve', 'VBP'), ('said', 'VBD'), ('is', 'VBZ'), ('that', 'IN'), ('this', 'DT'), ('movie', 'NN'), ('is', 'VBZ'), ('worth', 'JJ'), ('watching', 'VBG'), ('because', 'IN'), ('of', 'IN'), ('the', 'DT'), ('cast', 'NN'), ('and', 'CC'), ('writing', 'NN'), ('in', 'IN'), ('dealing', 'VBG'), ('with', 'IN'), ('a', 'DT'), ('pretty', 'JJ'), ('flimsy', 'JJ'), ('idea', 'NN'), ('for', 'IN'), ('a', 'DT'), ('movie', 'NN'), ('But', 'CC'), ('it', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('s', 'NN'), ('and', 'CC'), ('anything', 'NN'), ('was', 'VBD'), ('possible', 'JJ'), ('from', 'IN'), ('intruders', 'NNS'), ('from', 'IN'), ('outer', 'JJ'), ('space', 'NN'), ('to', 'TO'), ('mole', 'VB'), ('men', 'NNS'), ('from', 'IN'), ('inner', 'JJ'), ('space', 'NN'), ('It', 'PRP'), ('is', 'VBZ'), ('definitely', 'RB'), ('worth', 'JJ'), ('seeing', 'VBG'), ('there', 'EX'), ('isn', 'JJ'), ('t', 'NN'), ('a', 'DT'), ('bad', 'JJ'), ('actor', 'NN'), ('in', 'IN'), ('the', 'DT'), ('group', 'NN'), ('Whomever', 'WDT'), ('put', 'VBD'), ('the', 'DT'), ('cast', 'NN'), ('together', 'NN'), ('was', 'VBD'), ('very', 'RB'), ('very', 'RB'), ('fortunate', 'VB'), ('to', 'TO'), ('get', 'VB'), ('so', 'RB'), ('many', 'JJ'), ('gifted', 'JJ'), ('actors', 'NNS'), ('into', 'IN'), ('a', 'DT'), ('B', 'NNP'), ('type', 'NN'), ('film', 'NN'), ('Some', 'DT'), ('already', 'RB'), ('had', 'VBD'), ('a', 'DT'), ('wealth', 'NN'), ('of', 'IN'), ('experience', 'NN'), ('and', 'CC'), ('some', 'DT'), ('were', 'VBD'), ('about', 'IN'), ('to', 'TO'), ('obtain', 'VB'), ('a', 'DT'), ('wealth', 'NN'), ('of', 'IN'), ('experience', 'NN'), ('but', 'CC'), ('all', 'DT'), ('were', 'VBD'), ('gifted', 'VBN'), ('So', 'RB'), ('if', 'IN'), ('you', 'PRP'), ('get', 'VBP'), ('a', 'DT'), ('chance', 'NN'), ('to', 'TO'), ('see', 'VB'), ('the', 'DT'), ('film', 'NN'), ('forget', 'VB'), ('the', 'DT'), ('dopey', 'NN'), ('costumes', 'NNS'), ('and', 'CC'), ('just', 'RB'), ('enjoy', 'VB'), ('the', 'DT'), ('excitement', 'NN'), ('and', 'CC'), ('acting', 'NN'), ('Is', 'VBZ'), ('it', 'PRP'), ('a', 'DT'), ('bird', 'NN'), ('Is', 'VBZ'), ('it', 'PRP'), ('a', 'DT'), ('plane', 'NN'), ('No', 'NNP'), ('just', 'RB'), ('a', 'DT'), ('good', 'JJ'), ('old', 'JJ'), ('fashioned', 'VBN'), ('movie', 'NN'), ('to', 'TO'), ('enjoy', 'VB')]
[('A', 'DT'), ('very', 'RB'), ('engaging', 'JJ'), ('documentary', 'NN'), ('about', 'IN'), ('Scottish', 'JJ'), ('artist', 'NN'), ('Andy', 'NNP'), ('Goldsworthy', 'NNP'), ('whose', 'WP$'), ('work', 'NN'), ('consists', 'VBZ'), ('mostly', 'RB'), ('of', 'IN'), ('ephemeral', 'JJ'), ('sculptures', 'NNS'), ('made', 'VBN'), ('from', 'IN'), ('elements', 'NNS'), ('from', 'IN'), ('nature', 'NN'), ('His', 'PRP$'), ('work', 'NN'), ('is', 'VBZ'), ('made', 'VBN'), ('of', 'IN'), ('rocks', 'NNS'), ('leaves', 'NNS'), ('grass', 'VBP'), ('ice', 'NN'), ('etc', 'NN'), ('that', 'WDT'), ('gets', 'VBZ'), ('blown', 'RP'), ('away', 'RB'), ('when', 'WRB'), ('the', 'DT'), ('tide', 'NN'), ('arrives', 'VBZ'), ('at', 'IN'), ('the', 'DT'), ('beach', 'NN'), ('or', 'CC'), ('the', 'DT'), ('wind', 'NN'), ('blows', 'VBZ'), ('at', 'IN'), ('the', 'DT'), ('field', 'NN'), ('Thus', 'NNP'), ('most', 'JJS'), ('of', 'IN'), ('Goldsworthy', 'NNP'), ('s', 'FW'), ('works', 'VBZ'), ('don', 'FW'), ('t', 'NN'), ('really', 'RB'), ('last', 'JJ'), ('except', 'IN'), ('as', 'IN'), ('photos', 'NN'), ('or', 'CC'), ('films', 'NNS'), ('of', 'IN'), ('what', 'WP'), ('they', 'PRP'), ('were', 'VBD'), ('Now', 'RB'), ('one', 'CD'), ('can', 'MD'), ('argue', 'VB'), ('that', 'IN'), ('Goldsworthy', 'NNP'), ('s', 'NN'), ('works', 'NNS'), ('are', 'VBP'), ('a', 'DT'), ('reflection', 'NN'), ('of', 'IN'), ('mortality', 'NN'), ('or', 'CC'), ('words', 'NNS'), ('to', 'TO'), ('that', 'DT'), ('effect', 'NN'), ('but', 'CC'), ('isn', 'NN'), ('t', 'VBP'), ('it', 'PRP'), ('easier', 'JJR'), ('to', 'TO'), ('say', 'VB'), ('that', 'IN'), ('what', 'WP'), ('he', 'PRP'), ('does', 'VBZ'), ('is', 'VBZ'), ('just', 'RB'), ('beautiful', 'JJ'), ('art', 'NN'), ('And', 'CC'), ('at', 'IN'), ('a', 'DT'), ('time', 'NN'), ('when', 'WRB'), ('the', 'DT'), ('stereotype', 'NN'), ('about', 'IN'), ('artists', 'NNS'), ('is', 'VBZ'), ('that', 'IN'), ('they', 'PRP'), ('are', 'VBP'), ('mostly', 'RB'), ('bitter', 'JJ'), ('pretentious', 'JJ'), ('often', 'RB'), ('mentally', 'RB'), ('unstable', 'JJ'), ('people', 'NNS'), ('who', 'WP'), ('live', 'VBP'), ('in', 'IN'), ('decrepit', 'JJ'), ('urban', 'JJ'), ('settings', 'NNS'), ('Goldsworthy', 'NNP'), ('seems', 'VBZ'), ('to', 'TO'), ('be', 'VB'), ('the', 'DT'), ('opposite', 'JJ'), ('a', 'DT'), ('stable', 'JJ'), ('unpretentious', 'JJ'), ('family', 'NN'), ('oriented', 'VBN'), ('person', 'NN'), ('who', 'WP'), ('loves', 'VBZ'), ('nature', 'NN'), ('and', 'CC'), ('lives', 'VBZ'), ('in', 'IN'), ('a', 'DT'), ('small', 'JJ'), ('village', 'NN'), ('in', 'IN'), ('Scotland', 'NNP'), ('of', 'IN'), ('course', 'NN'), ('I', 'PRP'), ('m', 'VBP'), ('sure', 'JJ'), ('those', 'DT'), ('are', 'VBP'), ('the', 'DT'), ('same', 'JJ'), ('reasons', 'NNS'), ('why', 'WRB'), ('he', 'PRP'), ('s', 'VBD'), ('shunned', 'VBN'), ('by', 'IN'), ('some', 'DT'), ('people', 'NNS'), ('on', 'IN'), ('the', 'DT'), ('art', 'NN'), ('world', 'NN'), ('who', 'WP'), ('found', 'VBD'), ('his', 'PRP$'), ('works', 'NNS'), ('fluffy', 'RBR'), ('or', 'CC'), ('superficial', 'JJ')]

['I', 'love', 'to', 'eat', 'pizza']

In [None]:
pos_tag(word_tokenize(raw_sentence))

In [None]:
print(word_tokenize('I love you'))
pos_tag(word_tokenize('I love you'))

['I', 'love', 'you']


[('I', 'PRP'), ('love', 'VBP'), ('you', 'PRP')]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

print(confusion_matrix(y_target, preds))
print('정확도:', np.round(accuracy_score(y_target, preds), 4))
print('정밀도:', np.round(precision_score(y_target, preds), 4))
print('재현도:', np.round(recall_score(y_target, preds), 4))

[[7649 4851]
 [3578 8922]]
정확도: 0.6628
정밀도: 0.6478
재현도: 0.7138


### VADER를 이용한 감성 분석

### VADER lexicon을 이용한 Sentiment Analysis
- VADER (Valence Aware Dictionary and sEntiment Reasoner) Lexicon은 텍스트의 감성을 분석하는 데 사용되는 규칙 기반의 도구이다. 이 도구는 특히 소셜 미디어 텍스트와 같은 짧고 정보가 풍부한 텍스트를 분석하는 데 효과적이다.

- 각 단어에 대해 긍정적, 부정적, 중립적 점수를 부여한다. 더불어 문맥에 따라 감성 점수를 조정하는 휴리스틱 규칙들을 적용한다. 예를 들어, "not good"이라는 표현은 단독으로 보았을 때의 "good"보다 더 부정적인 의미를 지닌다.

- 강조의 정도도 고려한다. 예컨대, "good"보다 "very good"이 더 긍정적이며, "good!!!"은 "good"보다 강한 감정을 나타낸다.

- 주어진 텍스트에 대해 긍정, 부정, 중립, 그리고 복합 점수를 제공하며, 이 복합 점수는 전체 텍스트의 감성을 대표한다.

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

senti_analyzer = SentimentIntensityAnalyzer()
senti_scores = senti_analyzer.polarity_scores(review_df['review'][0])
print(senti_scores)

{'neg': 0.13, 'neu': 0.744, 'pos': 0.126, 'compound': -0.8278}


In [None]:
def vader_polarity(review, threshold=0.1):
    analyzer = SentimentIntensityAnalyzer()
    scores = analyzer.polarity_scores(review)

    # compound 값에 기반에 threshold 입력값보다 크면 1, 그렇지 않으면 0을 반환
    agg_score = scores['compound']
    final_sentiment = 1 if agg_score >= threshold else 0
    return final_sentiment

# apply lambda 식을 이용해 레코드벼롤 vader_polarity()를 수행하고 결과를 'vader_preds'에 저장
review_df['vader_preds'] = review_df['review'].apply(lambda x : vader_polarity(x, 0.1))
y_target = review_df['sentiment'].values
vader_preds = review_df['vader_preds'].values

print(confusion_matrix(y_target, vader_preds))
print('정확도:', np.round(accuracy_score(y_target, vader_preds),4))
print('정밀도:', np.round(precision_score(y_target, vader_preds),4))
print('재현율:', np.round(recall_score(y_target, vader_preds),4))

[[ 6819  5681]
 [ 1936 10564]]
정확도: 0.6953
정밀도: 0.6503
재현율: 0.8451


In [34]:
all_files

['/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/rooms_bestwestern_hotel_sfo.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/food_swissotel_chicago.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/battery-life_amazon_kindle.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/eyesight-issues_amazon_kindle.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/battery-life_ipod_nano_8gb.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/size_asus_netbook_1005ha.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/comfort_toyota_camry_2007.txt.data',
 '/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics/screen_garmin_nuvi_255W_gps.txt.data',
 '/content/drive/Othercomputers/내

In [77]:
import pandas as pd
import glob ,os
pd.set_option('display.max_colwidth', 700)

# 아래는 제 컴퓨터에서 압축 파일을 풀어 놓은 디렉토리이니, 여러분의 디렉토리를 설정해 주십시요
path = r'/content/drive/Othercomputers/내 컴퓨터/hmkd1/dataset/OpinosisDataset1.0/topics'
# path로 지정한 디렉토리 밑에 있는 모든 .data 파일들의 파일명을 리스트로 취합
all_files = glob.glob(os.path.join(path, "*.data"))
filename_list = []
opinion_text = []

# filename_list.append(os.path.basename(filename))
# 개별 파일들의 파일명은 filename_list 리스트로 취합,
# 개별 파일들의 파일내용은 DataFrame로딩 후 다시 string으로 변환하여 opinion_text 리스트로 취합
for file_ in all_files:
    # 개별 파일을 읽어서 DataFrame으로 생성
    df = pd.read_table(file_, encoding='latin1')   # 텍스트 파일의 내용을 테이블 형식으로 읽어와 DataFrame으로 반환함 (영어본문만 있음)
    # df = pd.read_table(file_,index_col=None, header=0, encoding='latin1')   # 텍스트 파일의 내용을 테이블 형식으로 읽어와 DataFrame으로 반환함 (영어본문만 있음)
                                                                            # 파이썬의 file 내장함수와의 혼동을 피하기 위해 변수명 file_로 함
                                                                            # index_col=None (첫 번째 컬럼이 아닌 0부터 시작하는 정수형 인덱스를 생성해줌)
                                                                            # 여기에서는 컬럼이 한 개밖에 없어서 index_col=None을 안해줘도 자동으로 정수형
                                                                            #  인덱스를 0부터 생성해줌
                                                                            # header=0도 여기에서는 의미가 없음

    # 절대경로로 주어진 file 명을 가공. 만일 Linux에서 수행시에는 아래 \\를 / 변경. 맨 마지막 .data 확장자도 제거
    # filename_ = file_.split('\\')[-1]    # local-windows
    filename_ = file_.split('/')[-1]    #colab
    filename = filename_.split('.')[0]

    #파일명 리스트와 파일내용 리스트에 파일명과 파일 내용을 추가.
    filename_list.append(filename)
    opinion_text.append(df.to_string())

# 파일명 리스트와 파일내용 리스트를  DataFrame으로 생성
document_df = pd.DataFrame({'filename':filename_list, 'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H..."
1,food_swissotel_chicago,The food for our event was delicious .\n0 ...
2,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ..."
3,eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa..."
4,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...


In [80]:
file_ = all_files[0]
pd.read_table(file_, index_col=None, encoding='latin1')
file_
print(file_.split('/')[-1])
filename_.split('.')

rooms_bestwestern_hotel_sfo.txt.data


['service_bestwestern_hotel_sfo', 'txt', 'data']

In [10]:
document_df.loc[0, 'opinion_text']

"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Great Location ,  Nice   Rooms ,  Helpless Concierge\n0                                                                                                                                                                                                                                                                                           

In [11]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english' , \
                             ngram_range=(1,2), min_df=0.05, max_df=0.85 )

#opinion_text 컬럼값으로 feature vectorization 수행
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])



In [15]:
from sklearn.cluster import KMeans

# 5개 집합으로 군집화 수행. 예제를 위해 동일한 클러스터링 결과 도출용 random_state=0
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_



In [16]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
1,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
2,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4
3,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
4,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4


In [17]:
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
1,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
8,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
10,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
13,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
16,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
20,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
21,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
24,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
25,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0


In [18]:
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
6,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
9,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
18,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
27,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
28,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
29,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
40,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
41,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
43,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1
45,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,1


In [19]:
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
5,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
7,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
15,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
22,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
26,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
33,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
34,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
38,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2
42,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,2


In [20]:
document_df[document_df['cluster_label']==3].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
3,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
11,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
17,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
19,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
23,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
36,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
39,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
46,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
47,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3
48,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,3


In [21]:
document_df[document_df['cluster_label']==4].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
2,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4
4,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4
12,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4
14,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4
32,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,headphone jack i got a clear case for it a...,4
37,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,4


In [22]:
from sklearn.cluster import KMeans

# 3개의 집합으로 군집화
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_


# 소속 클러스터를 cluster_label 컬럼으로 할당하고 cluster_label 값으로 정렬
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')



Unnamed: 0,filename,opinion_text,cluster_label
9,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
27,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
28,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
29,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
6,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
18,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
41,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
43,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
45,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0
40,/content/drive/Othercomputers/내 컴퓨터/hmkd1...,...,0


In [23]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape :',cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (3, 4611)
[[0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


In [24]:
# 군집별 top n 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명들을 반환함.
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    cluster_details = {}

    # cluster_centers array 의 값이 큰 순으로 정렬된 index 값을 반환
    # 군집 중심점(centroid)별 할당된 word 피처들의 거리값이 큰 순으로 값을 구하기 위함.
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:,::-1]

    #개별 군집별로 iteration하면서 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명 입력
    for cluster_num in range(clusters_num):
        # 개별 군집별 정보를 담을 데이터 초기화.
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster'] = cluster_num

        # cluster_centers_.argsort()[:,::-1] 로 구한 index 를 이용하여 top n 피처 단어를 구함.
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [ feature_names[ind] for ind in top_feature_indexes ]

        # top_feature_indexes를 이용해 해당 피처 단어의 중심 위치 상댓값 구함
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()

        # cluster_details 딕셔너리 객체에 개별 군집별 핵심 단어와 중심위치 상대값, 그리고 해당 파일명 입력
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_features_value'] = top_feature_values
        filenames = cluster_data[cluster_data['cluster_label'] == cluster_num]['filename']
        filenames = filenames.values.tolist()
        cluster_details[cluster_num]['filenames'] = filenames

    return cluster_details

In [25]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('####### Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
        print('Reviews 파일명 :',cluster_detail['filenames'][:7])
        print('==================================================')

In [26]:
feature_names = tfidf_vect.get_feature_names()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df,\
                                  feature_names=feature_names, clusters_num=3, top_n_features=10 )
print_cluster_details(cluster_details)

AttributeError: ignored