# 감성분석

codecs 패키지는 utf-8로 인코딩된 바이트를 유니코드 문자열로 변환하는 과정을 스트리밍 방식으로 수행한다. 따라서 메모리(RAM) 부족 문제와 처리 시간 문제를 줄이기 위해 사용한다.

In [61]:
import codecs

# Decode the training file as UTF-8, split every row on tabs,
# and drop the first row (the header).
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    data = [row.split('\t') for row in f.read().splitlines()[1:]]

해당 데이터는 번호, 내용, 평점으로 구성됨

In [63]:
# 데이터 확인

data[0]

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']

내용을 X, 평점을 y로 지정

In [87]:
import numpy as np  # np is used below but is not imported in any visible cell

# Transpose the rows once into per-column tuples instead of rebuilding the
# transposed list for every column that is read.
columns = list(zip(*data))
X = columns[1]                       # review text (2nd field of each row)
y = np.array(columns[2], dtype=int)  # rating labels (3rd field), parsed as int

In [73]:
X[0], y[0]

('아 더빙.. 진짜 짜증나네요 목소리', 0)

# 다항 NB 모델 학습

## CountVectorizer

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Pipeline setup: raw token counts feeding a multinomial naive Bayes classifier.
model1 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

In [76]:
# 모델 학습

%%time
model1.fit(X,y)

Wall time: 2.16 s


Pipeline(steps=[('vect', CountVectorizer()), ('mb', MultinomialNB())])

In [77]:
# Load the test text data.

import codecs

# Decode the test file as UTF-8, split every row on tabs,
# and drop the first row (the header).
with codecs.open("ratings_test.txt", encoding='utf-8') as f:
    data_test = [row.split('\t') for row in f.read().splitlines()[1:]]

In [79]:
# Evaluate model1 on the test data.

# Transpose the rows once and reuse the result for both columns instead of
# rebuilding the transposed list twice.
test_columns = list(zip(*data_test))
X_test = test_columns[1]                       # review text (2nd field)
y_test = np.array(test_columns[2], dtype=int)  # rating labels (3rd field)

print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.82     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [84]:
# Check the predictions on a few hand-written reviews.

new_datas = [
    '이게 무슨 영화냐 시간아깝다',
    '진짜 감동이다',
    '완전 비추입니다 보지마세요',
    '인생영화 무조건 강추입니다',
    '장난하냐 돈아깝다',
]

model1.predict(new_datas)

array([0, 1, 0, 1, 0])

## TfidfVectorizer

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features feeding a multinomial naive Bayes classifier.
model2 = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('mb', MultinomialNB()),
])

In [90]:
%%time

# Fit the TF-IDF pipeline on the training texts X and labels y;
# the %%time cell magic above reports how long the cell takes to run.
model2.fit(X, y)

Wall time: 2.31 s


Pipeline(steps=[('vect', TfidfVectorizer()), ('mb', MultinomialNB())])

In [91]:
# Performance evaluation: per-class precision / recall / f1 of model2 on the test set.

print(classification_report(y_true=y_test, y_pred=model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.83     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [92]:
# Prediction example: classify the sample reviews in new_datas with model2.

model2.predict(new_datas)

array([0, 1, 0, 1, 0])

## 형태소분석기 사용

In [93]:
from konlpy.tag import Okt
# One shared Okt tagger instance, reused by every tokenize_pos call.
pos_tagger = Okt()


def tokenize_pos(doc):
    """Return the tokens of ``doc`` as '/'-joined strings.

    Each item produced by ``pos_tagger.pos(doc)`` is joined with '/'
    (presumably a (morpheme, tag) pair -> 'morpheme/tag'; verify against
    the konlpy documentation).
    """
    tagged = pos_tagger.pos(doc)
    return list(map('/'.join, tagged))

In [94]:
# Count-based features built from tokenize_pos tokens instead of the
# vectorizer's default tokenization, feeding a multinomial naive Bayes classifier.
model3 = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [95]:
%%time
# Fit the tokenize_pos-based count pipeline on the training data;
# the %%time cell magic above reports how long the cell takes to run.
model3.fit(X, y)

Wall time: 3min 15s


Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize_pos at 0x000001D0FB93ADC8>)),
                ('mb', MultinomialNB())])

In [96]:
# Per-class precision / recall / f1 of model3 on the test set.
print(classification_report(y_true=y_test, y_pred=model3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85     24827
           1       0.86      0.85      0.85     25173

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



## 바이그램 사용

In [97]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)) of
# tokenize_pos tokens, feeding a multinomial naive Bayes classifier.
model4 = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [98]:
%%time
# Fit the unigram+bigram TF-IDF pipeline on the training data;
# the %%time cell magic above reports how long the cell takes to run.
model4.fit(X, y)

Wall time: 3min 40s


Pipeline(steps=[('vect',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function tokenize_pos at 0x000001D0FB93ADC8>)),
                ('mb', MultinomialNB())])

In [99]:
# Per-class precision / recall / f1 of model4 on the test set.
print(classification_report(y_true=y_test, y_pred=model4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     24827
           1       0.87      0.86      0.87     25173

    accuracy                           0.87     50000
   macro avg       0.87      0.87      0.87     50000
weighted avg       0.87      0.87      0.87     50000

