# 네이버 영화리뷰 평점 분석

## CountVectorizer

In [3]:
import codecs
# Load the tab-separated training file: one review per line, with the
# fields number, review text and rating.
# Built-in open() is used instead of codecs.open(), which it supersedes
# (codecs.open() is deprecated as of Python 3.14).
with open("ratings_train.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]

# Drop the header row once the file has been closed.
data = data[1:]

- 이 데이터는 번호, 내용, 평점으로 이루어져 있으므로 내용을 X, 평점을 y로 저장한다.

- 평점은 0 / 1 (bad / good)

- pprint : pretty print / 예쁘게 프린트

In [4]:
from pprint import pprint
# Pretty-print the first parsed row to confirm the field layout
# (number, review text, rating).
pprint(data[0])

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']


In [None]:
# Transpose the rows into columns and display column 1, i.e. the second
# field (the review text) of every row.
list(zip(*data))[1]

In [10]:
# NumPy is not imported in any visible cell; importing it here keeps this
# cell runnable on its own (a repeated import is a no-op).
import numpy as np

# Transpose the rows once so every field becomes its own column.
train_columns = list(zip(*data))

# Column 1 holds the review text, column 2 the rating label.
X = train_columns[1]
# np.int was only an alias for the built-in int and was removed in
# NumPy 1.24, so the built-in type is passed as the dtype instead.
y = np.array(train_columns[2], dtype=int)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [14]:
# Baseline model: CountVectorizer features (default settings) fed into a
# multinomial naive Bayes classifier.
count_nb_steps = [
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
]
model1 = Pipeline(steps=count_nb_steps)

In [15]:
%%time
# Fit the baseline pipeline on the training reviews (X) and their labels (y);
# the enclosing %%time cell magic reports how long the fit takes.
model1.fit(X,y)

Wall time: 6.45 s


Pipeline(steps=[('vect', CountVectorizer()), ('mb', MultinomialNB())])

In [17]:
import codecs
# Load the tab-separated test file, parsed the same way as the training
# file: each line is split on tabs into its fields.
# Built-in open() is used instead of codecs.open(), which it supersedes
# (codecs.open() is deprecated as of Python 3.14).
with open('ratings_test.txt', encoding='utf-8') as f:
    data_test = [line.split('\t') for line in f.read().splitlines()]

# Drop the header row once the file has been closed.
data_test = data_test[1:]

In [20]:
# Transpose the test rows once so every field becomes its own column.
test_columns = list(zip(*data_test))

# Column 1 holds the review text, column 2 the rating label.
X_test = test_columns[1]
# np.int was only an alias for the built-in int and was removed in
# NumPy 1.24, so the built-in type is passed as the dtype instead.
y_test = np.array(test_columns[2], dtype=int)

In [24]:
# Score the baseline model on the held-out test reviews and print the
# per-class precision / recall / F1 summary.
model1_predictions = model1.predict(X_test)
print(classification_report(y_test, model1_predictions))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.82     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



- 모델 테스트

In [29]:
# Spot-check the baseline model on three single-word inputs; the cell
# displays one prediction array per word, in order.
tuple(model1.predict([word]) for word in ('짱이다', '병신', '너무'))

(array([1]), array([0]), array([1]))

In [32]:
# Spot-check three more single-word inputs; the cell displays one
# prediction array per word, in order.
tuple(model1.predict([word]) for word in ('명작', '까다롭다', '구리다'))

(array([1]), array([0]), array([0]))

## TfidfVectorizer

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighted features (default settings) fed into a multinomial
# naive Bayes classifier.
tfidf_nb_steps = [
    ('vec', TfidfVectorizer()),
    ('mb', MultinomialNB()),
]
model2 = Pipeline(steps=tfidf_nb_steps)

# Fit on the training reviews; the fitted pipeline is the cell's output.
model2.fit(X, y)

Pipeline(steps=[('vec', TfidfVectorizer()), ('mb', MultinomialNB())])

In [36]:
# Score the TF-IDF model on the held-out test reviews and print the
# per-class precision / recall / F1 summary.
model2_predictions = model2.predict(X_test)
print(classification_report(y_test, model2_predictions))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.83     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



## 형태소분석기 사용

In [38]:
from konlpy.tag import Okt

# One shared tagger instance, reused by every tokenize_pos call.
pos_tagger = Okt()


def tokenize_pos(doc):
    """Tokenize *doc* into '/'-joined strings via the shared Okt tagger.

    Each item returned by ``pos_tagger.pos(doc)`` (a word/tag pair,
    per the KoNLPy documentation) is joined with '/' into one token.

    Returns:
        A list with one joined string per tagged item, in tagger order.
    """
    tagged_items = pos_tagger.pos(doc)
    return list(map('/'.join, tagged_items))

In [41]:
# Count features built from the tokens produced by tokenize_pos rather than
# the vectorizer's default tokenization, fed into multinomial naive Bayes.
pos_count_vectorizer = CountVectorizer(tokenizer=tokenize_pos)
pos_count_classifier = MultinomialNB()
model3 = Pipeline(steps=[
    ('vect', pos_count_vectorizer),
    ('mb', pos_count_classifier),
])

# Fit on the training reviews; the fitted pipeline is the cell's output.
model3.fit(X, y)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize_pos at 0x0000016F9E15CC10>)),
                ('mb', MultinomialNB())])

In [43]:
# Score the morpheme-token model on the held-out test reviews and print
# the per-class precision / recall / F1 summary.
model3_predictions = model3.predict(X_test)
print(classification_report(y_test, model3_predictions))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85     24827
           1       0.86      0.85      0.85     25173

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



## (1,2)gram 사용

In [45]:
# TF-IDF features over unigrams and bigrams of the tokens produced by
# tokenize_pos, fed into a multinomial naive Bayes classifier.
pos_ngram_steps = [
    ('vect', TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
]
model4 = Pipeline(steps=pos_ngram_steps)

In [46]:
# Fit the (1,2)-gram TF-IDF pipeline on the training reviews (X) and their
# labels (y); the fitted pipeline is the cell's output.
model4.fit(X,y)

Pipeline(steps=[('vect',
                 TfidfVectorizer(ngram_range=(1, 2),
                                 tokenizer=<function tokenize_pos at 0x0000016F9E15CC10>)),
                ('mb', MultinomialNB())])

In [47]:
# Score the (1,2)-gram model on the held-out test reviews and print the
# per-class precision / recall / F1 summary.
model4_predictions = model4.predict(X_test)
print(classification_report(y_test, model4_predictions))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     24827
           1       0.87      0.86      0.87     25173

    accuracy                           0.87     50000
   macro avg       0.87      0.87      0.87     50000
weighted avg       0.87      0.87      0.87     50000

