# 네이버 영화평 감성분석 - TfidfVectorizer

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated NaverMovie train and test splits into DataFrames.
train_df = pd.read_csv(
    '../00.data/NaverMovie/train.tsv',
    sep='\t',
)
test_df = pd.read_csv(
    '../00.data/NaverMovie/test.tsv',
    sep='\t',
)


### Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt

# A single Okt instance is created once here and shared by every tokenizer call.
okt = Okt()


def tw_tokenizer(text):
    """Tokenize ``text`` by delegating to ``okt.morphs``.

    Args:
        text: The document to tokenize.

    Returns:
        Whatever ``okt.morphs(text)`` returns, unmodified.
    """
    return okt.morphs(text)

### TfidfVectorizer로 학습/변환

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams of the tokens produced by
# tw_tokenizer.
tvecter = TfidfVectorizer(
    tokenizer=tw_tokenizer,
    # token_pattern is not used when a custom tokenizer is supplied; setting
    # it to None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning at fit time.
    token_pattern=None,
    ngram_range=(1, 2),
    min_df=3,    # ignore terms that occur in fewer than 3 documents
    max_df=0.9,  # ignore terms that occur in more than 90% of documents
)

In [5]:
%time tvecter.fit(train_df.document)

Wall time: 3min 52s


TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x0000029B9810A4C0>)

In [6]:
# Vectorize the training documents with the already-fitted vectorizer.
# NOTE(review): calling fit() in the previous cell and transform() here runs
# the tokenizer over the training corpus twice (both cells take minutes).
# TfidfVectorizer.fit_transform would presumably do it in one pass —
# consider merging the two cells.
%time X_train_tvect = tvecter.transform(train_df['document'])

Wall time: 4min 11s


In [7]:
# Vectorize the test documents with the same fitted vectorizer; only
# transform() is called, so the vectorizer is not re-fitted on test data.
%time X_test_tvect = tvecter.transform(test_df['document'])

Wall time: 1min 21s


In [8]:
# Extract the label column of each split as an array for scikit-learn.
y_train = train_df['label'].values
y_test = test_df['label'].values

### LogisticRegression 으로 학습/예측/평가

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
# Train logistic regression (C=3.5) on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_clf = LogisticRegression(C=3.5).fit(X_train_tvect, y_train)

# Predict on the test matrix; the accuracy is the cell's displayed value.
pred = lr_clf.predict(X_test_tvect)
accuracy_score(y_true=y_test, y_pred=pred)

0.8590672517603837

### 실제 테스트

In [11]:
# Two hand-written reviews used to try the trained model on unseen text.
review1, review2 = (
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
)

In [12]:
import re

# Remove every character that is not a Hangul jamo, a Hangul syllable or a
# space, then vectorize the cleaned review and classify it.
review = re.sub(pattern="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", repl="", string=review1)
review_tvect = tvecter.transform([review])
pred = lr_clf.predict(review_tvect)

# Predicted label of the single review (the cell's displayed value).
pred[0]

0

In [13]:
# Same steps for the second review: keep only Hangul jamo, Hangul syllables
# and spaces, vectorize, classify.
review = re.sub(pattern="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", repl="", string=review2)
review_tvect = tvecter.transform([review])
pred = lr_clf.predict(review_tvect)

# Predicted label of the single review (the cell's displayed value).
pred[0]

1

In [14]:
# The same two sample reviews, collected in a list so they can be cleaned and
# classified as one batch.
reviews = [
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
]

In [15]:
# Clean every review in the batch: drop all characters except Hangul jamo,
# Hangul syllables and spaces. The result replaces the original list.
reviews = [re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", text) for text in reviews]
print(reviews)

['진짜 개노잼이다 편이랑 같은 감독맞나러닝타임도 길어서 개지루함 ㄹㅇ', '이런 사랑영화가 다시 나올 수 있을까']


In [16]:
# Vectorize the whole cleaned batch with the fitted vectorizer, then classify
# it with the logistic-regression model.
review_tvect = tvecter.transform(reviews)
pred = lr_clf.predict(review_tvect)
# Show the predicted labels of the first and second review as a tuple.
pred[0], pred[1]

(0, 1)

### 최적 하이퍼 파라미터 도출

In [17]:
from sklearn.model_selection import GridSearchCV

# Candidate values of LogisticRegression's C parameter to compare.
params = {
    'C': [1, 3, 3.5, 5, 10]
}

# The base estimator is passed anonymously instead of being bound to
# `lr_clf`: GridSearchCV works on clones of it, so rebinding `lr_clf` here
# would leave that name pointing at a never-fitted model and break any re-run
# of the earlier prediction cells.
grid_cv = GridSearchCV(LogisticRegression(), param_grid=params,
                       scoring='accuracy', cv=3, verbose=1)
grid_cv.fit(X_train_tvect, y_train)

# Best parameter combination and its mean cross-validated accuracy.
print(grid_cv.best_params_, grid_cv.best_score_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   32.1s finished
{'C': 3.5} 0.8551282315094896


In [18]:
# Predict on the test matrix through the fitted grid-search object and report
# test accuracy to four decimal places.
# NOTE(review): this relies on GridSearchCV's default refit behaviour, i.e.
# predict() using the best estimator retrained on all training data — confirm.
pred = grid_cv.predict(X_test_tvect)
acc = accuracy_score(y_test, pred)
print(f'Tfidf Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Tfidf Vectorizer + Logistic Regression 정확도: 0.8591
