# 네이버 영화평 감성분석

In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Read the raw Naver movie-review train/test splits (tab-separated text files).
train_df = pd.read_csv(
    '../../../machine-learning/00.data/NaverMovie/ratings_train.txt',
    sep='\t',
)
test_df = pd.read_csv(
    '../../../machine-learning/00.data/NaverMovie/ratings_test.txt',
    sep='\t',
)
# Preview the first three training rows.
train_df.iloc[:3]

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


### 데이터 전처리

In [4]:
# Count distinct review texts (NaN excluded) to see whether duplicates exist.
train_df['document'].nunique(dropna=True)

146182

In [5]:
# Keep only the first occurrence of each review text, then report the new shape.
train_df = train_df.drop_duplicates(subset=['document'])
train_df.shape

(146183, 3)

In [6]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

id          0
document    1
label       0
dtype: int64

In [7]:
# Show the training rows whose review text is missing.
train_df[train_df['document'].isna()]

Unnamed: 0,id,document,label
25857,2172111,,1


In [8]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')
train_df.shape

(146182, 3)

In [9]:
# Preview the first three test rows.
test_df.iloc[:3]

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0


In [10]:
# Count distinct review texts (NaN excluded) in the test split.
test_df['document'].nunique(dropna=True)

49157

In [11]:
# Keep only the first occurrence of each review text, then report the new shape.
test_df = test_df.drop_duplicates(subset=['document'])
test_df.shape

(49158, 3)

In [12]:
# Discard every test row that has a missing value in any column.
test_df = test_df.dropna(axis=0, how='any')
test_df.shape

(49157, 3)

- 텍스트 전처리

In [13]:
# Keep only Hangul (compatibility jamo and syllables) and spaces; every other
# character is removed. regex=True is required: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text unchanged.
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Later, in the web app, the same pattern is applied with the `re` module.
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [14]:
# Turn reviews that became empty after cleanup into NaN so they can be dropped.
# The result is assigned back instead of using chained `inplace=True`, which
# does not update the parent frame when pandas Copy-on-Write is active.
# NOTE(review): only exactly-empty strings are matched; whitespace-only
# reviews are kept by this step.
train_df['document'] = train_df['document'].replace('', np.nan)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [15]:
# Discard the rows whose review text became NaN after cleanup.
train_df = train_df.dropna(axis=0, how='any')
train_df.shape

(145791, 3)

In [16]:
# Same cleanup as the training split: keep only Hangul and spaces.
# regex=True is required because pandas >= 2.0 defaults to literal matching.
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# Turn reviews that became empty into NaN; assign back rather than using
# chained `inplace=True`, which is a no-op on the frame under Copy-on-Write.
test_df['document'] = test_df['document'].replace('', np.nan)
test_df.isnull().sum()

id            0
document    162
label         0
dtype: int64

In [17]:
# Discard the test rows whose review text became NaN after cleanup.
test_df = test_df.dropna(axis=0, how='any')
test_df.shape

(48995, 3)

In [18]:
# Persist the cleaned splits as tab-separated files without the index column.
train_df.to_csv(
    path_or_buf='../static/data/naver/movie_train.tsv',
    sep='\t',
    index=False,
)
test_df.to_csv(
    path_or_buf='../static/data/naver/movie_test.tsv',
    sep='\t',
    index=False,
)

### 토큰화

In [19]:
from konlpy.tag import Okt

# Single Okt analyzer instance, reused by the tokenization cells below.
okt = Okt()

In [20]:
# Tokens that are filtered out after morphological analysis.
stopwords = '의 가 이 은 들 는 좀 잘 걍 과 도 를 으로 자 에 와 한 하다 을'.split()

In [21]:
# Sanity check: split one cleaned sentence into morphemes with default options.
okt.morphs('교도소 이야기구먼 솔직히 재미는 없다평점 조정')

['교도소', '이야기', '구먼', '솔직히', '재미', '는', '없다', '평점', '조정']

In [None]:
# review = re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", sentence)

In [22]:
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
from tqdm.notebook import tqdm

# For every training review: tokenize with Okt (stem=True), drop stopwords,
# and re-join the remaining tokens with single spaces so the vectorizers
# receive one whitespace-separated document per review.
X_train = [
    ' '.join(word for word in okt.morphs(sentence, stem=True) if word not in stopwords)
    for sentence in tqdm(train_df['document'])
]

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [23]:
# tqdm.tqdm_notebook is deprecated; tqdm.notebook.tqdm is its replacement.
from tqdm.notebook import tqdm

# For every test review: tokenize with Okt (stem=True), drop stopwords,
# and re-join the remaining tokens with single spaces.
X_test = [
    ' '.join(word for word in okt.morphs(sentence, stem=True) if word not in stopwords)
    for sentence in tqdm(test_df['document'])
]

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [24]:
# Label arrays for the train and test splits, taken from the `label` column.
y_train = train_df['label'].to_numpy()
y_test = test_df['label'].to_numpy()

### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import joblib

In [26]:
from sklearn.naive_bayes import MultinomialNB

- Case 1. Count Vectorizer + LogisticRegression

In [27]:
# Case 1: unigram + bigram counts feeding a logistic regression.
# NOTE(review): stop_words='english' is unlikely to match anything here,
# because the cleanup step keeps only Hangul and spaces - confirm intent.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression()),
])

In [28]:
# Grid for Case 1. Integer max_df values are absolute document counts in
# scikit-learn (floats would be proportions).
params = dict(
    count_vect__max_df=[200, 300],
    lr_clf__C=[0.5, 1, 5, 10],
)

In [29]:
# 3-fold grid search over the Count + LogisticRegression pipeline, scored by accuracy.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
# Best parameter combination and its cross-validated score.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  8.3min finished
{'count_vect__max_df': 300, 'lr_clf__C': 1} 0.803485811881392


In [30]:
# Best pipeline found by the grid search.
best_count_lr = grid_pipe.best_estimator_
# Predict the test reviews and compare against the true labels.
pred_count_lr = best_count_lr.predict(X_test)
acc_count_lr = accuracy_score(y_true=y_test, y_pred=pred_count_lr)
print(f'Count Vectorizer + LogisticRegression 평균 정확도 : {acc_count_lr:.4f}')

Count Vectorizer + LogisticRegression 평균 정확도 : 0.8024


In [31]:
# Serialize the fitted Count + LogisticRegression pipeline to disk.
joblib.dump(value=best_count_lr, filename='../static/model/naver_count_lr.pkl')

['../static/model/naver_count_lr.pkl']

- Case 3. TF-IDF Vectorizer + LogisticRegression

In [32]:
# TF-IDF weighted unigrams + bigrams feeding a logistic regression.
# NOTE(review): stop_words='english' is unlikely to match anything here,
# because the cleanup step keeps only Hangul and spaces - confirm intent.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('lr_clf', LogisticRegression()),
])

In [33]:
# Grid for TF-IDF + LogisticRegression. Integer max_df values are absolute
# document counts in scikit-learn.
params = dict(
    tfidf_vect__max_df=[100, 300, 500],
    lr_clf__C=[5, 10, 15],
)

In [34]:
# 3-fold grid search over the TF-IDF + LogisticRegression pipeline, scored by accuracy.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
# Best parameter combination and its cross-validated score.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  8.5min finished
{'lr_clf__C': 5, 'tfidf_vect__max_df': 500} 0.8169983057939105


In [35]:
# Best pipeline found by the grid search.
best_tfid_lr = grid_pipe.best_estimator_
# Predict the test reviews and compare against the true labels.
pred_tfid_lr = best_tfid_lr.predict(X_test)
acc_tfid_lr = accuracy_score(y_true=y_test, y_pred=pred_tfid_lr)
print(f'Tf - idf vectorizer + LogisticRegression 평균 정확도 : {acc_tfid_lr:.4f}')

Tf - idf vectorizer + LogisticRegression 평균 정확도 : 0.8141


In [36]:
# Serialize the fitted TF-IDF + LogisticRegression pipeline to disk.
joblib.dump(value=best_tfid_lr, filename='../static/model/naver_tfid_lr.pkl')

['../static/model/naver_tfid_lr.pkl']

- Case 2. Count Vectorizer + MultinomialNB

In [37]:
# Unigram + bigram counts feeding a multinomial naive Bayes classifier.
# NOTE(review): stop_words='english' is unlikely to match anything here,
# because the cleanup step keeps only Hangul and spaces - confirm intent.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('nb_clf', MultinomialNB()),
])

In [38]:
# Grid for Count + MultinomialNB: only the vectorizer's max_df is searched.
params = dict(count_vect__max_df=[200, 300])

In [39]:
# 3-fold grid search over the Count + MultinomialNB pipeline, scored by accuracy.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
# Best parameter combination and its cross-validated score.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   46.7s finished
{'count_vect__max_df': 300} 0.8154344232497205


In [40]:
# Best pipeline found by the grid search (Count Vectorizer + MultinomialNB).
best_count_nb = grid_pipe.best_estimator_
# Predict the test reviews and compare against the true labels.
pred_count_nb = best_count_nb.predict(X_test)
acc_count_nb = accuracy_score(y_test, pred_count_nb)
# The label names the model actually evaluated here; it previously said
# "LogisticRegression", copied over from the Case 1 cell.
print(f'Count Vectorizer + MultinomialNB 평균 정확도 : {acc_count_nb:.4f}')

Count Vectorizer + LogisticRegression 평균 정확도 : 0.8152


In [41]:
# Serialize the fitted Count + MultinomialNB pipeline to disk.
joblib.dump(value=best_count_nb, filename='../static/model/naver_count_nb.pkl')

['../static/model/naver_count_nb.pkl']

- Case 4. TF-IDF Vectorizer + MultinomialNB

In [42]:
# Case 4: TF-IDF weighted unigrams + bigrams feeding multinomial naive Bayes.
# NOTE(review): stop_words='english' is unlikely to match anything here,
# because the cleanup step keeps only Hangul and spaces - confirm intent.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('nb_clf', MultinomialNB()),
])

In [43]:
# Grid for Case 4: only the vectorizer's max_df is searched.
params = dict(tfidf_vect__max_df=[100, 300, 500])

In [44]:
# 3-fold grid search over the TF-IDF + MultinomialNB pipeline, scored by accuracy.
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_pipe.fit(X_train, y_train)
# Best parameter combination and its cross-validated score.
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min finished
{'tfidf_vect__max_df': 500} 0.8250509290696956


In [45]:
# Best pipeline found by the grid search (TF-IDF vectorizer + MultinomialNB).
best_tfid_nb = grid_pipe.best_estimator_
# Predict the test reviews and compare against the true labels.
pred_tfid_nb = best_tfid_nb.predict(X_test)
acc_tfid_nb = accuracy_score(y_test, pred_tfid_nb)
# The label names the model actually evaluated here; it previously said
# "Count Vectorizer + LogisticRegression", copied over from the Case 1 cell.
print(f'Tf - idf vectorizer + MultinomialNB 평균 정확도 : {acc_tfid_nb:.4f}')

Count Vectorizer + LogisticRegression 평균 정확도 : 0.8236


In [46]:
# Serialize the fitted TF-IDF + MultinomialNB pipeline to disk.
joblib.dump(value=best_tfid_nb, filename='../static/model/naver_tfid_nb.pkl')

['../static/model/naver_tfid_nb.pkl']

## test

In [125]:
# Index label of the saved test row used for the spot check below.
index = 25

In [126]:
# Reload the cleaned test split that was saved earlier (default integer index).
df_test = pd.read_csv(
    '../static/data/naver/movie_test.tsv',
    sep='\t',
)
# Show the last three rows.
df_test.iloc[-3:]

Unnamed: 0,id,document,label
48992,9072549,그림도 좋고 완성도도 높았지만 보는 내내 불안하게 만든다,0
48993,5802125,절대 봐서는 안 될 영화 재미도 없고 기분만 잡치고 한 세트장에서 다 해먹네,0
48994,6070594,마무리는 또 왜이래,0


In [127]:
# Pull the review text and its true label for the chosen index label.
review = df_test.loc[index, 'document']
label = df_test.loc[index, 'label']
review, label

('번은 봤네요어쩜 이렇게 잘만들었을까', 1)

In [112]:
# label = df_test.label[index]
# label

1

In [115]:
# test_data = []
# test_data.append(df_test.document[index])
# test_data

['번은 봤네요어쩜 이렇게 잘만들었을까']

In [128]:
# Apply the training-time tokenization to the single review: Okt morphemes
# with stemming, stopwords removed, tokens re-joined with spaces.
kept_tokens = [tok for tok in okt.morphs(review, stem=True) if tok not in stopwords]
test_data = [' '.join(kept_tokens)]
test_data

['번은 보다 어쩜 이렇게 만들다']

In [55]:
# Reload the Count Vectorizer + LogisticRegression pipeline saved earlier in
# this notebook. joblib.load unpickles the file, so only load trusted files.
naver_count_lr = joblib.load('../static/model/naver_count_lr.pkl')

In [129]:
# Predict the label of the single preprocessed review with the reloaded pipeline.
pred = naver_count_lr.predict(test_data)

In [130]:
# Show the predicted label next to the true label.
pred[0], label

(1, 1)

In [84]:
# Bundle the spot-check result; the prediction is stored as a string.
result_dict = dict(index=index, label=label, pred=f'{pred[0]}')
result_dict

{'index': 5000, 'label': 1, 'pred': '0'}

### 리뷰입력

In [120]:
# Free-text review used to try the model on raw, uncleaned input.
review = """ 내 대부분의 시간을 보내는 이곳에서의 일이 좀 의미가 있었으면 좋겠어요."새해에는 저도 이렇게 살랍니다.영화관에서 보고싶었는데 못갔어요 넷플로 바로 봤습니다. 좋은시간이였어요. """

In [121]:
# Strip everything except Hangul and spaces, the same pattern used on the
# training data, then tokenize, drop stopwords and re-join with spaces.
review = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]").sub("", review)
kept_tokens = [tok for tok in okt.morphs(review, stem=True) if tok not in stopwords]
test_data = [' '.join(kept_tokens)]
test_data

['내 대부분 시간 보내다 곳 에서의 일이 의미 있다 좋다 새해 에는 저 이렇게 살다 영화관 에서 보다 못 가다 넷플 로 바로 보다 좋다 시간 이다']

In [122]:
# Predict the label of the preprocessed free-text review.
pred = naver_count_lr.predict(test_data)

In [123]:
# Predicted label for the single input review.
pred[0]

1

In [124]:
# Wrap the prediction, as a string, in a dict.
result_dict = dict(pred=f'{pred[0]}')
result_dict

{'pred': '1'}