### **Install dependency**

In [1]:
! pip install konlpy



<center><h2><b>Imports</b></h2></center>

In [2]:
import json
import os

import pandas as pd
import numpy as np

import nltk
from konlpy.tag import Okt


from keras import losses
from keras import metrics
from keras import models
from keras import layers

from tensorflow.keras import optimizers

<center><h2><b>Loading Data</b></h2></center>

In [3]:
def open_txt(filename):
    """Read a UTF-8, tab-separated text file into a list of rows.

    The first line is treated as a header and dropped; every remaining line
    is split on tab characters.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of list of str
        One list of fields per non-header line.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    rows = []
    for line in lines[1:]:  # lines[0] is the header row
        rows.append(line.split('\t'))
    return rows

In [4]:
# Train/test samples: tab-separated text files read by open_txt, which drops the header row.
train_data = open_txt('./data/train_sample.txt')
test_data = open_txt('./data/test_sample.txt')

# Separate cp949-encoded CSV; its 'review' column is used for the sample predictions at the end.
df = pd.read_csv('./data/df_notk.csv', encoding = 'cp949')

<center><h2><b>Preprocessing</b></h2></center>

In [5]:
# One shared KoNLPy Okt tagger instance, reused by tokenizing() for every sentence.
okt = Okt()

In [6]:
# Tokenize one sentence, then join each token's text and its POS tag with a "/" separator.
def tokenizing(docs):
    """Return the ``text/POS`` tokens of one sentence.

    The sentence is tagged with ``okt.pos`` (normalization and stemming
    enabled) and every tagged item is joined into a single string using "/".
    """
    tagged = okt.pos(docs, norm=True, stem=True)
    return list(map('/'.join, tagged))

In [7]:
# parse to list
def _tokenize_rows(rows):
    """Convert raw rows into ``[tokens, label]`` pairs.

    Each row is read as: review text at index 1, label at index 2. Rows that
    cannot be processed (for example a row with missing columns, or text the
    tokenizer fails on) are skipped on a best-effort basis, but the number of
    skipped rows is reported instead of being hidden.

    Parameters
    ----------
    rows : list
        Raw rows as returned by ``open_txt``.

    Returns
    -------
    list
        One ``[tokens, label]`` pair per successfully processed row.
    """
    parsed, skipped = [], 0
    for row in rows:
        try:
            parsed.append([tokenizing(row[1]), row[2]])
        except Exception:
            # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
            # propagate so a long tokenization run can still be interrupted.
            skipped += 1
    if skipped:
        print('skipped {} of {} rows'.format(skipped, len(rows)))
    return parsed


train_list = _tokenize_rows(train_data)
test_list = _tokenize_rows(test_data)

In [8]:
# Flatten the token lists of all training reviews into one token stream.
tokens = [token for doc_tokens, _label in train_list for token in doc_tokens]

text = nltk.Text(tokens, name='NMSC')
# vocab().most_common(10): the 10 most frequent tokens
# (not displayed, because this is not the last expression of the cell).
text.vocab().most_common(10)

# Vocabulary used for the features: up to the 10000 most frequent tokens, most frequent first.
selected_words = [word for word, _count in text.vocab().most_common(10000)]

In [9]:
def term_frequency(doc):
    """Return how often each word of ``selected_words`` occurs in ``doc``.

    Parameters
    ----------
    doc : sequence
        Tokens of one document; the notebook passes lists of token strings.

    Returns
    -------
    list of int
        One count per entry of ``selected_words``, in the same order.
    """
    from collections import Counter  # local import keeps this cell self-contained

    if isinstance(doc, str):
        # str.count counts substrings, which a Counter cannot reproduce,
        # so plain strings keep the original per-word scan.
        return [doc.count(word) for word in selected_words]

    # Count the document once and look every vocabulary word up, instead of
    # rescanning the whole document for each vocabulary word.
    counts = Counter(doc)
    return [counts[word] for word in selected_words]

In [10]:
# Bag-of-words features: for every review, count how often each of the
# selected vocabulary words (up to 10000) occurs in its token list.
train_x = [term_frequency(doc_tokens) for doc_tokens, _label in train_list]
test_x = [term_frequency(doc_tokens) for doc_tokens, _label in test_list]

# Labels are the second element of each [tokens, label] pair.
train_y = [pair[1] for pair in train_list]
test_y = [pair[1] for pair in test_list]

<center><h2><b>Data Preparation</b></h2></center>

In [11]:
# Turn the Python feature lists into float32 NumPy arrays for the model.
x_train, x_test = (
    np.asarray(rows).astype('float32') for rows in (train_x, test_x)
)

# The labels go through the same conversion.
y_train, y_test = (
    np.asarray(labels).astype('float32') for labels in (train_y, test_y)
)

<center><h2><b>Modeling</b></h2></center>

In [12]:
# ANN: term-frequency vector -> two 64-unit ReLU layers -> single sigmoid output.

# Take the input width from the data instead of hard-coding 10000: the
# vocabulary is shorter than 10000 when the corpus has fewer distinct tokens,
# and a hard-coded width would then not match the feature vectors.
n_features = x_train.shape[1]

model = models.Sequential()

model.add(layers.Dense(64, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

model.summary()

# train; keep the History object so the per-epoch metrics remain accessible
history = model.fit(x_train, y_train, epochs=10, batch_size=512)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                640064    
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 644,289
Trainable params: 644,289
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b7645fe9e8>

<center><h2><b>Evaluation</b></h2></center>

In [13]:
# Score the held-out test set; the first two entries of the result are
# printed as the loss and the accuracy.
results = model.evaluate(x_test, y_test)

test_loss, test_accuracy = results[0], results[1]
print('Loss : {}, Accuracy : {}'.format(test_loss, test_accuracy))

Loss : 0.6513171195983887, Accuracy : 0.8263999819755554


In [14]:
# predict samples
def input_text(text):
    """Print the predicted sentiment of one raw review.

    The review is tokenized, converted into a term-frequency vector and
    scored by the trained ``model``. A score above 0.5 is printed with the
    "긍정" label and the score as a percentage; otherwise the "부정" label is
    printed with ``1 - score`` as a percentage.

    Parameters
    ----------
    text : str
        Raw review sentence.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    tokens = tokenizing(text)
    counts = term_frequency(tokens)

    # Add a leading batch axis: the model is fed a batch containing one vector.
    data = np.expand_dims(np.asarray(counts).astype('float32'), axis=0)

    # Predict the result for the new sample. The prediction for a batch of one
    # with a single output unit is a (1, 1) array; pick the element explicitly,
    # because float() on an array with ndim > 0 is deprecated in recent NumPy.
    # verbose=0 keeps the progress bar out of the printed results.
    score = float(model.predict(data, verbose=0)[0][0])
    if score > 0.5:
        print("{} : 긍정 [{:.2f}%] \n".format(text, score * 100))
    else:
        print("{} : 부정 [{:.2f}%] \n".format(text, (1 - score) * 100))

In [15]:
# Score the first five reviews. head() selects by position, so this also works
# when the frame has a non-default index or fewer than five rows, where the
# label lookup df['review'][i] would raise.
for review in df['review'].head(5):
    input_text(review)

판타지를 넘어 영화 역사에 남을 명작이다. 내가 이걸 왜 극장에서 못봤을까.... 폰으로 봤을때의 감동과는 비교가 안될텐데.... : 부정 [56.61%] 

이집트여행하는느낌의 영화 : 긍정 [98.09%] 

목포 연설 장면은 넋을 잃고 보게 된다. : 긍정 [98.38%] 

그저 그렇네요. 뻔한 반전과 결말. : 부정 [99.17%] 

더 배트맨 조커처럼 몰입감이 있으면 추천 반대로 지루하면 비추천 : 부정 [92.08%] 

