In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk
!pip3 install konlpy

In [None]:
# Download the NSMC train/test files into the current working directory.
# The standard-library urllib.request replaces the six.moves shim, so this
# cell no longer needs the third-party `six` package.
import urllib.request

NSMC_BASE_URL = "https://raw.githubusercontent.com/e9t/nsmc/master"
for nsmc_file in ("ratings_train.txt", "ratings_test.txt"):
    urllib.request.urlretrieve(f"{NSMC_BASE_URL}/{nsmc_file}", filename=nsmc_file)

In [None]:
import pandas as pd

# Read the tab-separated rating files from the same relative paths they were
# downloaded to, so loading does not depend on the working directory being
# /content.
train_data = pd.read_table('ratings_train.txt')
test_data = pd.read_table('ratings_test.txt')

In [None]:
# Preview the raw training DataFrame.
train_data

In [None]:
# Preview the raw test DataFrame.
test_data

In [None]:
# Pull the raw review text column ('document') out of each split for cleaning.
train_lst = train_data['document']
test_lst = test_data['document']

In [None]:
# Preview the raw training documents.
train_lst

In [None]:
# Preview the raw test documents.
test_lst

In [None]:
# Column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
import re
# Characters to strip: anything that is NOT an ASCII letter, a Hangul jamo
# (ㄱ-ㅣ), a Hangul syllable (가-힣), a digit, or a space.
sub_rext = '[^a-zA-Zㄱ-ㅣ가-힣0-9 ]'

In [None]:
import re

def clean_text(texts, pattern=None):
    """Replace every character matched by ``pattern`` with a space in each text.

    Args:
        texts: Iterable of raw documents (list, pandas Series, ...). Items are
            visited in iteration order, so a Series whose index is not
            0..n-1 is handled the same way as a plain list.
        pattern: Regular expression describing the characters to blank out.
            Defaults to the notebook-level ``sub_rext`` pattern.

    Returns:
        list[str]: One cleaned string per input item, in input order. Missing
        values (``None`` or float NaN) become ``''`` rather than the literal
        text ``'None'`` / ``'nan'``.
    """
    if pattern is None:
        pattern = sub_rext
    corpus = []
    for text in texts:
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        corpus.append('' if is_missing else re.sub(pattern, ' ', str(text)))
    return corpus

In [None]:
# Clean the training documents; show only a few entries, since the full list
# holds one string per review and would flood the cell output.
re_train = clean_text(train_lst)
re_train[:5]

In [None]:
# Clean the test documents; show only a few entries, since the full list
# holds one string per review and would flood the cell output.
re_test = clean_text(test_lst)
re_test[:5]

In [None]:
df_train = pd.DataFrame(re_train,columns = ['review'])
df_train

In [None]:
df_test = pd.DataFrame(re_test,columns = ['review'] )
df_test

In [None]:
# Attach the cleaned text as the 'review' column. assign() replaces an existing
# 'review' column, so re-running this cell cannot create duplicate columns the
# way a repeated concat(axis=1) would.
train_data = train_data.assign(review=re_train)
train_data

In [None]:
# Attach the cleaned text as the 'review' column. assign() replaces an existing
# 'review' column, so re-running this cell cannot create duplicate columns the
# way a repeated concat(axis=1) would.
test_data = test_data.assign(review=re_test)
test_data

In [None]:
train_data.drop(['document'],axis=1,inplace=True)

In [None]:
train_data

In [None]:
test_data.drop(['document'],axis=1,inplace=True)

In [None]:
test_data

In [None]:
from google.colab import files
# Interactive upload from the local machine (Colab only).
# NOTE(review): presumably this is where korean_stopword.csv is supplied — confirm.
myfile = files.upload()

In [None]:
# Set up the required keywords: stop words from the CSV column '불용어'
# followed by extra hand-picked tokens.
k_stopword = pd.read_csv('/content/korean_stopword.csv')

extra_stopword = [
    '을', '은', '를', '이가', '과', '의',
    '는', '에', '가', '이', '들', '좀', '잘',
    '걍', '과', '도', '으로', '자', '에', '와',
    '한', '하다', '있다', '되다', '에서',
]
stopword = k_stopword['불용어'].tolist() + extra_stopword
stopword[:5]

In [None]:
from konlpy.tag import Okt
okt = Okt()

# A set gives constant-time membership checks; testing against the list
# rescans every stop word for every token.
stopword_set = set(stopword)

morphs_lst = []
# Iterate the column directly instead of building a row Series per index.
for review in test_data['review']:
  # Stemmed morphemes with stop words removed.
  kept = [txt for txt in okt.morphs(review, stem=True) if txt not in stopword_set]
  # Each kept token is prefixed with a space, i.e. " tok1 tok2".
  morphs_lst.append("".join(" " + txt for txt in kept))
test_data['morphs'] = morphs_lst
test_data.head()

In [None]:
from konlpy.tag import Okt
okt = Okt()

# A set gives constant-time membership checks; testing against the list
# rescans every stop word for every token.
stopword_set = set(stopword)

morphs_lst = []
# Iterate the column directly instead of building a row Series per index.
for review in train_data['review']:
  # Stemmed morphemes with stop words removed.
  kept = [txt for txt in okt.morphs(review, stem=True) if txt not in stopword_set]
  # Each kept token is prefixed with a space, i.e. " tok1 tok2".
  morphs_lst.append("".join(" " + txt for txt in kept))
train_data['morphs'] = morphs_lst
train_data.head()

In [None]:
# Bar chart of how many training rows carry each label.
train_data['label'].value_counts().plot.bar()

In [None]:
# True if any cell of the training frame is missing.
print(train_data.isna().to_numpy().any())

In [None]:
from sklearn.model_selection import train_test_split

# Features are the space-joined morphemes, targets are the labels;
# 20% of the rows are held out with a fixed seed.
x, y = train_data['morphs'], train_data['label']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=11
)

In [None]:
print('학습 데이터의 수: ', x_train.shape[0])
# The held-out split gets its own label; it previously reused the training
# label ('학습 데이터의 수'), which made the two lines indistinguishable.
print('테스트 데이터의 수: ', x_test.shape[0])
print('### 학습데이터의 라벨 분포 ###')
y_train.value_counts()

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# First pass: fit a tokenizer with no vocabulary limit on the training texts
# only, so its word statistics can be inspected.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [None]:
# Show only the first few (word, index) pairs; printing the whole mapping
# writes one entry per vocabulary word into the cell output.
list(tokenizer.word_index.items())[:20]

In [None]:
threshold = 3
word_counts = tokenizer.word_counts  # word -> frequency pairs

total_cnt = len(tokenizer.word_index)  # number of words
total_freq = sum(word_counts.values())  # total frequency over all words

# Frequencies of the words that occur fewer than `threshold` times.
rare_counts = [count for count in word_counts.values() if count < threshold]
rare_cnt = len(rare_counts)   # how many such rare words there are
rare_freq = sum(rare_counts)  # their combined frequency

print('단어 집합(vocabulary)의 크기:',total_cnt)
print('등장 빈도가 %s번 이하인 희귀 단어의 수 :%s'%(threshold-1,rare_cnt))
print("단어 집합에서 희귀 단어의 비율:", (rare_cnt / total_cnt)*100)
print("전체 등장 빈도에서 희귀 단어 빈도 비율:", (rare_freq / total_freq)*100)

In [None]:
# Vocabulary size after discarding the rare words counted above.
# NOTE(review): the +2 presumably reserves index 0 (padding) and the OOV token — confirm.
vocab_size = total_cnt - rare_cnt +2
print("단어의 집합의 크기: ",vocab_size)

In [None]:
# Second pass: rebuild the tokenizer with the vocabulary capped at vocab_size
# and 'OOV' as the out-of-vocabulary token, then refit on the training texts.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(x_train)

In [None]:
# Encode both splits with the fitted tokenizer.
# NOTE(review): x_train / x_test are overwritten with their encoded form, so
# this cell is not meant to be re-run on its own output.
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [None]:
import matplotlib.pyplot as plt

# Length of every encoded training sample, computed once and reused below.
sample_lengths = [len(seq) for seq in x_train]

print('문서의 최대 길이 :', max(sample_lengths))
print('문서의 평균 길이 :', sum(sample_lengths)/len(sample_lengths))
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def below_threshold_len(max_len, nested_list):
  """Print the percentage of samples whose length is at most ``max_len``.

  Args:
    max_len: Inclusive length limit.
    nested_list: Sequence of sized samples (e.g. a list of token-id lists).

  Returns:
    None. The percentage is printed. An empty ``nested_list`` is reported
    as 0.0 instead of raising ZeroDivisionError.
  """
  total = len(nested_list)
  cnt = sum(1 for s in nested_list if len(s) <= max_len)
  ratio = (cnt / total) * 100 if total else 0.0
  print('전체 샘플 중 길이가 %s 이하인 샘플의 비율: %s'%(max_len, ratio))

In [None]:
# Sequence length used for padding; the call reports what share of the
# training samples are no longer than this.
max_len = 35
below_threshold_len(max_len, x_train)

In [None]:
from tensorflow.keras.preprocessing import sequence

print('시퀀스 패딩(sample * time)')
# Bring every encoded sample in both splits to length max_len.
x_train = sequence.pad_sequences(x_train,maxlen=max_len)
x_test = sequence.pad_sequences(x_test,maxlen=max_len)
print('x_train 크기', x_train.shape)
print('x_test 크기', x_test.shape)

In [None]:
# Split the padded training data once more: 20% becomes x_val / y_val,
# using the same fixed seed as the earlier split.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
                                                  test_size = 0.2,
                                                  random_state=11)

In [None]:
# Number of validation samples, then number of remaining training samples.
print(len(x_val), len(x_train))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Two stacked LSTM layers over learned word embeddings, ending in a single
# sigmoid unit for binary classification.
model = Sequential()
# input_dim must cover every index the tokenizer can emit (it was built with
# num_words=vocab_size). The previous hard-coded 1000 left larger word ids
# outside the embedding table.
model.add(Embedding(input_dim = vocab_size, output_dim = 64))
# return_sequences=True so the second LSTM receives the full sequence.
model.add(LSTM(128, return_sequences =True))
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['acc'])

In [None]:
# Validate on the dedicated x_val / y_val split. Using validation_split here
# would leave that split unused and hold back a further 20% of x_train from
# training.
history = model.fit(x_train, y_train,
                    batch_size = 32, epochs=100,
                    validation_data=(x_val, y_val))

In [None]:
import matplotlib.pyplot as plt

his_dict = history.history
loss, val_loss = his_dict['loss'], his_dict['val_loss']
acc, val_acc = his_dict['acc'], his_dict['val_acc']

epochs = range(1, len(loss) + 1)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10, 5))

# One panel per metric: training/validation loss on the left,
# training/validation accuracy on the right.
panels = (
    (ax1, loss, val_loss, 'train_loss', 'val_loss', 'train and val loss', 'loss'),
    (ax2, acc, val_acc, 'train_acc', 'val_acc', 'train and val acc', 'acc'),
)
for ax, train_curve, val_curve, train_label, val_label, title, ylabel in panels:
    ax.plot(epochs, train_curve, color = 'blue', label = train_label)
    ax.plot(epochs, val_curve, color = 'orange', label = val_label)
    ax.set_title(title)
    ax.set_xlabel('epochs')
    ax.set_ylabel(ylabel)
    ax.legend()

plt.show()

In [None]:
import numpy as np
def pred(words):
    """Print the predicted sentiment ('긍정' or '부정') for one raw review.

    The text goes through the same steps as the training data: characters
    matched by ``sub_rext`` are blanked out, Okt extracts stemmed morphemes,
    stop words are removed, and the tokens are encoded and padded to
    ``max_len`` before being scored by ``model``.

    Args:
        words: Raw review text.

    Returns:
        None. The label is printed.
    """
    # Training reviews were regex-cleaned before tokenising; apply the same
    # cleaning here so punctuation is not fed to the model as extra tokens.
    cleaned = re.sub(sub_rext, ' ', str(words))
    tokens = [tok for tok in okt.morphs(cleaned, stem=True) if tok not in stopword]
    encoded = tokenizer.texts_to_sequences([tokens])
    pad_words = sequence.pad_sequences(encoded, maxlen=max_len)

    # One input row and a single sigmoid output unit: read the probability as
    # a scalar instead of relying on the truthiness of a one-element array.
    score = float(model.predict(pad_words)[0][0])

    if score >= 0.5:
        print('긍정')
    else:
        print('부정')

In [None]:
# Manual spot checks: run the classifier on hand-written review sentences.
pred('위성락을 뛰어넘는 또 하나의 역대급 악역이 탄생했다.')

In [None]:
pred('빌런 진선규의 존재감은 압도적이다.')

In [None]:
pred('너무 재미있다.')

In [None]:
pred('더럽게 재미없네')

In [None]:
pred('ㅋㅋㅋ 별로임')

In [None]:
pred(' 1편보다 더 웃김ㅋㅋㅋ 배우들이 더 친해져서 그런가 확실히 더 재밌음')

In [None]:
# The next three inputs are small variations of the same sentence.
pred('나만 볼 수는 없지')

In [None]:
pred('나만 볼 수 없지 ')

In [None]:
pred('ㅅㅂ 나만 볼 수 없지')