## RNN 과제 Colab 기준으로 작성, data 너무 커서 train 10000개, test 2000개만 사용

In [3]:
# Colab environment setup: refresh the apt package index, install g++ and JDK 8
# (presumably build/runtime prerequisites for konlpy - confirm against the konlpy
# install guide), then install konlpy itself with pip.
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip3 install konlpy

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:4 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease [21.3 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [564 B]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [819 B]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:11 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]
Get:12 http://archive.ubuntu.com/ubuntu

In [1]:
import urllib.request
import pandas as pd
import numpy as np
from konlpy.tag import Okt
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from gensim.models import FastText

## 데이터 불러오기 및 전처리

In [2]:
### Download the NSMC train/test rating files from GitHub into the working directory.
### The value of the last call (local filename, HTTP headers) is what the cell displays.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

('ratings_test.txt', <http.client.HTTPMessage at 0x7f2957114358>)

In [0]:
### Parse the two downloaded tab-separated rating files into DataFrames
train_data = pd.read_csv('ratings_train.txt', sep='\t')
test_data = pd.read_csv('ratings_test.txt', sep='\t')

In [4]:
### The full training split has 150,000 reviews; label 1 = positive, 0 = negative.
### Only the first 10,000 rows are kept because the full set is too large for this exercise.
print(len(train_data))
# .copy() detaches the subset from the original frame, so later column assignments
# act on an independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
train_data = train_data[:10000].copy()

150000


In [5]:
### The full test split has 50,000 reviews; only the first 2,000 rows are kept.
print(len(test_data))
# .copy() detaches the subset from the original frame, so later column assignments
# act on an independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
test_data = test_data[:2000].copy()

50000


In [0]:
### Text cleaning: remove every character that is not Hangul or a space.
# regex=True is explicit because the pattern is a character class; newer pandas
# versions treat the pattern as a literal string when the flag is omitted.
NON_HANGUL_PATTERN = "[^ㄱ-ㅎㅏ-ㅣ가-힣 ]"
train_data['document'] = (
    train_data['document']
    .str.replace(NON_HANGUL_PATTERN, "", regex=True)
    # Reviews that became empty are marked NaN so the dropna() step can remove them.
    # Assigning the result back (instead of inplace=True on the column) guarantees
    # the DataFrame itself is updated.
    .replace('', np.nan)
)
# Apply the same character filter to the test split so both splits reach the
# tokenizer in the same form. No rows are removed from the test split.
test_data['document'] = test_data['document'].str.replace(NON_HANGUL_PATTERN, "", regex=True)

In [7]:
### Data sanity checks and clean-up on the training split:
### report NaN documents, confirm no empty strings remain, then drop duplicates and NaN rows.
print(len(train_data[train_data['document'].isna()]))
print(train_data[train_data['document'] == ''])
# drop_duplicates treats all NaN documents as duplicates of one another, so a single
# NaN row survives this step and is removed by dropna() right after.
# Results are re-assigned rather than mutated in place, so the steps behave the same
# whether train_data is an independent frame or a slice of another one.
train_data = train_data.drop_duplicates(subset=['document'])
print(len(train_data))
train_data = train_data.dropna()
print(len(train_data))

47
Empty DataFrame
Columns: [id, document, label]
Index: []
9831
9830


## 형태소 분석 ( 토큰화 )

In [0]:
okt = Okt()

# Part-of-speech tags kept by tokenizer_morphs (the author's own choice);
# tokens with any other tag are discarded.
KEPT_POS_TAGS = ('Noun', 'Verb', 'Adjective', 'Adverb')

def tokenizer_morphs(doc):
    """Tokenise one review with Okt and keep only nouns, verbs, adjectives and adverbs.

    Args:
        doc: the review text. Non-string values (e.g. NaN for a missing review)
            are accepted and produce an empty list.

    Returns:
        A list of token strings (normalised and stemmed by Okt). An empty list is
        returned when the input is not a string or when tagging fails.
    """
    if not isinstance(doc, str):
        # Missing reviews arrive as NaN (a float); there is nothing to tokenise.
        return []
    try:
        tagged = okt.pos(doc, norm=True, stem=True)  # list of (token, POS tag) pairs
    except Exception:
        # Best effort: a review the tagger cannot process becomes an empty token list.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return []
    return [token for token, tag in tagged if tag in KEPT_POS_TAGS]

In [0]:
### Tokenise both splits. This step is slow, so the results are pickled straight away
### and can be reloaded instead of recomputed.
train_data['document'] = train_data['document'].apply(tokenizer_morphs)
test_data['document'] = test_data['document'].apply(tokenizer_morphs)

# Persist each tokenised frame to its own pickle file
for pkl_path, frame in (('train_data.pkl', train_data), ('test_data.pkl', test_data)):
    with open(pkl_path, 'wb') as fh:
        pickle.dump(frame, fh)

In [0]:
### Reload the tokenised frames from the pickle files (train first, then test)
frames = []
for pkl_path in ('train_data.pkl', 'test_data.pkl'):
    with open(pkl_path, 'rb') as fh:
        frames.append(pickle.load(fh))
train_data, test_data = frames

## 워드임베딩

In [11]:
### Train a FastText embedding model on the tokenised training reviews ###
sentences = train_data['document'].tolist()  # one token list per review
model = FastText(sentences, size=100, window=5, min_count=1, workers=1)
model.save("NaverMovie.vec")  ## persist the trained model to disk

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
### Reload the FastText model from the file written by model.save("NaverMovie.vec")
model = FastText.load("NaverMovie.vec")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## LSTM에 집어 넣기 위한 데이터 전처리


In [0]:
### Convert every training token into its 100-dimensional FastText vector.
### The token lists stored in the column are modified in place, so run this cell only
### once per kernel session: a second run would look up vectors instead of strings and
### now fails loudly rather than silently replacing everything with zeros.
for words in train_data['document']:
    for j, word in enumerate(words):
        try:
            words[j] = model.wv[word]
        except KeyError:
            # No vector available for this token: fall back to a zero vector.
            # float32 keeps the fallback consistent with the zero padding vectors.
            words[j] = np.zeros(100, dtype=np.float32)

In [0]:
### Convert every test token into its 100-dimensional FastText vector.
### The token lists stored in the column are modified in place, so run this cell only
### once per kernel session: a second run would look up vectors instead of strings and
### now fails loudly rather than silently replacing everything with zeros.
for words in test_data['document']:
    for j, word in enumerate(words):
        try:
            words[j] = model.wv[word]
        except KeyError:
            # No vector available for this token: fall back to a zero vector.
            # float32 keeps the fallback consistent with the zero padding vectors.
            words[j] = np.zeros(100, dtype=np.float32)

In [15]:
### Left-pad every training review with zero vectors so each one is exactly 54 steps long
### (54 x 100 is the input shape the LSTM is built with).
def _pad_to_54(words):
    """Return `words` as a list of exactly 54 vectors: zero vectors first, tokens last."""
    words = list(words)[-54:]  # an over-long review keeps its last 54 tokens
    return [np.zeros(100, dtype=np.float32)] * (54 - len(words)) + words

# The whole column is replaced in one assignment. Writing element by element through
# train_data['document'].iloc[i] is chained indexing: pandas warns about it and may
# update a temporary copy instead of the DataFrame.
train_data['document'] = train_data['document'].apply(_pad_to_54)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [0]:
def _stack_padded(docs, max_len=54, dim=100):
    """Stack per-review vector lists into one (n_reviews, max_len, dim) float32 array.

    Reviews shorter than `max_len` are left-padded with zeros, longer ones keep their
    last `max_len` vectors, and reviews with no tokens become all zeros. Input that is
    already exactly `max_len` long is copied unchanged.
    """
    stacked = np.zeros((len(docs), max_len, dim), dtype=np.float32)
    for row, vectors in enumerate(docs):
        vectors = vectors[-max_len:]
        if len(vectors):
            stacked[row, max_len - len(vectors):] = np.asarray(vectors, dtype=np.float32)
    return stacked

### Build the model inputs and labels.
# The test reviews are padded here as well; without it they have different lengths
# and cannot form a regular 3-D array.
X_train = _stack_padded(train_data['document'].tolist())
X_test = _stack_padded(test_data['document'].tolist())
y_train = np.array(train_data['label'])
# The test labels must come from test_data (they were previously read from train_data).
y_test = np.array(test_data['label'])

## LSTM

In [73]:
### Single-layer LSTM classifier: 54 time steps x 100 features in, one sigmoid unit out
lmodel = Sequential([
    LSTM(20, input_shape=(54, 100)),
    Dense(1, activation='sigmoid'),
])
lmodel.summary()

### Callbacks: stop once val_loss has not improved for 4 epochs, and write
### final_model.h5 only when val_acc reaches a new best
es = EarlyStopping(monitor='val_loss', patience=4, mode='min', verbose=1)
mc = ModelCheckpoint('final_model.h5', monitor='val_acc', save_best_only=True, mode='max', verbose=1)

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_18 (LSTM)               (None, 20)                9680      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 21        
Total params: 9,701
Trainable params: 9,701
Non-trainable params: 0
_________________________________________________________________


In [74]:
### Train with Adam on binary cross-entropy, holding out 20% of the training rows for validation
lmodel.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['acc'])
history = lmodel.fit(
    X_train,
    y_train,
    batch_size=60,
    epochs=15,
    validation_split=0.2,
    callbacks=[es, mc],
)

Train on 7864 samples, validate on 1966 samples
Epoch 1/15
Epoch 00001: val_acc improved from -inf to 0.52798, saving model to final_model.h5
Epoch 2/15
Epoch 00002: val_acc did not improve from 0.52798
Epoch 3/15
Epoch 00003: val_acc did not improve from 0.52798
Epoch 4/15
Epoch 00004: val_acc improved from 0.52798 to 0.53764, saving model to final_model.h5
Epoch 5/15
Epoch 00005: val_acc improved from 0.53764 to 0.54324, saving model to final_model.h5
Epoch 6/15
Epoch 00006: val_acc did not improve from 0.54324
Epoch 7/15
Epoch 00007: val_acc did not improve from 0.54324
Epoch 8/15
Epoch 00008: val_acc did not improve from 0.54324
Epoch 9/15
Epoch 00009: val_acc improved from 0.54324 to 0.54476, saving model to final_model.h5
Epoch 10/15
Epoch 00010: val_acc did not improve from 0.54476
Epoch 11/15
Epoch 00011: val_acc did not improve from 0.54476
Epoch 12/15
Epoch 00012: val_acc did not improve from 0.54476
Epoch 13/15
Epoch 00013: val_acc did not improve from 0.54476
Epoch 14/15
Ep