<a href="https://colab.research.google.com/github/chhak/DeepLearning/blob/master/Text_movie_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only) so the files under that path can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
날짜 : 2020/09/08
이름 : 권기민
내용 : 영화 리뷰 감성 분석하기
"""
import codecs
import numpy as np
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM

In [None]:
# File loading helper
def load_data(file):
  """Read a tab-separated text file and return its data rows.

  Args:
    file: Path of a UTF-8 encoded, tab-separated file whose first line
      is a header.

  Returns:
    A list with one entry per non-empty data line. Each entry is the list
    of tab-separated fields of that line (all strings). The header line
    is not included.
  """
  result = []

  with open(file, 'r', encoding='utf-8') as f:
    # Drop the header line; the default makes this a no-op on an empty file.
    next(f, None)

    # Iterate over the file object instead of using str.splitlines():
    # splitlines() also breaks on characters such as \x0b, \x0c, \x85 and
    # \u2028, which would cut a single record into several bogus rows when
    # they occur inside the text column.
    for line in f:
      line = line.rstrip('\n')
      if not line:
        # Blank lines carry no fields and would break column indexing later.
        continue
      result.append(line.split('\t'))

  return result

# Dataset construction helper
def make_dataset(train_data, test_data, num_words=5000):
  """Turn parsed rows into tokenized integer sequences and integer labels.

  Args:
    train_data: Rows used for training. In each row, index 1 is the text
      and index 2 is the label, which is converted with int().
    test_data: Rows used for testing, with the same layout as train_data.
    num_words: Value passed to Tokenizer(num_words=...). Defaults to 5000,
      the value that used to be hard-coded. The Embedding layer built later
      in this notebook uses 5000 as its input dimension, so keep the two in
      sync if this is changed.

  Returns:
    ((token_train_x, train_y), (token_test_x, test_y)) where the token_*
    values are lists of integer index sequences (one per text) and the *_y
    values are lists of int labels.
  """
  def split_rows(rows):
    # Separate the text column (index 1) from the label column (index 2).
    texts = [row[1] for row in rows]
    labels = [int(row[2]) for row in rows]
    return texts, labels

  train_x, train_y = split_rows(train_data)
  test_x, test_y = split_rows(test_data)

  # The vocabulary is fitted on the training texts only; the test texts are
  # encoded with that same vocabulary.
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(train_x)

  # Encode every text as a sequence of integer word indices
  # (index sequences, not a bag-of-words vector).
  token_train_x = tokenizer.texts_to_sequences(train_x)
  token_test_x = tokenizer.texts_to_sequences(test_x)

  return (token_train_x, train_y), (token_test_x, test_y)

In [None]:
# Load the training and test rating files from Google Drive (load_data drops the header row).
train_data = load_data('/content/drive/My Drive/Tensorflow_works/data/ratings_train.txt')
test_data = load_data('/content/drive/My Drive/Tensorflow_works/data/ratings_test.txt')
# Display the first parsed training row as a quick sanity check.
train_data[0]

['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0']

In [None]:
# Build the datasets: tokenized sequences and integer labels for both the training and the test split.
(train_x, train_y), (test_x, test_y) = make_dataset(train_data, test_data)

[23, 936, 4, 1097] 0


In [None]:
# Inspect the datasets: first two training samples, then the first and last test samples.
print(train_x[0], train_y[0])
print(train_x[1], train_y[1])
print(test_x[0], test_y[0])
# Index from the end rather than hard-coding 49999, so the cell does not raise
# IndexError when the test file has a different number of rows.
print(test_x[-1], test_y[-1])

In [None]:
# Preprocessing: run every token sequence through pad_sequences with maxlen=100
# and convert the label lists to NumPy arrays.
train_x, test_x = (
    sequence.pad_sequences(train_x, maxlen=100),
    sequence.pad_sequences(test_x, maxlen=100),
)
train_y, test_y = np.array(train_y), np.array(test_y)

# Display the first padded training sequence.
train_x[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,   23,  936,    4,
       1097], dtype=int32)

In [None]:
# Model definition: embedding -> LSTM -> single sigmoid unit.
model = Sequential([
    Embedding(5000, 128),            # 5000 matches the tokenizer's num_words
    LSTM(128, dropout=0.2),
    Dense(1, activation='sigmoid'),  # one output in [0, 1] for the binary label
])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         640000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 771,713
Trainable params: 771,713
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy as the reported metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])

In [None]:
# Train the model for 10 epochs with batches of 128 (original author's note: takes about 1 hour).
model.fit(train_x, 
          train_y,
          batch_size = 128,
          epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb5a92eaf60>

In [None]:
# Evaluate the trained model on the test set.
# The recorded output holds two numbers; given the compile() settings these are
# presumably [loss, accuracy] -- confirm against the Keras version in use.
result = model.evaluate(test_x, test_y, batch_size=128)
result



[0.5349799394607544, 0.7687199711799622]

In [None]:
# Save the trained model to Google Drive.
# NOTE(review): the recorded output shows a directory with an assets/ folder was written
# at this path. Newer Keras releases may reject a '.model' suffix and require
# '.keras' or '.h5' -- confirm before upgrading TensorFlow.
model.save('/content/drive/My Drive/Tensorflow_works/model/review1.model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /content/drive/My Drive/Tensorflow_works/model/review1.model/assets
