In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:15pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:15px;}
</style>
"""))

**<font size="4" color="red">ch5. LSTM/GRU</font>
- 5만개 영화 감상평(독립변수) -> 부정/긍정(타겟변수)

In [2]:
# 1. 패키지
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from time import time # 70.01.01부터 현재까지의 밀리세컨

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [3]:
# 2. 하이퍼 파라미터 설정(이 파라미터를 바꾸면 정확도나 학습 속도에 차이 발생)
MY_WORDS  = 10000 # imdb 데이터의 단어 수
MY_LENGTH = 80    # 영화평 단어 수 80개만 독립변수
MY_EMBED  = 32    # Embedding layer의 결과 차원
MY_HIDDEN = 64    # LSTM의 units 차원

MY_EPOCH  = 10    # 학습 수(fit)
MY_BATCH  = 200   # batch_size(fit시 매번 데이터를 가져오는 데이터)

In [4]:
# 3. 데이터 불러오기
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=MY_WORDS)

In [5]:
print('학습셋 입력변수 모양 : ', x_train.shape)
print('학습셋 타겟변수 모양 : ', y_train.shape)
print('학습셋 샘플 :', type(x_train[2]), x_train[100], y_train[2])
print('테스트셋 변수들 모양 :', x_test.shape, y_test.shape)

학습셋 입력변수 모양 :  (25000,)
학습셋 타겟변수 모양 :  (25000,)
학습셋 샘플 : <class 'list'> [1, 13, 244, 6, 87, 337, 7, 628, 2219, 5, 28, 285, 15, 240, 93, 23, 288, 549, 18, 1455, 673, 4, 241, 534, 3635, 8448, 20, 38, 54, 13, 258, 46, 44, 14, 13, 1241, 7258, 12, 5, 5, 51, 9, 14, 45, 6, 762, 7, 2, 1309, 328, 5, 428, 2473, 15, 26, 1292, 5, 3939, 6728, 5, 1960, 279, 13, 92, 124, 803, 52, 21, 279, 14, 9, 43, 6, 762, 7, 595, 15, 16, 2, 23, 4, 1071, 467, 4, 403, 7, 628, 2219, 8, 97, 6, 171, 3596, 99, 387, 72, 97, 12, 788, 15, 13, 161, 459, 44, 4, 3939, 1101, 173, 21, 69, 8, 401, 2, 4, 481, 88, 61, 4731, 238, 28, 32, 11, 32, 14, 9, 6, 545, 1332, 766, 5, 203, 73, 28, 43, 77, 317, 11, 4, 2, 953, 270, 17, 6, 3616, 13, 545, 386, 25, 92, 1142, 129, 278, 23, 14, 241, 46, 7, 158] 0
테스트셋 변수들 모양 : (25000,) (25000,)


In [6]:
# 긍정 / 부정 개수
print('학습셋의 긍정 개수 : ', y_train.sum())
print('테스트셋의 긍정 개수 : ', y_test.sum())

학습셋의 긍정 개수 :  12500
테스트셋의 긍정 개수 :  12500


## 4. 문자단어 -> 정수

In [7]:
word_to_id = imdb.get_word_index() # ('word':id)
print(word_to_id['movie'])
print(word_to_id['film'])
print(word_to_id['sonja'])
print(word_to_id['a'])
print(word_to_id['the'])
# 정수 -> 문자단어
id_to_word = {} # {1:'the', 3:'a', 16816:'sonja'}
for word, value in word_to_id.items() :
    id_to_word[value] = word
print(id_to_word[1])
print(id_to_word[3])

17
19
16816
3
1
the
a


In [8]:
msg = "What a wonderful movie"
msg = msg.lower().split()
# 1: 리뷰 시작을 알리는 숫자, 2: 문자가 짤려서 잘못 읽어옴, 3: padding 처리
data = [1] + [word_to_id.get(m, -1)+3 for m in msg]
print('원 후기 : ', msg)
print('encoded된 data : ', data)
print('추정된 data : ', [id_to_word.get(d-3, '???') for d in data])
print('추정된 data : ', ' '.join([id_to_word.get(d-3, '???') for d in data]))

원 후기 :  ['what', 'a', 'wonderful', 'movie']
encoded된 data :  [1, 51, 6, 389, 20]
추정된 data :  ['???', 'what', 'a', 'wonderful', 'movie']
추정된 data :  ??? what a wonderful movie


## 5. 숫자영화평 -> 자연어 영화평 return 함수

In [9]:
def decoding(review_num) :
    decoded = [id_to_word.get(num-3, '???') for num in review_num]
    return ' '.join(decoded)

In [10]:
print(decoding(x_train[1]), y_train[1])

??? big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal ??? the hair is big lots of boobs ??? men wear those cut ??? shirts that show off their ??? sickening that men actually wore them and the music is just ??? trash that plays over and over again in almost every scene there is trashy music boobs and ??? taking away bodies and the gym still doesn't close for ??? all joking aside this is a truly bad film whose only charm is to look back on the disaster that was the 80's and have a good old laugh at how bad everything was back then 0


## 6. 영화평(입력변수)의 길이

In [11]:
def show_length() : 
    print("첫 20개 영화평 길이")
    print([len(x_data) for x_data in x_train[:20]])

In [12]:
# pad_sequence 전
show_length()

첫 20개 영화평 길이
[218, 189, 141, 550, 147, 43, 123, 562, 233, 130, 450, 99, 117, 238, 109, 129, 163, 752, 212, 177]


In [13]:
print("제일 긴 영화평 단어 길이 : ",
         max(len(x_data) for x_data in x_train))
print("제일 짧은 영화평 단어 길이 : ",
         min(len(x_data) for x_data in x_train))
print("제일 긴 영화평 단어 길이 : ",
        np.median([len(x_data) for x_data in x_train]))

제일 긴 영화평 단어 길이 :  2494
제일 짧은 영화평 단어 길이 :  11
제일 긴 영화평 단어 길이 :  178.0


## 7. 모든 영화평 길이를 동일하기(80)

In [15]:
X_train = pad_sequences(x_train,
                        padding='post',
                        truncating='post', # 뒷부분을 자르고 앞부분을 남김
                        maxlen=MY_LENGTH
                        )
X_test = pad_sequences(x_test,
                        padding='post',
                        truncating='post', # 뒷부분을 자르고 앞부분을 남김
                        maxlen=MY_LENGTH
                        )
show_length(X_train), show_length(X_test)

TypeError: show_length() takes 0 positional arguments but 1 was given