In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import re
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

## word2vec 임포트

In [3]:
from gensim.models import Word2Vec
ko_model= Word2Vec.load('word2vec_movie.model')

In [4]:
# 모델 확인 
ko_model['평점']

  


array([ 1.37894905e+00,  9.87708807e-01, -5.03856897e-01, -5.77842712e-01,
        2.71781266e-01, -4.84355837e-01,  7.09133685e-01,  4.83562052e-01,
       -2.12013578e+00, -1.53143942e+00, -2.21780896e+00, -4.36312646e-01,
       -7.34049320e-01,  1.32041967e+00, -5.64270079e-01, -1.27724886e+00,
       -6.67323589e-01,  1.25785872e-01,  8.86485755e-01,  5.52864611e-01,
        1.83218852e-01,  3.81816566e-01, -1.45142281e+00,  1.32863295e+00,
       -2.28901446e-01, -8.47232699e-01, -1.07557023e+00,  1.31017220e+00,
       -1.11492407e+00,  4.67573434e-01, -1.73956227e+00,  5.30944943e-01,
       -7.81446993e-01,  8.03073198e-02,  1.06718056e-02, -7.76855171e-01,
       -5.18462360e-01,  1.82522148e-01, -3.42364371e-01,  8.30587745e-01,
       -6.77727163e-01,  2.17546612e-01, -8.99270713e-01,  3.50216419e-01,
        1.27970874e-01, -1.43166259e-02, -6.87184393e-01,  7.87147760e-01,
        3.13703835e-01, -1.34181023e+00, -2.88948655e-01,  4.87908751e-01,
       -4.40034330e-01, -

In [5]:
print(ko_model.wv.most_similar("최민식"))

[('한석규', 0.9022693634033203), ('이민호', 0.8778703808784485), ('이미숙', 0.8716947436332703), ('설경구', 0.8695714473724365), ('김명민', 0.8679181933403015), ('이정재', 0.863978385925293), ('정재영', 0.8619986176490784), ('메릴', 0.8613458871841431), ('이주승', 0.861168622970581), ('공리', 0.8594560027122498)]


## Train Test Data 불러오기

In [6]:
import pickle
import pandas as pd
test = pd.read_pickle("token_test_data.pkl")
train = pd.read_pickle("token_train_data.pkl")

# train, test 명시 
training_sentences = train['tokens']
testing_sentences = test['tokens']

training_labels = train['labels']
testing_labels = test['labels']

training_sentences.shape, testing_sentences.shape, training_labels.shape, testing_labels.shape

((145791,), (48995,), (145791,), (48995,))

In [7]:
training_sentences

0                                  [더빙, 진짜, 짜증나다, 목소리]
1                 [흠, 포스터, 보고, 초딩, 영화, 줄, 오버, 연기, 가볍다]
2                                 [무재, 밓었, 다그, 래서, 추천]
3                 [교도소, 이야기, 구먼, 솔직하다, 재미, 없다, 평점, 조정]
4         [몬페, 의, 익살스럽다, 연기, 영화, 스파이더맨, 커스틴, 던스트, 이쁘다]
                              ...                     
145786                                 [인간, 문제, 소, 죄인]
145787                                        [평점, 낮다]
145788                  [이, 뭐, 한국인, 먹거리, 필리핀, 혼혈, 착하다]
145789                 [청춘, 영화, 최고봉, 방황, 우울하다, 날, 자화상]
145790                        [한국, 영화, 최초, 수간, 내용, 영화]
Name: tokens, Length: 145791, dtype: object

## Tokenize / Padding 

In [20]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 파라미터 명시 
vocab_size = 20000
embedding_dim = 200
max_length = 30
truct_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

word_index, len(word_index)

({'<OOV>': 1,
  '영화': 2,
  '없다': 3,
  '있다': 4,
  '좋다': 5,
  '재밌다': 6,
  '정말': 7,
  '것': 8,
  '같다': 9,
  '진짜': 10,
  '아니다': 11,
  '점': 12,
  '이': 13,
  '연기': 14,
  '평점': 15,
  '최고': 16,
  '왜': 17,
  '스토리': 18,
  '생각': 19,
  '드라마': 20,
  '감동': 21,
  '사람': 22,
  '보고': 23,
  '말': 24,
  '이렇다': 25,
  '아깝다': 26,
  '더': 27,
  '배우': 28,
  '때': 29,
  '감독': 30,
  '거': 31,
  '내': 32,
  '재미있다': 33,
  '뭐': 34,
  '시간': 35,
  '재미': 36,
  '내용': 37,
  '그냥': 38,
  '좀': 39,
  '그': 40,
  '지루하다': 41,
  '재미없다': 42,
  '쓰레기': 43,
  '수': 44,
  '그렇다': 45,
  '작품': 46,
  '사랑': 47,
  '나': 48,
  '하나': 49,
  '다시': 50,
  '마지막': 51,
  '볼': 52,
  '이다': 53,
  '정도': 54,
  '처음': 55,
  '완전': 56,
  '많다': 57,
  '장면': 58,
  '액션': 59,
  '주인공': 60,
  '안되다': 61,
  '돈': 62,
  '최악': 63,
  '이야기': 64,
  '지금': 65,
  '걸': 66,
  '느낌': 67,
  '연출': 68,
  '임': 69,
  '끝': 70,
  '듯': 71,
  '좋아하다': 72,
  '명작': 73,
  '별로': 74,
  '년': 75,
  '역시': 76,
  '개': 77,
  '이해': 78,
  '안': 79,
  '이영화': 80,
  '괜찮다': 81,
  '또': 82,
  '때문': 83,
  '여자': 84,


In [38]:
# Sequence 만들기 / Padding하기 - train, test 모두 
training_sequences  = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, 
                                padding=padding_type, truncating=truct_type)


testing_sequences  = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, 
                                padding=padding_type, truncating=truct_type)

## Word2Vec weight 만들기 

In [None]:
ko_model

In [50]:
import numpy as np

vocab_size = len(tokenizer.index_word) + 1
embedding_dim = 200

embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# tokenizer에 있는 단어 사전을 순회하면서 word2vec의 200차원 vector를 가져오기
for word, idx in tokenizer.word_index.items():
    embedding_vector = ko_model[word] if word in ko_model else None
    if embedding_vector is not None:
        embedding_matrix[idx] = embedding_vector
        
embedding_matrix.shape

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


(40015, 200)

In [None]:
word_index['최고']

In [46]:
ko_model['최고']

  """Entry point for launching an IPython kernel.


array([ 0.20391989,  0.853267  , -1.0373845 , -0.76402867,  0.23367792,
       -0.10476255,  0.63840646, -0.34341994, -0.65395963, -0.19811577,
        0.12869409,  0.04804405, -0.29633138,  0.83437115, -0.6217438 ,
       -0.46856475,  0.6242253 ,  0.23053367,  0.06618986,  0.9534592 ,
        1.1363881 ,  0.65448356,  0.07115816, -0.04635384,  0.62641287,
       -0.6753973 ,  0.68135244, -0.16181223, -0.01163022,  0.0247314 ,
       -0.8546157 , -0.13200873, -0.725247  ,  0.45765552,  0.03358468,
       -0.6409772 ,  0.67605525,  1.1375098 ,  0.02453117, -0.807451  ,
        0.00170812, -0.14917341,  0.755414  ,  0.893047  ,  0.36652172,
        0.10760504, -1.0363314 , -0.14687106,  0.4854047 , -1.525439  ,
       -0.03832943, -0.43770686, -0.97458017, -1.2530044 , -0.5024239 ,
        0.13908501, -0.9827486 , -0.42983523, -0.7944189 ,  0.9259701 ,
       -0.31750435, -0.0639666 ,  0.26727185,  0.34849894, -0.34705332,
        0.7082175 ,  0.16015601,  1.0106773 , -0.13566843, -0.88

In [48]:
embedding_matrix[16]

array([-0.85884821,  0.92817163,  0.02203734,  1.36972463, -0.47099909,
       -1.50105476, -0.49414608,  0.49253559, -0.23244537, -0.04845205,
        0.09304108, -0.11300896,  0.5571053 , -0.11795222,  0.49380097,
       -0.55563283,  0.11953602, -0.18062858, -1.25813925, -0.81152546,
        0.42466506,  0.8579601 ,  0.19897878, -0.81691873,  0.86665642,
       -1.04654455, -0.49829051,  0.25393131, -0.09490417, -0.93356013,
       -1.64327502,  0.32434571,  1.26135635,  1.10397637, -0.21867326,
       -0.55674183,  0.90235335, -1.17161012,  0.77494115, -0.52130711,
       -0.21292363, -0.30222899, -0.80103284, -0.12620486, -1.19516397,
       -0.22341141,  0.38163522,  0.00649113,  0.26027831,  0.02779087,
        0.95976293, -0.24270368, -0.39551368,  0.12866431, -0.59832066,
        0.12882997, -0.12808815,  0.87104553,  0.00889622, -0.32881373,
        0.03664391,  0.78057766,  0.28789434,  2.01207733,  0.0571333 ,
        1.24475467, -0.7004512 , -0.26596418, -0.51164573,  0.31

## 모델링

In [65]:
# 모델링  - L2 추가 
embedding_dim = 200
filter_sizes = (3, 4, 5)
num_filters = 100
dropout = 0.5
hidden_dims = 100

conv_blocks =[]
input_shape = (30)
model_input = tf.keras.layers.Input(shape=input_shape)
z = model_input
for sz in filter_sizes:
    embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length,
                                         weights = [embedding_matrix], trainable = False)(z)
    conv = tf.keras.layers.Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(embedding)
    conv = tf.keras.layers.GlobalAveragePooling1D()(conv)
    conv = tf.keras.layers.Flatten()(conv)
    conv_blocks.append(conv)
z = tf.keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = tf.keras.layers.Dense(hidden_dims, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.003), bias_regularizer=tf.keras.regularizers.l2(0.003))(z)
z = tf.keras.layers.Dropout(dropout)(z)
model_output = tf.keras.layers.Dense(1, activation="sigmoid")(z)
model = tf.keras.Model(model_input, model_output)

In [66]:
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 30, 200)      8003000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 30, 200)      8003000     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 30, 200)      8003000     input_5[0][0]                    
_______________________________________________________________________________________

In [67]:
# 모델 컴파일 
batch_size = 50
num_epochs = 10
min_word_count = 1
context = 10

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
# model training 
num_epochs = 30
early_stopping=tf.keras.callbacks.EarlyStopping(patience=2)
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
 338/4556 [=>............................] - ETA: 1:52 - loss: 0.4274 - accuracy: 0.8100

KeyboardInterrupt: 