In [1]:
import numpy as np
import keras
from keras.models import Sequential , Model
from keras.layers import Input , RepeatVector , TimeDistributed  , GRU , Bidirectional , LSTM , Dense
from keras import regularizers
import os
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
from keras.models import model_from_json
from keras.datasets import mnist
from gensim.models.word2vec import Word2Vec

# Enable on-demand GPU memory allocation instead of letting TensorFlow
# reserve the whole device up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
if gpus:
    try:
        # TensorFlow requires the memory-growth setting to be the same on
        # every visible GPU, so configure all of them rather than only the
        # first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set at program start, before the GPUs have
        # been initialized; otherwise TensorFlow raises RuntimeError.
        print(e)


Using TensorFlow backend.


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
def double_model(maxlen , word_vector):
    """Build a stacked-GRU sequence autoencoder (uncompiled).

    The encoder is two GRU layers (256 units, then 128 units returning only
    the final state).  The final state is repeated ``maxlen`` times and fed
    to a two-layer GRU decoder followed by a per-timestep Dense projection
    back to the input feature size.

    Args:
        maxlen: Number of timesteps in each input sequence.
        word_vector: Size of the feature vector at each timestep.  Despite
            the name this is an integer dimension, not a vector object; it
            is used both as the input feature size and as the output size.

    Returns:
        A Keras ``Model`` mapping ``(batch, maxlen, word_vector)`` inputs to
        outputs of the same shape.  The caller is responsible for calling
        ``compile``.
    """
    inputs = Input(shape=(maxlen , word_vector))

    # Encoder: compress the whole sequence into a single 128-d state.
    encoded = GRU(256, return_sequences=True)(inputs)
    encoded = GRU(128, return_sequences=False)(encoded)

    # Decoder: feed the same encoded state to every output timestep.
    decoded = RepeatVector(maxlen)(encoded)
    decoded = GRU(128, return_sequences=True)(decoded)
    decoded = GRU(word_vector, return_sequences=True)(decoded)
    output = TimeDistributed(Dense(word_vector))(decoded)

    # Only the autoencoder is returned; the separate encoder-only Model that
    # used to be constructed here was never used, so it is no longer built.
    return Model(inputs, output)

In [3]:
def data_load(BASEPATH, limit=20000):
    """Unpickle the files found directly inside ``BASEPATH``.

    Args:
        BASEPATH: Directory containing one pickle file per sample.
        limit: Maximum number of files to load (default 20000, the value
            that used to be hard-coded).  Pass ``None`` to load every file.

    Returns:
        A list with one unpickled object per loaded file, ordered by file
        name.
    """
    # os.listdir() returns entries in arbitrary order, so sort them before
    # truncating; otherwise the selected subset (and its order) can differ
    # between runs and machines.  Sub-directories are skipped because they
    # cannot be opened as pickle files.
    filenames = sorted(
        name for name in os.listdir(BASEPATH)
        if os.path.isfile(os.path.join(BASEPATH, name))
    )[:limit]

    data_files = []
    for filename in tqdm(filenames):
        with open(os.path.join(BASEPATH, filename), 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code embedded in
            # the file.  Only point this at files you produced yourself.
            data_files.append(pickle.load(f))
    return data_files
    
    

In [4]:
def pre_processing(data_files, word_vector, func_len , vector_size, min_len=30):
    """Turn nested mnemonic sequences into a fixed-length embedding array.

    ``data_files`` is iterated three levels deep: each file yields groups of
    blocks, each group is sliced to its first ``func_len`` blocks, and each
    block yields mnemonic tokens.  Every token is replaced by
    ``word_vector[token]``; tokens missing from ``word_vector`` fall back to
    ``word_vector['unknown']``.

    Groups that produce fewer than ``min_len`` vectors are dropped.  The
    remaining ones are zero-padded at the end, or truncated, to exactly
    ``func_len`` vectors.

    Args:
        data_files: Iterable of loaded files, structured as described above.
        word_vector: Mapping from token to embedding vector that raises
            ``KeyError`` for unknown tokens (a dict works).
        func_len: Number of timesteps in every output sequence.
        vector_size: Length of each embedding vector; used to build the
            zero-padding rows.
        min_len: Minimum number of vectors a group must contain to be kept
            (default 30, the value that used to be hard-coded).

    Returns:
        ``np.array`` built from the kept sequences.  With at least one kept
        sequence its shape is ``(n_kept, func_len, vector_size)``.

    Raises:
        KeyError: If a token is unknown and ``word_vector`` has no
            ``'unknown'`` entry either.
    """
    unknown = 'unknown'
    zero_padding = [0] * vector_size
    sequences = []
    for data_file in tqdm(data_files):
        for blocks in data_file:
            vec_block = []
            for block in blocks[:func_len]:
                for mnemonic in block:
                    try:
                        vec_block.append(word_vector[mnemonic])
                    except KeyError:
                        # Out-of-vocabulary token.  Only KeyError is caught so
                        # that a kernel interrupt or a genuine bug is not
                        # silently turned into an 'unknown' vector.
                        vec_block.append(word_vector[unknown])
            if len(vec_block) < min_len:
                continue
            # A non-positive repeat count yields an empty list, so sequences
            # that are already long enough are left untouched here.
            vec_block.extend([zero_padding] * (func_len - len(vec_block)))
            sequences.append(vec_block[:func_len])
    return np.array(sequences)

In [5]:
# Load the saved gensim Word2Vec model and keep a handle on its `.wv`
# attribute, which is what the preprocessing step indexes by token.
WORD2VEC_PATH = 'word2vec_0402_16_upgrade.wv'
word2vec = Word2Vec.load(WORD2VEC_PATH)
word_vector = word2vec.wv


In [6]:
# Machine-specific absolute path to the directory of pickled samples.
DATA_DIR = r'C:\capstone\modeling\3\data\ben'
data_files = data_load(DATA_DIR)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [03:37<00:00, 45.94it/s]


In [7]:
# 100 timesteps per sequence, 16-dimensional embedding per timestep.
pre_datas = pre_processing(
    data_files,
    word_vector,
    func_len=100,
    vector_size=16,
)

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [06:28<00:00, 25.76it/s]


In [8]:
# Hold out the trailing 10% of the samples for validation.  The split is
# positional: nothing is shuffled before slicing.
rate = 0.9
data_size = len(pre_datas)
split_at = int(data_size * rate)
X_train, X_test = pre_datas[:split_at], pre_datas[split_at:]
print('Train : ', len(X_train))
print('Test : ', len(X_test))


Train :  2071751
Test :  230195


In [9]:
# NOTE(review): this rebinds the name `double_model` from the builder function
# to the model instance it returns.  Re-running this cell without first
# re-running the cell that defines the function will therefore not rebuild the
# model.  Later cells rely on `double_model` being the model instance.
double_model = double_model(maxlen=100, word_vector=16)
double_model.compile(loss='mae', optimizer='adam')
double_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100, 16)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100, 256)          209664    
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               147840    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 100, 128)          0         
_________________________________________________________________
gru_3 (GRU)                  (None, 100, 128)          98688     
_________________________________________________________________
gru_4 (GRU)                  (None, 100, 16)           6960      
_________________________________________________________________
time_distributed_1 (TimeDist (None, 100, 16)           272 

In [10]:
# Sanity check: should be (n_samples, 100, 16) to match the model's input shape.
X_train.shape

(2071751, 100, 16)

In [14]:
# Autoencoder training: the input doubles as the reconstruction target.
# Samples are fed in their stored order (shuffle=False).
double_model.fit(
    X_train,
    X_train,
    validation_data=(X_test, X_test),
    epochs=40,
    batch_size=2048,
    shuffle=False,
    verbose=2,
)

Train on 2071751 samples, validate on 230195 samples
Epoch 1/40
 - 765s - loss: 0.8074 - val_loss: 0.8204
Epoch 2/40
 - 764s - loss: 0.8160 - val_loss: 0.8154
Epoch 3/40
 - 763s - loss: 0.8098 - val_loss: 0.7932
Epoch 4/40
 - 763s - loss: 0.8083 - val_loss: 0.8009
Epoch 5/40
 - 763s - loss: 0.8099 - val_loss: 0.7877
Epoch 6/40
 - 713s - loss: 0.8062 - val_loss: 0.8284
Epoch 7/40
 - 763s - loss: 0.8065 - val_loss: 0.8269
Epoch 8/40
 - 764s - loss: 0.8053 - val_loss: 0.7732
Epoch 9/40
 - 763s - loss: 0.7968 - val_loss: 0.8647
Epoch 10/40
 - 763s - loss: 0.7987 - val_loss: 0.7725
Epoch 11/40
 - 762s - loss: 0.7972 - val_loss: 0.7916
Epoch 12/40
 - 762s - loss: 0.8006 - val_loss: 0.7843
Epoch 13/40
 - 762s - loss: 0.7984 - val_loss: 0.7780
Epoch 14/40
 - 767s - loss: 0.7942 - val_loss: 0.7719
Epoch 15/40
 - 764s - loss: 0.7947 - val_loss: 0.8242
Epoch 16/40
 - 765s - loss: 0.7932 - val_loss: 0.7971
Epoch 17/40
 - 761s - loss: 0.8062 - val_loss: 0.7764
Epoch 18/40
 - 763s - loss: 0.7888 - v

<keras.callbacks.callbacks.History at 0x2334d0b6b08>

In [15]:
# Persist the model in two parts: the architecture as JSON and the trained
# weights as HDF5.
model_json = double_model.to_json()
with open("model_ida_gru_0406.json", "w") as fh:
    fh.write(model_json)

double_model.save_weights("model_ida_gru_0406.h5")
print("Saved model to disk0406")




Saved model to disk0406
