In [1]:
# 단어를 시퀀스로 처리, 시퀀스 모델링

In [2]:
# 자연어처리2에서 https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 경로에서 다운받은
# 파일을 사용

# 단어를 시퀀스로 처리

In [7]:
# Download the IMDB review archive (skipped when it is already on disk).
import os
import shutil
import urllib.request as req

url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'
if not os.path.exists(filename):
    # Write under a temporary name first so an interrupted download never
    # leaves a truncated archive behind under the final name.
    partial_name = filename + '.part'
    with req.urlopen(url) as f, open(partial_name, 'wb') as of:
        # Stream in chunks instead of buffering the whole archive in memory.
        shutil.copyfileobj(f, of)
    os.replace(partial_name, filename)

In [8]:
# Unpack the archive.
import os
import tarfile

# Skip when the dataset folder already exists: extracting again would put back
# the files that the split cell moves from train/ into val/, duplicating them.
if not os.path.isdir('aclImdb'):
    with tarfile.open(filename, 'r:gz') as tr:
        if hasattr(tarfile, 'data_filter'):
            # Refuse members that would be written outside the target directory.
            tr.extractall(filter='data')
        else:
            # Older Python without extraction filters: keep the original call.
            tr.extractall()

In [9]:
# Split the data into folders: move 20% of each class from train/ into val/.
import os, pathlib, shutil, random

base_dir = pathlib.Path('aclImdb')
val_dir = base_dir / 'val'
train_dir = base_dir / 'train'

# NOTE(review): the archive is expected to ship an extra train/unsup folder of
# unlabeled reviews. Left in place it would be picked up as a third class by
# text_dataset_from_directory, contradicting the "2 classes" this notebook
# reports, so it is removed here. No-op when the folder is absent.
shutil.rmtree(train_dir / 'unsup', ignore_errors=True)

for category in ('neg', 'pos'):
    category_val_dir = val_dir / category
    if category_val_dir.is_dir() and any(category_val_dir.iterdir()):
        # Already split on an earlier run; moving again would shrink train/ further.
        continue
    os.makedirs(category_val_dir, exist_ok=True)
    # os.listdir() returns names in arbitrary order; sort first so the seeded
    # shuffle selects the same validation files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Index from the front: files[-0:] would select the whole list when the
    # 20% share rounds down to zero.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(str(train_dir / category / fname), str(category_val_dir / fname))

In [12]:
# Build the batched text datasets for the train / validation / test splits.
import os, pathlib, shutil, random
from tensorflow import keras

batch_size = 32

# One loader call per split directory, evaluated in train -> val -> test order.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in ("aclImdb/train", "aclImdb/val", "aclImdb/test")
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [15]:
# Drop the labels and keep only the review text; this text-only dataset is
# what the vectorization layer is adapted on in a later cell.
text_only_trains_ds = train_ds.map(lambda text, label: text)

#### 정수 시퀀스 데이터셋 준비

In [16]:
from tensorflow.keras import layers

max_length = 600    # value passed as output_sequence_length
max_tokens = 20000  # value passed as max_tokens (vocabulary cap)

text_vectorization = layers.TextVectorization(
    output_mode='int',
    max_tokens=max_tokens,
    output_sequence_length=max_length,
)
# Fit the vocabulary on the training text only.
text_vectorization.adapt(text_only_trains_ds)


def _to_int_sequences(dataset):
    """Map (text, label) batches to (vectorized text, label) batches."""
    return dataset.map(
        lambda text, label: (text_vectorization(text), label),
        num_parallel_calls=4,
    )


int_train_ds = _to_int_sequences(train_ds)
int_val_ds = _to_int_sequences(val_ds)
int_test_ds = _to_int_sequences(test_ds)

### 원-핫 인코딩된 벡터 시퀀스를 사용한 시퀀스 모델

In [22]:
import tensorflow as tf

# Variable-length sequences of integer token ids.
inputs = keras.Input(shape=(None,), dtype='int64')
# Expand every token id into a one-hot vector of size max_tokens.
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(units=32))(embedded)
x = layers.Dropout(rate=0.5)(x)
# Single sigmoid unit for the binary label.
outputs = layers.Dense(units=1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot_2 (TFOpLambda)   (None, None, 20000)       0         
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               5128448   
 nal)                                                            
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5,128,513
Trainable params: 5,128,513
Non-trainable params: 0
_________________________________________________

#### 첫 번째 시퀀스 모델 훈련

In [25]:
callback = [
    # save_best_only=True keeps only the best-scoring model at this path.
    keras.callbacks.ModelCheckpoint('one_hot_bidir_lstm.keras', save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callback)

# Score the best checkpoint on the test set. Evaluating `model` directly would
# use the weights from the final epoch, which are not necessarily the best.
best_model = keras.models.load_model('one_hot_bidir_lstm.keras')
print(f"테스트 정확도 : {best_model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
 23/625 [>.............................] - ETA: 2:31:20 - loss: 0.6936 - accuracy: 0.4810


KeyboardInterrupt



#### 단어임베딩
#### 임베딩 층으로 단어 임베딩 학습하기