In [1]:
# 딥러닝을 이용한 자연어처리
# 1. 데이터 준비
# 2. 텍스트를 표준화
# 3. 텍스트 분할(토큰화)
# 4. 어휘 인덱싱

In [1]:
import string

In [2]:
test_sentence = "I write, rewrite, and still rewrite again!!"

In [3]:
text = test_sentence.lower()
text

'i write, rewrite, and still rewrite again!!'

In [4]:
text = "".join([char for char in text if char not in string.punctuation])
text


'i write rewrite and still rewrite again'

In [5]:
# 데이터 표준화 함수
def standardize(text):
    text = text.lower()
    return "".join([char for char in text if char not in string.punctuation])

In [6]:
# 토큰화
def tokenize(text):
    return text.split()

In [7]:
# vocabulary 화
vocabulary = {"":0, "[UNK]":1}


In [8]:
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

In [9]:
for text in dataset:
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        if token not in vocabulary:
            vocabulary[token] = len(vocabulary)

In [10]:
dict((k,v) for k, v in vocabulary.items())

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [11]:
vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [19]:
class Vectorizer:
    def standardize(self, text):
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)
    def tokenize(self, text):
        return text.split()
    def make_vocabulary(self, dataset):
        self.vocabulary = {"":0, '[UNK]' : 1}
        for text in dataset:
            text = self.standardize(text)
            tokens = self.tokenize(text)
            for token in tokens:
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = dict(
            (v,k) for k, v in self.vocabulary.items()
        )
    def encode(self,text):
        text = self.standardize(text)
        tokens = self.tokenize(text)
        return [self.vocabulary.get(token,1) for token in tokens]
    def decode(self, int_sequence):
        return " ".join(
            self.inverse_vocabulary.get(i,'[UNK]') for i in int_sequence
        )

In [20]:
vectorizer = Vectorizer()
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms'
]
vectorizer.make_vocabulary(dataset)

In [22]:
# 단어 집합에 없는 단어일 경우 UNK로 대체
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [24]:
test_sentence = "I write, erase, rewrite and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
encoded_sentence

[2, 3, 4, 5, 7, 1, 5, 6]

In [25]:
vectorizer.inverse_vocabulary

{0: '',
 1: '[UNK]',
 2: 'i',
 3: 'write',
 4: 'erase',
 5: 'rewrite',
 6: 'again',
 7: 'and',
 8: 'then',
 9: 'a',
 10: 'poppy',
 11: 'blooms'}

In [26]:
decode_sentence = vectorizer.decode(encoded_sentence)
decode_sentence

'i write erase rewrite and [UNK] rewrite again'

In [31]:
from tensorflow.keras.layers import TextVectorization
text_vectorization = TextVectorization(output_mode='int')

In [33]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    lowercase_string =  tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(
        lowercase_string, f"[{re.escape(string.punctuation)}]",""
    )
def custom_split_fn(string_tensor):
    return tf.strings.split(string_tensor)

text_vectorization = TextVectorization(
    output_mode='int',
    standardize=custom_standardization_fn,
    split=custom_split_fn
)

In [35]:
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms'
]
text_vectorization.adapt(dataset)

In [37]:
# 어휘 사전 출력
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [38]:
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encode_sentence = text_vectorization(test_sentence)
encode_sentence

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 7,  3,  5,  9,  1,  5, 10], dtype=int64)>

In [42]:
inverse_vocab = dict(enumerate(vocabulary))
decode_sentence = " ".join(inverse_vocab[int(i)] for i in encode_sentence)
decode_sentence

'i write rewrite and [UNK] rewrite again'

In [44]:
# 단어 그룹을 표현하는 두 가지 방법: 집합과 시퀀스
# IMDB 영화 리뷰 데이터 준비하기

In [48]:
# 리눅스나 코렙에서 사용가능
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz
# !rm -r aclImdb/train/unsup
# !cat aclImdb/train/pos/4077_10.txt

In [49]:
import urllib.request as req    
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'
with req.urlopen(url) as f:
    with open(filename,'wb') as of:
        of.write(f.read())

In [51]:
import tarfile
with tarfile.open(filename,'r:gz') as tr:
    tr.extractall()

In [52]:
import os, pathlib, shutil, random

In [55]:
base_dir = pathlib.Path('aclImdb')
val_dir = base_dir / 'val'
train_dir = base_dir / 'train'
for category in ('neg', 'pos'):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2*len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)

In [56]:
from tensorflow import keras
batch_size = 32

In [57]:
train_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/train', batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/val', batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    'aclImdb/test', batch_size=batch_size
)

Found 70000 files belonging to 3 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [59]:
for inputs,targets in train_ds:
    print(f"inputs.shape : {inputs.shape}")
    print(f"inputs.dtype : {inputs.dtype}")
    print(f"targets.shape : {targets.shape}")
    print(f"targets.dtype : {targets.dtype}")
    print(f"inputs[0] : {inputs[0]}")
    print(f"targets[0] : {targets[0]}")
    break;

inputs.shape : (32,)
inputs.dtype : <dtype: 'string'>
targets.shape : (32,)
targets.dtype : <dtype: 'int32'>
inputs[0] : b"This David Hamilton movie is something of a disaster. It has a half-decent story (coming-of-age/love triangle) and a mediocre script. All of Hamilton's usual trademarks are here - stunningly beautiful & innocent young girls, soft-focus photography, lots of nudity, gentle lesbianism, and a romantic soundtrack. There is/was a good movie to be made with the given material & cast, but this is not it. The editing is appalling, and at times the scene shifts seem somewhat disjointed.<br /><br />It's worth remembering that Hamilton was an iconic photographer of his time and experimented with taking his art into the movie field, in many ways pioneering the penchant for photographers in the adult industry to also dabble in video work.<br /><br />However, on the plus side, there are very good performances from Dawn Dunlap (as Laura) and James Mitchell (as Paul), and the sound

In [60]:
# 단어를 집합으로 처리  : BoW 방식
# TextVectorization층으로 데이터 전처리

In [63]:
text_vectmaprization =  TextVectorization(
    max_tokens=70000 ,
    output_mode='multi_hot'
)
text_only_train_ds =  train_ds.map(lambda x, y : x)
text_vectorization.adapt(text_only_train_ds)

In [65]:
binary_1gram_train_ds =  train_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)
binary_1gram_val_ds =  val_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)
binary_1gram_test_ds =  test_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)

In [67]:
for inputs,targets in binary_1gram_test_ds:
    print(f"inputs.shape : {inputs.shape}")
    print(f"inputs.dtype : {inputs.dtype}")
    print(f"targets.shape : {targets.shape}")
    print(f"targets.dtype : {targets.dtype}")
    print(f"inputs[0] : {inputs[0]}")
    print(f"targets[0] : {targets[0]}")
    break;

inputs.shape : (32, 70000)
inputs.dtype : <dtype: 'float32'>
targets.shape : (32,)
targets.dtype : <dtype: 'int32'>
inputs[0] : [0. 1. 1. ... 0. 0. 0.]
targets[0] : 0


In [68]:
# 모델생성

In [73]:
from tensorflow import keras
from tensorflow.keras import layers
def get_model(max_tokens = 70000, hidden_dim = 16):
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation='sigmoid')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1,activation = 'sigmoid')(x)
    model = keras.Model(inputs,outputs)
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy',metrics=['accuracy'])
    return model

In [75]:
# 이진 유니그램 모델 훈련하고 테스트
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 70000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                1120016   
                                                                 
 dropout_3 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,120,033
Trainable params: 1,120,033
Non-trainable params: 0
_________________________________________________________________


In [None]:
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_1gram.keras', save_best_only=True)
]
model.fit(binary_1gram_train_ds.cache(),
          validation_data=binary_1gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks
         )

Epoch 1/10