In [1]:
# 딥러닝을 이용한 자연어처리
# 1. 데이터 준비
# 2. 텍스트를 표준화
# 3. 텍스트 분할(토큰화)
# 4. 어휘 인덱싱

In [1]:
import string

In [2]:
# Sample sentence used to walk through the manual preprocessing steps.
test_sentence = (
    "I write, rewrite, and still rewrite again!!"
)

In [3]:
# Standardization step 1: fold the sample sentence to lowercase and display it.
text = test_sentence.lower()
text

'i write, rewrite, and still rewrite again!!'

In [4]:
# Standardization step 2: delete every character listed in string.punctuation.
text = text.translate(str.maketrans("", "", string.punctuation))
text


'i write rewrite and still rewrite again'

In [5]:
# 데이터 표준화 함수
def standardize(text):
    """Return ``text`` lowercased with all ``string.punctuation`` characters removed."""
    lowered = text.lower()
    # A translation table with only a "delete" set drops the punctuation characters.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    return lowered.translate(strip_punctuation)

In [6]:
# 토큰화
def tokenize(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

In [7]:
# vocabulary 화
# Seed the vocabulary with the two reserved entries before any real token is added.
vocabulary = {
    "": 0,
    "[UNK]": 1,
}


In [8]:
# Toy corpus: three short sentences used to build the vocabulary by hand.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

In [9]:
# Walk the corpus and give every previously unseen token the next free index.
for text in dataset:
    text = standardize(text)
    tokens = tokenize(text)
    for token in tokens:
        # setdefault only inserts when the token is missing, so existing
        # indices are never overwritten.
        vocabulary.setdefault(token, len(vocabulary))

In [10]:
# Display a shallow copy of the token -> index mapping.
{token: index for token, index in vocabulary.items()}

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [11]:
# Display the vocabulary dictionary built by the loop above.
vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [19]:
class Vectorizer:
    """Minimal pure-Python text vectorizer: standardize, tokenize, index.

    Index 0 is reserved for the empty token ("") and index 1 for tokens that
    are not in the vocabulary ("[UNK]"). A fresh instance already contains
    these two entries, so ``encode``/``decode`` are usable before
    ``make_vocabulary`` is called (every word then maps to "[UNK]").
    """

    # Reserved vocabulary entries shared by all instances.
    PAD_TOKEN = ""
    UNK_TOKEN = '[UNK]'
    PAD_INDEX = 0
    UNK_INDEX = 1

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

    def __init__(self):
        # Initialise both mappings so encode()/decode() never hit a missing attribute.
        self.vocabulary = {
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()
        }

    def standardize(self, text):
        """Return ``text`` lowercased with ASCII punctuation removed."""
        return text.lower().translate(self._PUNCTUATION_TABLE)

    def tokenize(self, text):
        """Split ``text`` into tokens on runs of whitespace."""
        return text.split()

    def make_vocabulary(self, dataset):
        """Rebuild the vocabulary from an iterable of raw text strings.

        Tokens receive consecutive indices in order of first appearance,
        starting after the two reserved entries. Any previous vocabulary is
        discarded.
        """
        self.vocabulary = {
            self.PAD_TOKEN: self.PAD_INDEX,
            self.UNK_TOKEN: self.UNK_INDEX,
        }
        for text in dataset:
            for token in self.tokenize(self.standardize(text)):
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        # Keep the reverse mapping in sync for decode().
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()
        }

    def encode(self, text):
        """Convert raw ``text`` to a list of integer indices.

        Tokens missing from the vocabulary are mapped to ``UNK_INDEX``.
        """
        tokens = self.tokenize(self.standardize(text))
        return [self.vocabulary.get(token, self.UNK_INDEX) for token in tokens]

    def decode(self, int_sequence):
        """Convert a sequence of indices back to a space-joined string.

        Indices missing from the inverse vocabulary are rendered as "[UNK]".
        """
        return " ".join(
            self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence
        )

In [20]:
# Toy corpus used to fit the hand-written vectorizer.
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms',
]
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [22]:
# Inspect the learned vocabulary; words that are not in it are encoded as [UNK] (index 1).
vectorizer.vocabulary

{'': 0,
 '[UNK]': 1,
 'i': 2,
 'write': 3,
 'erase': 4,
 'rewrite': 5,
 'again': 6,
 'and': 7,
 'then': 8,
 'a': 9,
 'poppy': 10,
 'blooms': 11}

In [24]:
# Encode a sentence containing one word ("still") that is not in the corpus above.
test_sentence = "I write, erase, rewrite and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
encoded_sentence

[2, 3, 4, 5, 7, 1, 5, 6]

In [25]:
# Display the index -> token mapping that decode() uses.
vectorizer.inverse_vocabulary

{0: '',
 1: '[UNK]',
 2: 'i',
 3: 'write',
 4: 'erase',
 5: 'rewrite',
 6: 'again',
 7: 'and',
 8: 'then',
 9: 'a',
 10: 'poppy',
 11: 'blooms'}

In [26]:
# Map the integer sequence back to text.
decode_sentence = vectorizer.decode(encoded_sentence)
decode_sentence

'i write erase rewrite and [UNK] rewrite again'

In [31]:
from tensorflow.keras.layers import TextVectorization
# Keras preprocessing layer configured to emit sequences of integer token indices.
text_vectorization = TextVectorization(output_mode='int')

In [33]:
import re
import string
import tensorflow as tf

def custom_standardization_fn(string_tensor):
    """Lowercase a string tensor and delete every ``string.punctuation`` character."""
    # Character class matching any single punctuation character.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")
def custom_split_fn(string_tensor):
    """Split a string tensor into tokens using ``tf.strings.split`` defaults."""
    tokens = tf.strings.split(string_tensor)
    return tokens

# Integer-sequence vectorizer that uses the custom standardize/split callables.
text_vectorization = TextVectorization(
    standardize=custom_standardization_fn,
    split=custom_split_fn,
    output_mode='int',
)

In [35]:
# Fit the layer's vocabulary on the toy corpus.
dataset = [
    'I write, erase, rewrite',
    'Erase again, and then',
    'A poppy blooms'
]
text_vectorization.adapt(dataset)

In [37]:
# Print the vocabulary learned by the layer.
text_vectorization.get_vocabulary()

['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [38]:
# NOTE: `vocabulary` is rebound here from the earlier dict to the list returned by the layer.
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
# Calling the layer on raw text yields the integer-encoded tensor shown below.
encode_sentence = text_vectorization(test_sentence)
encode_sentence

<tf.Tensor: shape=(7,), dtype=int64, numpy=array([ 7,  3,  5,  9,  1,  5, 10], dtype=int64)>

In [42]:
# Position in the vocabulary list is the token index, so enumerate gives index -> token.
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}
decode_sentence = " ".join(
    [inverse_vocab[int(token_id)] for token_id in encode_sentence]
)
decode_sentence

'i write rewrite and [UNK] rewrite again'

In [44]:
# 단어 그룹을 표현하는 두 가지 방법: 집합과 시퀀스
# IMDB 영화 리뷰 데이터 준비하기

In [48]:
# 리눅스나 코랩에서 사용 가능
# !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# !tar -xf aclImdb_v1.tar.gz
# !rm -r aclImdb/train/unsup
# !cat aclImdb/train/pos/4077_10.txt

In [49]:
import os
import shutil
import urllib.request as req

# Download the IMDB review archive once; later runs reuse the local copy.
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'
if not os.path.exists(filename):
    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete archive on the next run.
    partial_filename = filename + '.part'
    with req.urlopen(url) as response, open(partial_filename, 'wb') as archive_file:
        # Stream in chunks instead of loading the whole archive into memory.
        shutil.copyfileobj(response, archive_file)
    os.replace(partial_filename, filename)

In [51]:
import tarfile
# Unpack the gzip-compressed archive into the current working directory.
with tarfile.open(filename, mode='r:gz') as archive:
    archive.extractall(path='.')

In [52]:
import os, pathlib, shutil, random

In [55]:
# Move 20% of the training reviews of each class into a validation directory.
# The cell is safe to re-run: it always derives the same validation set and only
# moves files that are still sitting in the training directory.
base_dir = pathlib.Path('aclImdb')
val_dir = base_dir / 'val'
train_dir = base_dir / 'train'
for category in ('neg', 'pos'):
    category_train_dir = train_dir / category
    category_val_dir = val_dir / category
    # exist_ok avoids a FileExistsError when the split was already created.
    os.makedirs(category_val_dir, exist_ok=True)
    # Consider files from both locations so a re-run sees the same population,
    # and sort them because os.listdir order is arbitrary; without sorting the
    # fixed seed would not give a reproducible split.
    files = sorted(
        set(os.listdir(category_train_dir)) | set(os.listdir(category_val_dir))
    )
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2*len(files))
    # Slice from an explicit start index: files[-0:] would select every file.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        source_path = category_train_dir / fname
        # Already-moved files are skipped; files restored by a re-extraction
        # are moved again so train and validation never overlap.
        if source_path.exists():
            shutil.move(source_path, category_val_dir / fname)

In [56]:
from tensorflow import keras
# Number of reviews per batch for all datasets loaded below.
batch_size = 32

In [77]:
# Load the three splits from their directories, in train / val / test order.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(directory, batch_size=batch_size)
    for directory in ('aclImdb/train', 'aclImdb/val', 'aclImdb/test')
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [78]:
# Peek at a single batch of the raw text dataset.
for inputs, targets in train_ds.take(1):
    print(f"inputs.shape : {inputs.shape}")
    print(f"inputs.dtype : {inputs.dtype}")
    print(f"targets.shape : {targets.shape}")
    print(f"targets.dtype : {targets.dtype}")
    print(f"inputs[0] : {inputs[0]}")
    print(f"targets[0] : {targets[0]}")

inputs.shape : (32,)
inputs.dtype : <dtype: 'string'>
targets.shape : (32,)
targets.dtype : <dtype: 'int32'>
inputs[0] : b'Despite being released on DVD by Blue Underground some five years ago, I have never come across this Italian "sword and sorcery" item on late-night Italian TV and, now that I have seen it for myself, I know exactly why. Not because of its director\'s typical predilection for extreme gore (of which there is some examples to be sure) or the fact that the handful of women in it parade topless all the time (it is set in the Dark Ages after all)\xc2\x85it is, quite simply, very poor stuff indeed. In fact, I would go so far as to say that it may very well be the worst of its kind that I have yet seen and, believe me, I have seen plenty (especially in the last few years i.e. following my excursion to the 2004 Venice Film Festival)! Reading about how the film\'s failure at the time of initial release is believed to have led to its director\'s subsequent (and regrettable) c

In [60]:
# 단어를 집합으로 처리  : BoW 방식
# TextVectorization층으로 데이터 전처리

In [87]:
# Unigram bag-of-words: multi-hot vectors over the 20000 most frequent tokens.
text_vectorization = TextVectorization(
    output_mode='multi_hot',
    max_tokens=20000,
)
# Drop the labels so the layer is adapted on the review text only.
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)

In [88]:
# Vectorize the text of every split while passing the labels through unchanged.
binary_1gram_train_ds, binary_1gram_val_ds, binary_1gram_test_ds = (
    split_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [89]:
# Peek at a single batch of the vectorized unigram dataset.
for inputs, targets in binary_1gram_train_ds.take(1):
    print(f"inputs.shape : {inputs.shape}")
    print(f"inputs.dtype : {inputs.dtype}")
    print(f"targets.shape : {targets.shape}")
    print(f"targets.dtype : {targets.dtype}")
    print(f"inputs[0] : {inputs[0]}")
    print(f"targets[0] : {targets[0]}")

inputs.shape : (32, 20000)
inputs.dtype : <dtype: 'float32'>
targets.shape : (32,)
targets.dtype : <dtype: 'int32'>
inputs[0] : [1. 1. 1. ... 0. 0. 0.]
targets[0] : 1


In [90]:
# 모델생성

In [91]:
from tensorflow import keras
from tensorflow.keras import layers
def get_model(max_tokens = 20000, hidden_dim = 16):
    """Build and compile a small dense binary classifier over bag-of-words vectors.

    Args:
        max_tokens: Length of each input vector.
        hidden_dim: Number of units in the single hidden Dense layer.

    Returns:
        A compiled ``keras.Model`` with one sigmoid output unit.
    """
    bow_input = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation='sigmoid')(bow_input)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation='sigmoid')(hidden)
    classifier = keras.Model(bow_input, probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy'],
    )
    return classifier

In [92]:
# Train and test the binary unigram model: build it and print its layer summary.
model = get_model()
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_8 (Dense)             (None, 16)                320016    
                                                                 
 dropout_5 (Dropout)         (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________


In [93]:
# Keep only the best checkpoint (save_best_only=True) while training the unigram model.
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_1gram.keras', save_best_only=True),
]
model.fit(
    binary_1gram_train_ds.cache(),
    epochs=10,
    callbacks=callbacks,
    validation_data=binary_1gram_val_ds.cache(),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x262707b43d0>

In [94]:
# 이진 인코딩을 사용한 바이그램
#바이그램을 반환하는 TextVectorization 층 만들기
# 바이그램 : 문자나 음절 또는 단어로 이루어진 모든 문자열에서 인접한 두 요소의 시퀀스

In [95]:
# Same multi-hot encoding as before, but over unigrams and bigrams (ngrams=2).
text_vectorization = TextVectorization(
    output_mode='multi_hot',
    max_tokens=20000,
    ngrams=2,
)

In [97]:
# Adapt the bigram vectorizer on the training text, then vectorize every split.
text_only_train_ds =  train_ds.map(lambda x, y : x)
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds =  train_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)
binary_2gram_val_ds =  val_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)
binary_2gram_test_ds =  test_ds.map(lambda x, y : (text_vectorization(x), y),
             num_parallel_calls=4)
model = get_model()
callbacks = [
    keras.callbacks.ModelCheckpoint('binary_2gram.keras', save_best_only=True)
]
# Fit on the bigram datasets built above. The previous version passed the
# binary_1gram_* datasets here, so the checkpoint named binary_2gram.keras
# actually contained a unigram model.
model.fit(binary_2gram_train_ds.cache(),
          validation_data=binary_2gram_val_ds.cache(),
          epochs=10,
          callbacks=callbacks
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x26200245550>

In [98]:
# TF-IDF 인코딩을 사용한 바이그램
# 토큰 카운트를 반환하는 TextVectorization 층

In [99]:
# Bigram vectorizer that returns per-token counts instead of 0/1 flags.
text_vectorization = TextVectorization(
    output_mode='count',
    max_tokens=20000,
    ngrams=2,
)

In [100]:
# TF-IDF 가중치가 적용된 출력을 반환하는 TextVectorization 층

In [101]:
# Bigram vectorizer whose outputs are TF-IDF weighted.
text_vectorization = TextVectorization(
    output_mode='tf_idf',
    max_tokens=20000,
    ngrams=2,
)

In [102]:
# Show the installed TensorFlow version.
tf.__version__

'2.10.0'

In [103]:
# 텐서플로 2.8.xx 이상에서는 GPU에서 오류 발생 - 2.9에서는 해결
# with tf.device('cpu'):
#     text_vectorization.adapt(text_only_train_ds)

In [115]:
# Adapt the TF-IDF vectorizer on the training text only.
text_only_train_ds = train_ds.map(lambda text, label: text)
text_vectorization.adapt(text_only_train_ds)
# Vectorize every split while passing the labels through unchanged.
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_test_ds = (
    split_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)
model = get_model()
callbacks = [
    keras.callbacks.ModelCheckpoint('tfidf_2gram.keras', save_best_only=True),
]
model.fit(
    tfidf_2gram_train_ds.cache(),
    epochs=10,
    callbacks=callbacks,
    validation_data=tfidf_2gram_val_ds.cache(),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x262007f5460>

In [116]:
# Wrap the vectorizer and the trained classifier into one model that accepts raw strings.
inputs = keras.Input(dtype='string', shape=(1,))
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs=inputs, outputs=outputs)

In [126]:
# Two hand-written reviews, each wrapped as a (1, 1) string tensor for the inference model.
raw_text_data = tf.convert_to_tensor(
    [['That was an excellent movie, i hate it.']]
)
raw_text_data2 = tf.convert_to_tensor(
    [['The movie was boring in the first half, but it got more and more interesting. Disadvantages of having a lot of CG']]
)

In [127]:
# Score each sample review and print the model output as a percentage.
for review_tensor in (raw_text_data, raw_text_data2):
    predictions = inference_model(review_tensor)
    print(f"긍정적인 리뷰일  확율 : {float(predictions[0]*100)}")

긍정적인 리뷰일  확율 : 98.97091674804688
긍정적인 리뷰일  확율 : 13.248270988464355
